## Setting Up Functions

Here we define the functions used to parse the input files.

In [None]:
import json
import os

import pandas as pd

# Function for writing output
def write_file(df, fileout):
    """Write the gilded rows of ``df`` to the CSV file ``fileout``.

    A row is kept when its ``gilded`` value is greater than zero. If
    ``fileout`` already exists the rows are appended without a header line;
    otherwise the file is created and the header line is written first.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk of parsed records; must contain a ``gilded`` column.
    fileout : str
        Path of the CSV file to create or append to.
    """
    gilded = df[df['gilded'] > 0]
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('Writing %s entries to %s' % (gilded.shape[0], fileout))
    # Only a newly created file gets the header; later chunks are appended
    # below it so the CSV keeps exactly one header line.
    append = os.path.isfile(fileout)
    gilded.to_csv(fileout, mode='a' if append else 'w', header=not append, encoding='utf-8', index=False)

# Function for reading file with given parameters
def parse_file(filein, amount, fileout):
    """Read a file of one-JSON-object-per-line records and save the gilded ones.

    The input is consumed in chunks of ``amount`` lines so the whole file is
    never held in memory. Each chunk is turned into a DataFrame and handed to
    ``write_file``, which filters it and appends the result to ``fileout``.

    Parameters
    ----------
    filein : str
        Path of the input file; every non-blank line must be a JSON document.
    amount : int
        Number of records to accumulate before writing a chunk.
    fileout : str
        Path of the CSV file passed on to ``write_file``.
    """
    print('Opening %s' % filein)
    with open(filein, 'r') as f:
        lines = []
        for line in f:
            # Strip the line terminator instead of slicing off the last
            # character: a final line without a trailing newline would
            # otherwise lose its closing brace and fail to parse.
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. at end of file)
            lines.append(json.loads(line))
            if len(lines) >= amount:  # Chunk is full: write to csv and clear memory
                write_file(pd.DataFrame(lines), fileout)
                lines = []
        if lines:  # Write final partial chunk (if any)
            write_file(pd.DataFrame(lines), fileout)
    print('Complete! All gilded records from %s written to %s' % (filein, fileout))

## Parsing Data

### Inputs:

**filepath_in**<br>
*Directory to read the input file from*

**input_filename**<br> 
*Name of the input file*

**filepath_out**<br>
*Directory to write the output file*

**output_filename**<br> 
*Name of the output file*

**chunk_size**<br>
*How many lines to read before processing and writing*
* *too few and you'll have too many write operations*
* *too many and you'll use too much memory*

In [None]:
# Run configuration for the gilded-record extraction.
# The directory and file name strings are joined with plain `+` below, so a
# non-empty directory must end with a path separator.
filepath_in = ''
input_filename = ''
filepath_out = ''
output_filename = ''
# Number of input lines parsed per chunk before a write is triggered.
chunk_size = 100000

# For single file use
# NOTE(review): all four path parts are empty placeholders here; fill them in
# before running, otherwise parse_file is asked to open the path ''.
parse_file(filepath_in + input_filename, chunk_size, filepath_out + output_filename)

## Setting Up Revised Functions

In [9]:
import os
import pandas as pd
import json
import csv 

# Function for writing output
def write_top10_file(df, fileout, subreddits=None):
    """Split the selected rows of ``df`` into one CSV file per subreddit.

    A row is selected when its ``subreddit`` is one of ``subreddits`` and its
    ``parent_id`` starts with ``'t3_'`` (missing ``parent_id`` values never
    match). For every listed subreddit the matching rows are written to
    ``fileout + '_' + subreddit + '.csv'``: appended without a header if that
    file already exists, otherwise created with a header line. A subreddit
    with no matching rows still gets a header-only file on first write.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk of parsed records; must contain ``subreddit`` and ``parent_id``.
    fileout : str
        Path prefix of the output files (directory plus base name).
    subreddits : list of str, optional
        Subreddits to extract. Defaults to the ten subreddits that were
        previously hard-coded in this function.
    """
    if subreddits is None:
        subreddits = ['AskReddit', 'pics', 'funny', 'videos', 'todayilearned', 'AdviceAnimals', 'news', 'WTF', 'worldnews', 'nfl']
    has_t3_parent = df['parent_id'].str.startswith('t3_', na=False)
    t = df[df['subreddit'].isin(subreddits) & has_t3_parent]
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('Writing %s entries.' % t.shape[0])
    for subreddit in subreddits:
        path = fileout + '_' + subreddit + '.csv'
        # Only a newly created file gets the header; later chunks are appended.
        append = os.path.isfile(path)
        t[t['subreddit'] == subreddit].to_csv(path, mode='a' if append else 'w', header=not append, encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)

# Function for reading file with given parameters
def parse_file_for_top10(filein, amount, fileout):
    """Read a file of one-JSON-object-per-line records and split it by subreddit.

    The input is consumed in chunks of ``amount`` lines so the whole file is
    never held in memory. Each chunk becomes a DataFrame with a fixed column
    set and order and is handed to ``write_top10_file``, which filters it and
    appends to the per-subreddit CSV files derived from ``fileout``.

    Parameters
    ----------
    filein : str
        Path of the input file; every non-blank line must be a JSON document.
    amount : int
        Number of records to accumulate before writing a chunk.
    fileout : str
        Output path prefix passed on to ``write_top10_file``.
    """
    # Fixing the columns keeps every chunk's CSV layout identical, which
    # matters because later chunks are appended without a header.
    cols = ['archived','author','author_flair_css_class','author_flair_text','body','controversiality','created_utc','distinguished','downs','edited','gilded','id','link_id','name','parent_id','removal_reason','retrieved_on','score','score_hidden','subreddit','subreddit_id','ups']
    print('Opening %s' % filein)
    with open(filein, 'r') as f:
        lines = []
        for line in f:
            # Strip the line terminator instead of slicing off the last
            # character: a final line without a trailing newline would
            # otherwise lose its closing brace and fail to parse.
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. at end of file)
            lines.append(json.loads(line))
            if len(lines) >= amount:  # Chunk is full: write to csv and clear memory
                write_top10_file(pd.DataFrame(lines, columns=cols), fileout)
                lines = []
        if lines:  # Write final partial chunk (if any)
            write_top10_file(pd.DataFrame(lines, columns=cols), fileout)
    print('Complete! All top10 subreddit records from %s written to %s' % (filein, fileout + '_[subreddit].csv'))


In [10]:
# Run configuration for the per-subreddit extraction.
# Directory and file name are joined with plain `+` below, so each directory
# string must keep its trailing '/'.
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
filepath_in = 'C:/Users/jsmoo/Desktop/Reddit Data/2015_reddit_comments_corpus/reddit_data/2015/'
input_filename = 'RC_2015-01'
filepath_out = 'C:/Users/jsmoo/Desktop/01/'
# Prefix only: write_top10_file appends '_<subreddit>.csv' to it.
output_filename = 'RC_2015-01_toplevel'
# Number of input lines parsed per chunk before a write is triggered.
chunk_size = 400000

# For single file use
# NOTE(review): the saved output under this cell reports a different input
# path (.../2014/RC_2014-12), so it appears to come from an earlier run with
# other settings - confirm before relying on it.
parse_file_for_top10(filepath_in + input_filename, chunk_size, filepath_out + output_filename)

Opening C:/Users/jsmoo/Desktop/Reddit Data/2015_reddit_comments_corpus/reddit_data/2014/RC_2014-12


IOError: [Errno 2] No such file or directory: 'C:/Users/jsmoo/Desktop/Reddit Data/2015_reddit_comments_corpus/reddit_data/2014/RC_2014-12'

| subreddit     | gilded | ungilded |  total  |
|---------------|:------:|:--------:|:-------:|
| AdviceAnimals |   552  |  807547  |  808099 |
| AskReddit     |  6194  |  9017540 | 9023734 |
| funny         |   910  |  1483540 | 1484450 |
| news          |   524  |  501371  |  501895 |
| nfl           |   454  |  877144  |  877598 |
| pics          |   955  |  1264050 | 1265005 |
| todayilearned |   475  |  637739  |  638214 |
| videos        |   695  |  876532  |  877227 |
| worldnews     |   443  |  540900  |  541343 |
| WTF           |   490  |  725334  |  725824 |

In [27]:
import pandas as pd

# Columns to load from each monthly CSV. The variable name is misspelled
# ("relavant") but is kept as-is in case other cells refer to it.
relavant = ['author', 'author_flair_css_class', 'author_flair_text', 'body', 'created_utc', 'distinguished', 'edited', 'gilded', 'score', 'subreddit']

subreddit = 'AskReddit'

# Read the monthly files newest-first (the same row order the chained
# appends produced) and concatenate once. A single pd.concat avoids
# re-copying the growing frame on every step and does not depend on
# DataFrame.append, which was removed in pandas 2.0.
months = ['05', '04', '03', '02', '01']
monthly_frames = [
    pd.read_csv('C:/Users/jsmoo/Desktop/2015/' + month + '/RC_2015-' + month + '_toplevel_' + subreddit + '.csv', usecols=relavant)
    for month in months
]
data = pd.concat(monthly_frames, ignore_index=True)

# Gilded count, ungilded count, total - printed on one line, space-separated.
# A single pre-formatted argument prints the same text on Python 2 and 3.
print('%s %s %s' % (data[data['gilded'] > 0].shape[0], data[data['gilded'] == 0].shape[0], data.shape[0]))

6194 9017540 9023734


In [28]:
from sklearn.utils import shuffle
import csv

# Fixed seed so the sampled rows and the shuffled order are the same on
# every run; without it the train/test files change each time the cell runs.
seed = 42

# Sample 400 of each type (raises if a class has fewer than 400 rows)
gilded = data[data['gilded'] > 0].sample(400, random_state=seed)
ungilded = data[data['gilded'] == 0].sample(400, random_state=seed)

# First 200 of each sample go to train, the remaining 200 to test, so the
# two sets share no rows and are balanced between gilded and ungilded.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat([gilded[:200], ungilded[:200]], ignore_index=True)
test = pd.concat([gilded[200:], ungilded[200:]], ignore_index=True)

# Shuffle so the gilded and ungilded rows are interleaved in the output files.
shuffle(train, random_state=seed).to_csv('W:/CSCE489/Start Data/Train/' + subreddit + '_train.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)
shuffle(test, random_state=seed).to_csv('W:/CSCE489/Start Data/Test/' + subreddit + '_test.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)

In [30]:
# Drop this cell's reference to the combined frame; once nothing else refers
# to it, its memory can be reclaimed by the kernel.
data = None