## Setting Up Functions

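This section defines the functions used to parse the input files: they read line-delimited JSON records in chunks and append the gilded records (`gilded > 0`) to a CSV file.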

In [None]:
import json
import os

import pandas as pd

# Function for writing output
def write_file(df, fileout):
    """Append the gilded rows of ``df`` to the CSV file ``fileout``.

    Only rows whose ``gilded`` value is greater than zero are written. The
    header row is emitted only when ``fileout`` does not exist yet, so
    repeated calls build up a single CSV with one header.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to filter; must contain a ``gilded`` column.
    fileout : str
        Path of the CSV file to create or append to.
    """
    gilded = df[df['gilded'] > 0]
    # A single formatted argument prints the same text on Python 2 and 3.
    print('Writing %s entries to %s' % (gilded.shape[0], fileout))
    # Append mode creates the file when it is missing, so the only thing that
    # depends on earlier chunks is whether the header has been written.
    gilded.to_csv(fileout, mode='a', header=not os.path.isfile(fileout),
                  encoding='utf-8', index=False)

# Function for reading file with given parameters
def parse_file(filein, amount, fileout):
    """Stream a line-delimited JSON file and write its gilded records to CSV.

    The file is read one line at a time. Every ``amount`` parsed records are
    turned into a DataFrame and handed to ``write_file``, which keeps memory
    use bounded by the chunk size. Any records left over at the end of the
    file are written as a final, smaller chunk.

    Parameters
    ----------
    filein : str
        Path of the input file holding one JSON object per line.
    amount : int
        Number of records to accumulate before each write.
    fileout : str
        Path of the CSV file passed on to ``write_file``.
    """
    print('Opening %s' % (filein,))
    with open(filein, 'r') as f:
        lines = []
        for line in f:
            # Strip instead of slicing off the last character: a final line
            # without a trailing newline would otherwise lose its closing
            # brace, and blank lines would crash json.loads.
            line = line.strip()
            if not line:
                continue
            lines.append(json.loads(line))
            if len(lines) >= amount:  # Chunk is full: write it and free memory
                write_file(pd.DataFrame(lines), fileout)
                lines = []
        if lines:  # Write final lines of file (if any)
            write_file(pd.DataFrame(lines), fileout)
    print('Complete! All gilded records from %s written to %s' % (filein, fileout))

## Parsing Data

### Inputs:

**filepath_in**<br>
*Directory to read the input file from*

**input_filename**<br> 
*Name of the input file*

**filepath_out**<br>
*Directory to write the output file*

**output_filename**<br> 
*Name of the output file*

**chunk_size**<br>
*How many lines to read before processing and writing*
* *too few and you'll have too many write operations*
* *too many and you'll use too much memory*

In [None]:
# Placeholder configuration: fill in the directories and file names before running.
filepath_in = ''
input_filename = ''
filepath_out = ''
output_filename = ''
chunk_size = 100000  # Lines held in memory between writes

# For single file use
# os.path.join inserts the path separator when a directory lacks a trailing one.
parse_file(os.path.join(filepath_in, input_filename), chunk_size,
           os.path.join(filepath_out, output_filename))

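## Setting Up Revised Functions

The revised functions keep only top-level comments (those whose `parent_id` starts with `t3_`) from ten selected subreddits and write them to one CSV file per subreddit.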

In [None]:
import os
import pandas as pd
import json
import csv 

# Function for writing output
def write_top10_file(df, fileout, subreddits=None):
    """Append top-level comments from selected subreddits to per-subreddit CSVs.

    A row is kept when its ``subreddit`` is one of ``subreddits`` and its
    ``parent_id`` starts with ``'t3_'`` (missing ``parent_id`` values are
    treated as non-matching). Rows for each subreddit go to
    ``fileout + '_' + subreddit + '.csv'``; the header is written only when
    that file does not exist yet. A file is created for every listed
    subreddit, even when the chunk holds no rows for it.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to filter; must contain ``subreddit`` and ``parent_id`` columns.
    fileout : str
        Path prefix shared by all output files.
    subreddits : list of str, optional
        Subreddits to export. Defaults to the ten subreddits listed below.
    """
    if subreddits is None:
        subreddits = ['AskReddit', 'pics', 'funny', 'videos', 'todayilearned',
                      'AdviceAnimals', 'news', 'WTF', 'worldnews', 'nfl']
    selected = df[(df['subreddit'].isin(subreddits))
                  & (df['parent_id'].str.startswith('t3_', na=False))]
    # A single formatted argument prints the same text on Python 2 and 3.
    print('Writing %s entries.' % (selected.shape[0],))
    for subreddit in subreddits:
        path = fileout + '_' + subreddit + '.csv'
        # Append mode creates the file when it is missing; only the header
        # flag depends on whether an earlier chunk already wrote this file.
        selected[selected['subreddit'] == subreddit].to_csv(
            path, mode='a', header=not os.path.isfile(path),
            encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)

# Function for reading file with given parameters
def parse_file_for_top10(filein, amount, fileout):
    """Stream a line-delimited JSON file into per-subreddit CSV files.

    The file is read one line at a time. Every ``amount`` parsed records are
    turned into a DataFrame and handed to ``write_top10_file``, which keeps
    memory use bounded by the chunk size. Any records left over at the end of
    the file are written as a final, smaller chunk.

    Parameters
    ----------
    filein : str
        Path of the input file holding one JSON object per line.
    amount : int
        Number of records to accumulate before each write.
    fileout : str
        Output path prefix passed on to ``write_top10_file``.
    """
    print('Opening %s' % (filein,))
    with open(filein, 'r') as f:
        lines = []
        for line in f:
            # Strip instead of slicing off the last character: a final line
            # without a trailing newline would otherwise lose its closing
            # brace, and blank lines would crash json.loads.
            line = line.strip()
            if not line:
                continue
            lines.append(json.loads(line))
            if len(lines) >= amount:  # Chunk is full: write it and free memory
                write_top10_file(pd.DataFrame(lines), fileout)
                lines = []
        if lines:  # Write final lines of file (if any)
            write_top10_file(pd.DataFrame(lines), fileout)
    print('Complete! All top10 subreddit records from %s written to %s'
          % (filein, fileout + '_[subreddit].csv'))


In [None]:
# NOTE: these are machine-specific absolute paths; adjust them for your environment.
filepath_in = 'C:/Users/jsmoo/Desktop/Reddit Data/2015_reddit_comments_corpus/reddit_data/2015/'
input_filename = 'RC_2015-01'
filepath_out = 'W:/CSCE489/Start Data/test/'
output_filename = 'RC_2015-01_toplevel'  # Prefix; '_<subreddit>.csv' is appended per file
chunk_size = 300000  # Lines held in memory between writes

# For single file use
# os.path.join inserts the path separator when a directory lacks a trailing one.
parse_file_for_top10(os.path.join(filepath_in, input_filename), chunk_size,
                     os.path.join(filepath_out, output_filename))

In [6]:
import pandas as pd

# Load one of the per-subreddit CSV files to sanity-check its column types.
data = pd.read_csv('W:/CSCE489/Start Data/test/RC_2015-01_toplevel_AdviceAnimals.csv')

# An integer dtype means every value in the column parsed as a number; an
# object dtype would typically point at malformed rows in the CSV.
for column in ('gilded', 'score', 'ups'):
    # A single formatted argument prints the same text on Python 2 and 3.
    print('%s: %s' % (column, data[column].dtype))

gilded: int64
score: int64
ups: int64
