In [11]:
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime

### function to read parquet files into one dataframe

In [12]:
def read_parquet_df(file_list):
    '''
    Read several parquet files and stack their rows into one dataframe.

    source: https://stackoverflow.com/questions/57067551/

    :param file_list: iterable of parquet file paths; each item is passed
        to pd.read_parquet as-is
    :return: a dataframe with the rows of every file, in the order the
        files were given, re-indexed from 0
    :raises ValueError: if file_list contains no paths
    '''
    frames = [pd.read_parquet(path) for path in file_list]
    if not frames:
        # pd.concat([]) would fail with the unhelpful "No objects to
        # concatenate"; point at the actual cause instead.
        raise ValueError(
            'no parquet files to read: file_list is empty '
            '(check the folder path / glob pattern)'
        )
    # ignore_index drops the per-file indexes so the result is numbered 0..n-1
    return pd.concat(frames, ignore_index=True)

### function to convert created_utc column to timestamp and truncate selftext and body columns

In [13]:
    def convert_time_truncate(dataframe, max_len=512):
        '''
        Convert created_utc from a unix timestamp (seconds) to a pandas
        datetime and cut the text columns down to max_len characters.

        The dataframe is modified in place and the same object is returned.
        Calling the function again on an already-converted frame is safe:
        created_utc is left alone once it holds datetimes.

        source: https://stackoverflow.com/questions/16801162/ - for info about created_utc
        https://stackoverflow.com/questions/19231871/convert-unix-time-to-readable-date-in-pandas-dataframe
        https://stackoverflow.com/questions/27722658/set-max-string-length-in-pandas

        :param dataframe: a dataframe with a created_utc column holding unix
            timestamps in seconds (numbers or numeric strings)
        :param max_len: maximum number of characters kept in the body and
            selftext columns (default 512)
        :return: the same dataframe, after conversion and truncation
        :raises KeyError: if the dataframe has no created_utc column
        '''
        # The epoch-seconds conversion is only valid for raw values; skip it
        # when the column already holds datetimes (e.g. the cell was re-run).
        if not pd.api.types.is_datetime64_any_dtype(dataframe['created_utc']):
            dataframe['created_utc'] = pd.to_datetime(
                pd.to_numeric(dataframe['created_utc']), unit='s'
            )
        # Truncate every text column that is present, rather than picking one:
        # a frame may carry body, selftext, both, or neither.
        for column in ('body', 'selftext'):
            if column in dataframe.columns:
                dataframe[column] = dataframe[column].str[:max_len]
        return dataframe

### get folder paths and read all parquet files in each folder into a dataframe

In [14]:
    # folders holding the parquet exports (relative to the notebook's working directory)
    all_activity = 'reddit-us-election-expanded2/all_activity/'
    comments = 'reddit-us-election-expanded2/election_comments'
    posts = 'reddit-us-election-expanded2/election_submissions'

    # list every .parquet file in each folder; glob returns paths in
    # file-system order, so sort them to keep the row order of the combined
    # frames the same from run to run
    all_list = sorted(glob.glob(os.path.join(all_activity, '*.parquet')))
    comments_list = sorted(glob.glob(os.path.join(comments, '*.parquet')))
    posts_list = sorted(glob.glob(os.path.join(posts, '*.parquet')))

    # load each folder into its own dataframe, then convert created_utc and
    # truncate the text column
    all_activity_df = convert_time_truncate(read_parquet_df(all_list))
    comments_df = convert_time_truncate(read_parquet_df(comments_list))
    posts_df = convert_time_truncate(read_parquet_df(posts_list))

### view dataframes

In [15]:
# Preview the frame loaded from the all_activity folder (large frames render truncated).
all_activity_df

Unnamed: 0,author,id,body,subreddit,subreddit_id,created_utc,score,year,month
0,Anen-o-me,d3uhcdd,Prepare to be disappointed.,Libertarian,t5_2qh63,2016-06-03 14:26:27,2,2016,6
1,IPredictAReddit,cyot53l,&gt; You basically can't run a business withou...,Libertarian,t5_2qh63,2016-01-07 01:38:46,1,2016,1
2,Mentalpopcorn,d7atwp8,"Yeah but the Libertarian Party becoming ""moder...",Libertarian,t5_2qh63,2016-09-05 23:23:59,-9,2016,9
3,Rothbardgroupie,d4aifaj,I'm for companies that compete to provide meth...,Libertarian,t5_2qh63,2016-06-15 18:29:13,2,2016,6
4,untitleddocument37,dbsr80w,No. No They Won't.\n\nCanada is far far far mo...,Libertarian,t5_2qh63,2016-12-30 16:16:57,0,2016,12
...,...,...,...,...,...,...,...,...,...
64642,dustcircle2,5glkgj,What changes when Pope Francis grants all prie...,prochoice,t5_2rg0n,2016-12-05 12:06:48,13,2016,12
64643,birdinthebush74,5h33kn,The sharpest drops in abortion rates have been...,prochoice,t5_2rg0n,2016-12-07 22:39:10,31,2016,12
64644,birdinthebush74,43kcco,Women charged with murder after self aborting,prochoice,t5_2rg0n,2016-01-31 19:00:42,17,2016,1
64645,DeannDotson,48dsqx,The Most Important Abortion Case You Never Hea...,prochoice,t5_2rg0n,2016-03-01 01:35:30,1,2016,3


In [16]:
# Preview the frame loaded from the election_comments folder (large frames render truncated).
comments_df

Unnamed: 0,author,id,body,subreddit,subreddit_id,created_utc,score,year,month
0,Anen-o-me,d3uhcdd,Prepare to be disappointed.,Libertarian,t5_2qh63,2016-06-03 14:26:27,2,2016,6
1,IPredictAReddit,cyot53l,&gt; You basically can't run a business withou...,Libertarian,t5_2qh63,2016-01-07 01:38:46,1,2016,1
2,Mentalpopcorn,d7atwp8,"Yeah but the Libertarian Party becoming ""moder...",Libertarian,t5_2qh63,2016-09-05 23:23:59,-9,2016,9
3,Rothbardgroupie,d4aifaj,I'm for companies that compete to provide meth...,Libertarian,t5_2qh63,2016-06-15 18:29:13,2,2016,6
4,untitleddocument37,dbsr80w,No. No They Won't.\n\nCanada is far far far mo...,Libertarian,t5_2qh63,2016-12-30 16:16:57,0,2016,12
...,...,...,...,...,...,...,...,...,...
35697,EnigmaticTortoise,d0u1nj7,That's completely false. Cruz will take more t...,Conservative,t5_2qh6p,2016-03-09 23:56:04,0,2016,3
35698,VirginWizard69,dbqh0hk,Awesome. I am going to buy one.,Conservative,t5_2qh6p,2016-12-28 23:33:51,8,2016,12
35699,MasterTacticianAlba,d0j53as,"&gt; She shouted ""I hate democracy. I am a ter...",Conservative,t5_2qh6p,2016-03-01 07:53:44,1,2016,3
35700,aboutaprettysunset,d0uc8cw,This is news? Saudi money is involved with eve...,Conservative,t5_2qh6p,2016-03-10 04:23:54,0,2016,3


In [17]:
# Preview the frame loaded from the election_submissions folder (large frames render truncated).
posts_df

Unnamed: 0,author,id,title,selftext,subreddit,subreddit_id,created_utc,score,year,month
0,cencalfeminist,5kvd8d,"[CA, USA]Question about citizenship and gettin...",Hello r/immigration this question may have bee...,immigration,t5_2qvj0,2016-12-29 06:23:48,1,2016,12
1,Ilikethishandle,3z3r80,Immigrating to USA (via marriage) but want to ...,"Hey all,\nI'll be moving to the USA within the...",immigration,t5_2qvj0,2016-01-02 04:54:06,2,2016,1
2,ipswitch_master,52eilb,ssn application was returned...,My mailed ssn application was returned to me b...,immigration,t5_2qvj0,2016-09-12 14:13:50,2,2016,9
3,aLbie215,41vs8i,Reaching out to Reddit for a very quick immigr...,My SO and I are planning to get married within...,immigration,t5_2qvj0,2016-01-20 19:00:51,2,2016,1
4,silversurfer1987,48du3m,Getting my girlfriend to come to USA from Croa...,"Hi, I did a search and didn't find much.\n\nBa...",immigration,t5_2qvj0,2016-03-01 01:44:10,1,2016,3
...,...,...,...,...,...,...,...,...,...,...
33813,conantheking,4pf4sd,Donald Trump raises $19M for joint RNC campaig...,,AmericanPolitics,t5_2qied,2016-06-23 05:23:29,2,2016,6
33814,TotalNewsTV,405ovk,BREAKING : A CIVIL WAR IN THE AMERICAN GOVERNM...,,AmericanPolitics,t5_2qied,2016-01-09 09:14:05,1,2016,1
33815,conantheking,524byz,How to Solve the Illegal Immigration Problem,,AmericanPolitics,t5_2qied,2016-09-10 18:12:16,0,2016,9
33816,monkeydeluxe,4nht24,Obama Prolongs Unwinnable Wars,,AmericanPolitics,t5_2qied,2016-06-10 18:47:55,2,2016,6


### Save to CSV or parquet

In [18]:
#all_activity_df.to_parquet('allactivity2.parquet', index=False)
#all_activity_df.to_csv('allactivity.csv', index=False)

In [19]:
#posts_df.to_parquet('allsubmissions2.parquet',index=False)
#posts_df.to_csv('allsubmissions.csv', index=False)


In [20]:
# Write a 4000-row random sample of all_activity_df; the fixed random_state
# makes a re-run produce the same file instead of a different sample each time.
all_activity_df.sample(4000, random_state=42).to_parquet('allactivitysmall2.parquet', index=False)

In [21]:
# Write an 8000-row random sample of posts_df; the fixed random_state makes
# a re-run produce the same file instead of a different sample each time.
posts_df.sample(8000, random_state=42).to_parquet('allsubsmall2.parquet', index=False)