In [1]:
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime

### function to read parquet files into one dataframe

In [2]:
def read_parquet_df(file_list):
    '''
    read multiple parquet files into one dataframe

    Every file is loaded with pd.read_parquet and the resulting frames are
    concatenated row-wise in the order given. The per-file row indices are
    discarded and replaced by a fresh 0..n-1 index (ignore_index=True).

    sources:
    https://stackoverflow.com/questions/57067551/

    :param file_list: iterable of parquet file paths (e.g. the list returned
        by glob.glob)
    :return: a dataframe holding the rows of every file
    :raises ValueError: if file_list contains no paths
    '''
    # materialise once so generators are accepted and emptiness can be checked
    paths = list(file_list)
    if not paths:
        # pd.concat would raise a cryptic "No objects to concatenate" here;
        # fail early with a message that points at the likely cause instead
        raise ValueError(
            'read_parquet_df: no parquet files to read; check that the '
            'folder path and glob pattern match existing files'
        )
    frames = [pd.read_parquet(path) for path in paths]
    # concatenate all the data frames in the list
    return pd.concat(frames, ignore_index=True)

### function to convert created_utc column to timestamp and truncate selftext and body columns

In [3]:
    def convert_time_truncate(dataframe, max_length=512):
        '''
        convert created_utc unix timestamp to normal timestamp and truncate text

        The input frame is left untouched: the work is done on a copy, so
        calling the function again on the same raw frame gives the same result.
        Both text columns ('body' and 'selftext') are truncated when present;
        a frame that has neither is returned with only the timestamp converted.

        source: https://stackoverflow.com/questions/16801162/ - for info about created_utc
        https://stackoverflow.com/questions/19231871/convert-unix-time-to-readable-date-in-pandas-dataframe
        https://stackoverflow.com/questions/27722658/set-max-string-length-in-pandas
        :param dataframe: a dataframe with created_utc column in unix timestamp
            (seconds; numeric or numeric strings)
        :param max_length: maximum number of characters kept in each text
            column (default 512)
        :return: a new dataframe with created_utc as datetime and truncated text
        :raises KeyError: if the created_utc column is missing
        '''
        # copy first so the caller's frame is not modified behind their back
        converted = dataframe.copy()
        # to_numeric handles timestamps stored as strings; unit='s' reads them as seconds
        converted['created_utc'] = pd.to_datetime(
            pd.to_numeric(converted['created_utc']), unit='s'
        )
        # truncate every text column that exists instead of assuming exactly one does
        for text_column in ('body', 'selftext'):
            if text_column in converted.columns:
                converted[text_column] = converted[text_column].str[:max_length]
        return converted

### Get the folder paths and read all parquet files in each folder into a dataframe

In [4]:
    # folders that hold the parquet files for each dataset
    all_activity = 'reddit-us-election-expanded/all_activity/'
    comments = 'reddit-us-election-expanded/election_comments'
    posts = 'reddit-us-election-expanded/election_submissions'

    # get list of all .parquet files in each folder
    # glob returns matches in arbitrary order; sort them so the row order (and
    # the regenerated index) of the concatenated frames is reproducible
    all_list = sorted(glob.glob(os.path.join(all_activity, '*.parquet')))
    comments_list = sorted(glob.glob(os.path.join(comments, '*.parquet')))
    posts_list = sorted(glob.glob(os.path.join(posts, '*.parquet')))

    # store data from each folder into their own dataframes:
    # read every file, then convert created_utc and truncate the text column
    all_activity_df = convert_time_truncate(read_parquet_df(all_list))
    comments_df = convert_time_truncate(read_parquet_df(comments_list))
    posts_df = convert_time_truncate(read_parquet_df(posts_list))

### view dataframes

In [6]:
# Preview the combined all-activity frame; the rendered output elides the middle rows.
all_activity_df

Unnamed: 0,author,id,body,subreddit,subreddit_id,created_utc,score,year,month
0,JackassWhisperer,41jo0t,Texas' New Open-Carry Law Unpopular Among Some...,politics,t5_2cneq,2016-01-18 16:31:34,0,2016,1
1,romaniastronk,4m4fd8,Half of America thinks Hillary Clintons Emails...,politics,t5_2cneq,2016-06-02 00:15:44,1,2016,6
2,SavageLondon,54r400,Guns Facts USA. It is time for all Americans t...,politics,t5_2cneq,2016-09-27 15:30:31,0,2016,9
3,HamsterSandwich,540n8n,Donald Trump claims New York’s stop-and-frisk ...,politics,t5_2cneq,2016-09-22 19:10:56,1680,2016,9
4,beta_white_male,5im85c,Obama birth certificate investigation: Sheriff...,politics,t5_2cneq,2016-12-16 04:01:45,1,2016,12
...,...,...,...,...,...,...,...,...,...
43808,Shares_RSS,5gzzhl,Russia plans Yuan OFZ bond roadshow on Dec. 22...,Economics,t5_2qh1s,2016-12-07 13:54:21,1,2016,12
43809,Fittyakaferrari,4avjsh,Noahpinion: Autor on EconTalk,Economics,t5_2qh1s,2016-03-17 22:45:01,0,2016,3
43810,Shares_RSS,41vo1l,UPDATE 2-Bank of Canada holds rates steady as ...,Economics,t5_2qh1s,2016-01-20 18:38:27,1,2016,1
43811,Shares_RSS,4o2ct3,Jersey-based asset manager acquires Swedish is...,Economics,t5_2qh1s,2016-06-14 16:51:19,1,2016,6


In [7]:
# Preview the comments frame; the rendered output elides the middle rows.
comments_df

Unnamed: 0,author,id,body,subreddit,subreddit_id,created_utc,score,year,month
0,zyi21,d7sc190,"What if, for example, a relative of mine sends...",immigration,t5_2qvj0,2016-09-18 21:57:39,1,2016,9
1,greencardtiming,czcacff,"By partner, do you mean you're married? Becaus...",immigration,t5_2qvj0,2016-01-26 06:50:07,2,2016,1
2,ohheyhowareyoutoday,cymkg82,Welcome :),immigration,t5_2qvj0,2016-01-05 07:48:57,1,2016,1
3,OrlandoPokey,d7bldrw,It's like that here too for motorcyclesale. G...,immigration,t5_2qvj0,2016-09-06 15:45:48,1,2016,9
4,yesthisisdaniel,dav5meg,What if I'm unable to get any proof that I liv...,immigration,t5_2qvj0,2016-12-06 18:45:18,1,2016,12
...,...,...,...,...,...,...,...,...,...
23995,Wampawacka,d4n2xyl,Agreed. The rest of Europe has much to gain bu...,NeutralPolitics,t5_2tk0i,2016-06-25 02:22:42,3,2016,6
23996,huadpe,d83q0da,Can you source please?,NeutralPolitics,t5_2tk0i,2016-09-27 04:24:59,1,2016,9
23997,Gnome_Sane,d0wmkps,&gt; Obama got Bin Laden.\n\nThat grab was sev...,NeutralPolitics,t5_2tk0i,2016-03-12 00:23:01,0,2016,3
23998,GandhiMSF,db8fwup,"Correct, there was some support for them outsi...",NeutralPolitics,t5_2tk0i,2016-12-15 17:29:36,5,2016,12


In [8]:
# Preview the submissions frame; the rendered output elides the middle rows.
posts_df

Unnamed: 0,author,id,title,selftext,subreddit,subreddit_id,created_utc,score,year,month
0,YoungBrownSocrates,5fuc3z,Critical look at Standing Rock. Thoughts?,,Libertarian,t5_2qh63,2016-12-01 03:34:19,1,2016,12
1,granthonyj,51a96t,How have the Gary shills reconciled his disast...,Dude is barely even qualified to call himself ...,Libertarian,t5_2qh63,2016-09-05 16:40:14,0,2016,9
2,nolandus,53hpzd,A Trump Empire Built on Inside Connections and...,,Libertarian,t5_2qh63,2016-09-19 14:11:43,45,2016,9
3,bassistb0y,4nir4e,If you are shitting on Gary Johnson and you th...,,Libertarian,t5_2qh63,2016-06-10 22:14:51,2,2016,6
4,punkthesystem,439bdr,Help send C4SS prison reform/abolition panel t...,,Libertarian,t5_2qh63,2016-01-29 15:45:28,1,2016,1
...,...,...,...,...,...,...,...,...,...,...
23505,82364,4omz7y,Is healthcare as politicized an issue in Germa...,[removed],NeutralPolitics,t5_2tk0i,2016-06-18 03:53:36,1,2016,6
23506,mickey_patches,5l4f2v,U.S. Sanctions on Russia,[removed],NeutralPolitics,t5_2tk0i,2016-12-30 16:56:49,1,2016,12
23507,[deleted],48kqvp,"If a major, catastrophic world-wide economic c...",[removed],NeutralPolitics,t5_2tk0i,2016-03-02 05:27:05,1,2016,3
23508,TacticalFox88,48oyw4,Is campaign finance reform needed or is all of...,https://berniesanders.com/issues/money-in-poli...,NeutralPolitics,t5_2tk0i,2016-03-02 23:18:44,19,2016,3


### Save to Parquet / CSV (optional; the cells below are commented out)

In [9]:
#all_activity_df.to_parquet('allactivity.parquet', index=False)
#all_activity_df.to_csv('allactivity.csv', index=False)

In [10]:
#posts_df.to_parquet('allsubmissions.parquet',index=False)
#posts_df.to_csv('allsubmissions.csv', index=False)


In [6]:
#posts_df.sample(4000).to_parquet('allsubsmall.parquet', index=False)