In [1]:
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime

### function to read parquet files into one dataframe

In [2]:
def read_parquet_df(file_list):
    '''
    Read several parquet files and stack them into one dataframe.

    Every path is loaded with ``pd.read_parquet`` and the resulting frames are
    concatenated row-wise in the order the paths are given. The per-file row
    indexes are dropped and replaced by a single fresh 0..n-1 index.

    source:
    https://stackoverflow.com/questions/57067551/

    :param file_list: iterable of parquet file paths
    :return: a dataframe holding the rows of every file
    :raises ValueError: if file_list contains no paths
    '''
    # Materialise first so one-shot iterables can be both checked and read.
    paths = list(file_list)
    if not paths:
        # An empty list usually means a glob pattern matched nothing; say so
        # here instead of surfacing pd.concat's "No objects to concatenate".
        raise ValueError('No parquet files to read: file_list is empty')
    frames = [pd.read_parquet(path) for path in paths]
    return pd.concat(frames, ignore_index=True)

### function to convert created_utc column to timestamp and truncate selftext and body columns

In [3]:
    def convert_time_truncate(dataframe, max_len=512):
        '''
        Convert created_utc from unix seconds to datetimes and truncate text columns.

        The dataframe is modified in place and the same object is returned.
        Calling the function again on an already converted frame is a no-op.

        source: https://stackoverflow.com/questions/16801162/ - for info about created_utc
        https://stackoverflow.com/questions/19231871/convert-unix-time-to-readable-date-in-pandas-dataframe
        https://stackoverflow.com/questions/27722658/set-max-string-length-in-pandas
        :param dataframe: a dataframe with a created_utc column holding unix
            timestamps in seconds (numeric or numeric strings)
        :param max_len: maximum number of characters kept in the 'body' and
            'selftext' columns (default 512)
        :return: the same dataframe, updated
        :raises KeyError: if the dataframe has no created_utc column
        '''
        # Skip the conversion when the column already holds datetimes: running
        # to_numeric on datetimes yields nanoseconds, which unit='s' would then
        # misinterpret on a second call.
        if not pd.api.types.is_datetime64_any_dtype(dataframe['created_utc']):
            dataframe['created_utc'] = pd.to_datetime(
                pd.to_numeric(dataframe['created_utc']), unit='s')

        # Truncate whichever text columns are present; a frame may carry
        # 'body', 'selftext', both, or neither.
        for column in ('body', 'selftext'):
            if column in dataframe.columns:
                dataframe[column] = dataframe[column].str[:max_len]
        return dataframe

### get file paths and read all parquet files in each folder into a dataframe

In [4]:
    # folders holding the parquet exports (relative to the notebook's working directory)
    all_activity = 'reddit-us-election-expanded2/all_activity/'
    comments = 'reddit-us-election-expanded2/election_comments'
    posts = 'reddit-us-election-expanded2/election_submissions'

    # get list of all .parquet files in each folder
    # glob returns matches in arbitrary, OS-dependent order; sorting keeps the
    # row order of the concatenated dataframes reproducible between runs.
    all_list = sorted(glob.glob(os.path.join(all_activity, '*.parquet')))
    comments_list = sorted(glob.glob(os.path.join(comments, '*.parquet')))
    posts_list = sorted(glob.glob(os.path.join(posts, '*.parquet')))

    # store data from each folder into their own dataframes:
    # read every file, then convert created_utc and truncate the text column
    all_activity_df = convert_time_truncate(read_parquet_df(all_list))
    comments_df = convert_time_truncate(read_parquet_df(comments_list))
    posts_df = convert_time_truncate(read_parquet_df(posts_list))

### view dataframes

In [5]:
# Display the dataframe built from the all_activity folder (the notebook abbreviates long frames).
all_activity_df

Unnamed: 0,author,id,body,subreddit,subreddit_id,created_utc,score,year,month
0,Anen-o-me,d3uhcdd,Prepare to be disappointed.,Libertarian,t5_2qh63,2016-06-03 14:26:27,2,2016,6
1,IPredictAReddit,cyot53l,&gt; You basically can't run a business withou...,Libertarian,t5_2qh63,2016-01-07 01:38:46,1,2016,1
2,Mentalpopcorn,d7atwp8,"Yeah but the Libertarian Party becoming ""moder...",Libertarian,t5_2qh63,2016-09-05 23:23:59,-9,2016,9
3,Rothbardgroupie,d4aifaj,I'm for companies that compete to provide meth...,Libertarian,t5_2qh63,2016-06-15 18:29:13,2,2016,6
4,untitleddocument37,dbsr80w,No. No They Won't.\n\nCanada is far far far mo...,Libertarian,t5_2qh63,2016-12-30 16:16:57,0,2016,12
...,...,...,...,...,...,...,...,...,...
64642,dustcircle2,5glkgj,What changes when Pope Francis grants all prie...,prochoice,t5_2rg0n,2016-12-05 12:06:48,13,2016,12
64643,birdinthebush74,5h33kn,The sharpest drops in abortion rates have been...,prochoice,t5_2rg0n,2016-12-07 22:39:10,31,2016,12
64644,birdinthebush74,43kcco,Women charged with murder after self aborting,prochoice,t5_2rg0n,2016-01-31 19:00:42,17,2016,1
64645,DeannDotson,48dsqx,The Most Important Abortion Case You Never Hea...,prochoice,t5_2rg0n,2016-03-01 01:35:30,1,2016,3


In [6]:
# Display the dataframe built from the election_comments folder (the notebook abbreviates long frames).
comments_df

Unnamed: 0,author,id,body,subreddit,subreddit_id,created_utc,score,year,month
0,[deleted],d41ir1j,"Wow, someone's been reading their Chomsky.\n\n...",democrats,t5_2qn70,2016-06-08 22:18:14,0,2016,6
1,michaelconfoy,czgtok2,"Wow, people with money shouldn't be liberal. H...",democrats,t5_2qn70,2016-01-29 20:46:31,1,2016,1
2,RealRepub,d7qhx7s,From the article.,democrats,t5_2qn70,2016-09-17 13:21:19,1,2016,9
3,AutoModerator,czh0niu,/r/democrats does not allow query strings in u...,democrats,t5_2qn70,2016-01-29 23:43:48,1,2016,1
4,VegaThePunisher,db9yk11,Who the fuck ever said they don't matter? \n\n,democrats,t5_2qn70,2016-12-16 17:39:21,3,2016,12
...,...,...,...,...,...,...,...,...,...
35697,WingzofIsis,d0znigo,I'll join you and I will add Fox News to that ...,uspolitics,t5_2qwlq,2016-03-14 18:56:58,1,2016,3
35698,SchlangeHatRecht,dbmbe70,I voted for Trump. I was banned from r/uncenso...,uspolitics,t5_2qwlq,2016-12-25 19:00:02,3,2016,12
35699,spaceghoti,d84ohqe,A non-supernatural version of Carrie?,uspolitics,t5_2qwlq,2016-09-27 21:41:06,2,2016,9
35700,donaldtrumptwat,czhtqai,....... And what a team !,uspolitics,t5_2qwlq,2016-01-30 19:42:04,1,2016,1


In [7]:
# Display the dataframe built from the election_submissions folder (the notebook abbreviates long frames).
posts_df

Unnamed: 0,author,id,title,selftext,subreddit,subreddit_id,created_utc,score,year,month
0,[deleted],5ilhlt,"""Wonderful World"" -- a short film about Trump'...",[deleted],Liberal,t5_2qxt5,2016-12-16 01:24:56,1,2016,12
1,HomelessYouthSupport,5g0dff,Help Indigenous trans person maintain housing ...,,Liberal,t5_2qxt5,2016-12-02 01:10:05,1,2016,12
2,AmericanLesionX,406owp,Obama Shuts Down ‘American Sniper’ Widow Who S...,,Liberal,t5_2qxt5,2016-01-09 15:39:40,4,2016,1
3,[deleted],4no8d3,"Laid-Off Americans, Required to Zip Lips on Wa...",[deleted],Liberal,t5_2qxt5,2016-06-12 01:25:14,7,2016,6
4,progress18,4cn21s,Donald Trump struggles to clarify abortion rem...,,Liberal,t5_2qxt5,2016-03-30 22:01:49,3,2016,3
...,...,...,...,...,...,...,...,...,...,...
33813,anutensil,4m4rc4,Iowa Governor Steps Into A Tangled Web Of Chur...,,democrats,t5_2qn70,2016-06-02 01:35:57,16,2016,6
33814,tawtaw,40u7ii,"The Republican Party’s 50-State Solution - ""Th...",,democrats,t5_2qn70,2016-01-13 21:20:33,5,2016,1
33815,kcmill12,3zw27d,Bernie Sanders is Headed to The White House,,democrats,t5_2qn70,2016-01-07 15:59:46,18,2016,1
33816,[deleted],49uu1t,Stop Bernie-Splaining to Black Voters,[deleted],democrats,t5_2qn70,2016-03-10 18:50:37,1,2016,3


### Save to CSV or parquet

In [8]:
#all_activity_df.to_parquet('allactivity2.parquet', index=False)
#all_activity_df.to_csv('allactivity.csv', index=False)

In [9]:
#posts_df.to_parquet('allsubmissions2.parquet',index=False)
#posts_df.to_csv('allsubmissions.csv', index=False)


In [10]:
# all_activity_df.sample(4000).to_parquet('allactivitysmall2.parquet', index=False)

In [11]:
# posts_df.sample(8000).to_parquet('allsubsmall2.parquet', index=False)