In [2]:
import pandas as pd
import csv

#### Preprocess the_donald language model base data

In [4]:
#Load Data
#Column dtypes applied to every shard that is read with this mapping
col_types = {'author': str, 'body':str, 'subreddit':str, 'created_utc':int, 'score': int}

#The the_donald sample is split across two export shards
td_shard_paths = [
    './data/language_models/td_langsampdata_000000000000',
    './data/language_models/td_langsampdata_000000000001',
]

#Stack the shards with pd.concat: DataFrame.append is deprecated since pandas 1.4
#and removed in pandas 2.0. ignore_index=True renumbers the rows 0..n-1.
td_df = pd.concat(
    [pd.read_csv(shard_path, dtype = col_types) for shard_path in td_shard_paths],
    ignore_index=True,
)

In [5]:
#Preprocess Data
#Half-open created_utc window [start, end); read as Unix seconds these bounds
#are 2016-02-01 00:00 UTC and 2016-07-15 00:00 UTC
td_window_start = 1454284800
td_window_end = 1468540800

#Keep only rows from the relevant subreddit, then only rows inside the window
td_df = td_df.loc[td_df['subreddit'] == 'The_Donald']
td_after_start = td_df['created_utc'] >= td_window_start
td_before_end = td_df['created_utc'] < td_window_end
td_df = td_df.loc[td_after_start & td_before_end]

#Order chronologically (ties broken by author) and renumber the rows
td_df = td_df.sort_values(by=['created_utc','author'])
td_df = td_df.reset_index(drop=True)

#Force every body to a string (missing bodies were being read as NaN)
td_df['body'] = td_df['body'].map(str)

#Swap each open AND close paren for two spaces in a single pass
#(to account for hyperlink formatting on reddit)
td_paren_table = str.maketrans({"(": "  ", ")": "  "})
td_df['body'] = td_df['body'].map(lambda text: text.translate(td_paren_table))

In [6]:
#Collapse the frame to one row per author: all of an author's bodies are joined
#with single spaces into one combined "super post", in their current row order
td_posts_by_author = td_df.groupby('author')['body']
td_df = td_posts_by_author.apply(lambda posts: ' '.join(posts)).reset_index()

#basic_sanitize comes from Jack's fighting_words_py3 script and roughly sanitizes
#an input string; the sanitized text is then split on whitespace into tokens
from fighting_words_py3 import basic_sanitize
td_df['body'] = td_df['body'].apply(lambda text: basic_sanitize(text).split())

#Any token that starts with "http" is swapped for the hyperlink marker
td_df['body'] = td_df['body'].apply(
    lambda tokens: ['<HYPERLINK>' if token.startswith('http') else token for token in tokens]
)
td_df.head(10)

Unnamed: 0,author,body
0,-nyx-,"[the, ruling, coalition, already, doesnt, have..."
1,0piat3,"[do, you, think, the, black, gun, is, scarier,..."
2,1000Clowns,"[if, that, happens, this, republican, since, 1..."
3,1000stomachcrunches,"[well, yeah, despite, being, the, best, and, b..."
4,1011011,"[if, thats, the, case, she, should, be, pullin..."
5,11234a3,"[yeesh, who, jerks, it, to, 3d, women, anymore..."
6,1337and0,"[aw, dang, i, shouldve, changed, ricky, to, re..."
7,138glv,"[god, dam, rekt, we, can, all, agree, induce, ..."
8,13inchpoop,"[women, are, always, the, ones, bitching, abou..."
9,1875coalminer,"[whoa, put, a, nsfw, tag, on, that, too, much,..."


In [7]:
#Export data
#Write the per-author frame to CSV without the row index.
#NOTE(review): 'body' holds token lists at this point, so the CSV will contain
#their string form - confirm that downstream readers parse it back as intended.
td_export_path = './data/language_models/td_preproc_langmodel.csv'
td_df.to_csv(td_export_path, index=False)

#### Preprocess sandersforpresident language model base data

In [10]:
#Load Data
#The sandersforpresident sample is split across two export shards. Both must end
#up in sfp_df, because the cells below read only that name. Each shard is parsed
#with the shared col_types mapping (defined in the the_donald load cell) so the
#two halves have matching dtypes.
sfp_shard_paths = [
    './data/language_models/sfp_langmodeldata_000000000000',
    './data/language_models/sfp_langmodeldata_000000000001',
]

#pd.concat stacks the shards; ignore_index=True renumbers the rows 0..n-1
sfp_df = pd.concat(
    [pd.read_csv(shard_path, dtype = col_types) for shard_path in sfp_shard_paths],
    ignore_index=True,
)

In [11]:
#Preprocess Data
#Half-open created_utc window [start, end); read as Unix seconds these bounds
#are 2016-02-01 00:00 UTC and 2016-07-15 00:00 UTC
sfp_window_start = 1454284800
sfp_window_end = 1468540800

#Keep only rows from the relevant subreddit, then only rows inside the window
sfp_df = sfp_df.loc[sfp_df['subreddit'] == 'SandersForPresident']
sfp_after_start = sfp_df['created_utc'] >= sfp_window_start
sfp_before_end = sfp_df['created_utc'] < sfp_window_end
sfp_df = sfp_df.loc[sfp_after_start & sfp_before_end]

#Order chronologically (ties broken by author) and renumber the rows
sfp_df = sfp_df.sort_values(by=['created_utc','author'])
sfp_df = sfp_df.reset_index(drop=True)

#Force every body to a string (missing bodies were being read as NaN)
sfp_df['body'] = sfp_df['body'].map(str)

#Swap each open AND close paren for two spaces in a single pass
#(to account for hyperlink formatting on reddit)
sfp_paren_table = str.maketrans({"(": "  ", ")": "  "})
sfp_df['body'] = sfp_df['body'].map(lambda text: text.translate(sfp_paren_table))

In [12]:
#Collapse the frame to one row per author: all of an author's bodies are joined
#with single spaces into one combined "super post", in their current row order
sfp_posts_by_author = sfp_df.groupby('author')['body']
sfp_df = sfp_posts_by_author.apply(lambda posts: ' '.join(posts)).reset_index()

#basic_sanitize comes from Jack's fighting_words_py3 script and roughly sanitizes
#an input string; the sanitized text is then split on whitespace into tokens
from fighting_words_py3 import basic_sanitize
sfp_df['body'] = sfp_df['body'].apply(lambda text: basic_sanitize(text).split())

#Any token that starts with "http" is swapped for the hyperlink marker
sfp_df['body'] = sfp_df['body'].apply(
    lambda tokens: ['<HYPERLINK>' if token.startswith('http') else token for token in tokens]
)
sfp_df.head(10)

Unnamed: 0,author,body
0,-Gaka-,"[hes, gotten, voices, in, the, democratic, par..."
1,-GheeButtersnaps-,"[better, her, than, the, rest, yeah, this, is,..."
2,-MOPPET-,"[i, missed, the, registration, deadline, for, ..."
3,-SHMOHAWK-,"[yeah, please, fill, me, in, wonder, what, the..."
4,-Tesserex-,"[i, dont, know, why, but, when, i, read, send,..."
5,-chia-,"[just, common, sense, from, bernie, here, no, ..."
6,0116316,"[i, dont, have, a, choice, my, work, leaves, c..."
7,0826,"[me, too, i, was, starting, to, think, this, w..."
8,0beatbryce,"[ummm, have, you, read, the, comments, people,..."
9,0m3r7a,"[right, its, amazing, that, people, completely..."


In [13]:
#Export data
#Write the per-author frame to CSV without the row index.
#NOTE(review): 'body' holds token lists at this point, so the CSV will contain
#their string form - confirm that downstream readers parse it back as intended.
sfp_export_path = './data/language_models/sfp_preproc_langmodel.csv'
sfp_df.to_csv(sfp_export_path, index=False)

In [412]:
#Preview the first rows of sfp_df.
#NOTE(review): this cell carries execution count 412 and its saved output lists the
#raw columns (author, body, subreddit, created_utc, score), whereas the grouping
#cell above reduces sfp_df to author/body. The saved output therefore looks stale
#(captured before the grouping step) - re-run from a fresh kernel or drop this cell.
sfp_df.head()

Unnamed: 0,author,body,subreddit,created_utc,score
0,Jesuslordofporn,"If that is the case, I wonder if he would make...",SandersForPresident,1454297222,0
1,2gurl2gurl,The key through getting legislation through Co...,SandersForPresident,1454302379,5
2,2gurl2gurl,They win because we sit at home unless we are ...,SandersForPresident,1454302485,2
3,jrrl,"Or, better, spend those 30 minutes meeting oth...",SandersForPresident,1454305172,11
4,Smittyblack,Does he have a guy coming in early for himself...,SandersForPresident,1454306535,6
