In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from tqdm import tqdm
import ast

#### Preprocess the_donald user data

In [2]:
#Load Data
col_types = {'author': str, 'body':str, 'subreddit':str, 'created_utc':int, 'score': int}
td_df = pd.read_csv('./data/user_histories/td_userhistdata_000000000000', dtype = col_types)
td_df = td_df.append(pd.read_csv('./data/user_histories/td_userhistdata_000000000001', dtype = col_types), ignore_index=True)

In [3]:
#Replace the single bot found in the file
td_df = td_df[td_df['author'] != 'topredditbot'] # remove u/topredditbot
td_df = td_df.append(pd.read_csv('./data/user_histories/td_userhistdata_replacement.csv', dtype = col_types), ignore_index =True)

#Change Unix Time to DateTime 
td_df['created_utc'] = td_df['created_utc'].apply(lambda x: dt.datetime.fromtimestamp(x))
td_df.head()

Unnamed: 0,author,body,subreddit,created_utc,score
0,berlinbrown,I know how to subvert the evil tyranny of the ...,reddit.com,2007-03-21 16:30:39,8
1,berlinbrown,"hey, I know you from #lisp.\n\nMy site runs ab...",programming,2008-01-31 17:47:04,1
2,cleverkid,Hahaha! Ultimate prank! Nice..,WTF,2008-11-12 01:00:53,2
3,cleverkid,AMEN!,WeAreTheMusicMakers,2008-12-11 17:51:14,5
4,berlinbrown,"Yet another license nazi.\n\nClearly, google g...",programming,2008-01-17 03:38:14,1


In [4]:
#Get list of authors and thier first post on r/the_donald
first_post_df = td_df[td_df['subreddit'] == 'The_Donald']
first_post_df = first_post_df.groupby(['author']).min().reset_index()

#Covert this to a dictionary for cleaner use later
first_post_dict =pd.DataFrame(first_post_df['created_utc'])
first_post_dict.index = first_post_df['author']
first_post_dict = first_post_dict.to_dict()['created_utc']

first_post_dict

{'MettaWorldPeach': Timestamp('2016-04-03 13:33:53'),
 'danxmason': Timestamp('2016-04-12 10:01:20'),
 'Chewiemuse': Timestamp('2016-03-24 12:04:01'),
 'cmVkZGl0': Timestamp('2016-04-10 22:03:55'),
 'Soezin': Timestamp('2016-04-05 18:48:11'),
 'pbnbj': Timestamp('2016-03-03 03:56:29'),
 'raffraffraff': Timestamp('2016-03-09 04:48:49'),
 'nojob4acowboy': Timestamp('2016-03-12 11:01:32'),
 'thrashertm': Timestamp('2016-02-25 10:08:41'),
 'Le_Pretre': Timestamp('2016-04-15 22:41:16'),
 'tatermonkey': Timestamp('2016-03-27 18:34:18'),
 'tenspeed2': Timestamp('2016-04-09 04:03:41'),
 'SwggrBck': Timestamp('2016-03-04 00:00:40'),
 'FlappingBird': Timestamp('2016-04-28 12:07:19'),
 'T0DDTHEGOD': Timestamp('2016-03-17 03:34:46'),
 'turbodan1': Timestamp('2016-03-24 09:51:00'),
 'Blesbok': Timestamp('2016-03-09 16:15:40'),
 'quebecesti': Timestamp('2016-03-29 10:30:56'),
 'Nyaandere': Timestamp('2016-05-31 01:08:29'),
 'EvilKHANevil': Timestamp('2016-05-27 15:48:08'),
 'wompinator': Timestamp('

In [23]:
from fighting_words_py3 import basic_sanitize 

td_features_df = first_post_df.copy()[['created_utc']]
td_features_df.index = list(first_post_df.copy()['author'])
for num_months in range(0,48): #48 months - (4 years)

    #Init columns to be filled in 
    sub_colname = 'sub_t' + str(-(num_months + 1)) 
    text_colname = 'text_t' + str(-(num_months + 1)) 

    td_features_df.loc[:,sub_colname] = list(first_post_df.copy()['author'])
    td_features_df.loc[:,text_colname] = list(first_post_df.copy()['author'])

    
    for author in tqdm(td_features_df.index):
        #Init author-specific time frame
        starttime = first_post_dict[author] - dt.timedelta(days = 30 * (num_months + 1))
        endtime = first_post_dict[author] - dt.timedelta(days = 30 * num_months)

        #Filter td_df for the above parameters
        filter_td_df = td_df.copy()[(td_df['created_utc'] < endtime) & (td_df['created_utc'] >= starttime) & (td_df['author'] == author)]

        #Check if filter_td_df is empty (no posts made in time frame)
        if len(filter_td_df) == 0: 
            td_features_df.loc[author,sub_colname] = str([])
            td_features_df.loc[author,text_colname] =  str([])
        
        else:
            ###Get where one participated on a subreddit
            td_features_df.loc[author,sub_colname] = str(list(filter_td_df['subreddit'].unique()))

            ###Get tokenized and preprocessed words
            filter_td_df.loc[:,'body'] = filter_td_df['body'].apply(lambda x: str(x))
            filter_td_df.loc[:,'body'] = filter_td_df['body'].apply(lambda x: x.replace(r"(","  ")) 
            filter_td_df.loc[:,'body'] = filter_td_df['body'].apply(lambda x: x.replace(r")","  ")) 

            #Group data s.t. authors/bodies are grouped together
            filter_td_df = filter_td_df.groupby('author')['body'].apply(lambda x: "%s" % ' '.join(x)).reset_index()
            #This should result in 1 combined string per person (each person posts one "super post" made up of all their posts)

            #Import and utilize a function from Jack's script which roughly sanitizes input strings
            filter_td_df.loc[:,'body'] = filter_td_df['body'].apply(lambda x: basic_sanitize(x).split()) 

            #Replace all strings which start with "http" with a marker for hyperlinks
            td_features_df.loc[author,text_colname] =  str(list(filter_td_df['body'].apply(lambda x: [i if i[0:4] != 'http' else '<HYPERLINK>' for i in x])[0]))
td_features_df.to_csv('td_gensubreddit_feats.csv')
td_features_df.head()


  0%|                                                                                         | 0/2000 [00:00<?, ?it/s]
  0%|                                                                                 | 1/2000 [00:00<18:30,  1.80it/s]
  0%|                                                                                 | 2/2000 [00:01<17:32,  1.90it/s]
  0%|                                                                                 | 3/2000 [00:01<16:49,  1.98it/s]
  0%|▏                                                                                | 4/2000 [00:01<16:14,  2.05it/s]
  0%|▏                                                                                | 5/2000 [00:02<15:51,  2.10it/s]
  0%|▏                                                                                | 6/2000 [00:02<14:56,  2.22it/s]
  0%|▎                                                                                | 7/2000 [00:03<15:01,  2.21it/s]
  0%|▎                                 

Unnamed: 0,created_utc,sub_t-1,text_t-1,sub_t-2,text_t-2,sub_t-3,text_t-3,sub_t-4,text_t-4,sub_t-5,...,sub_t-44,text_t-44,sub_t-45,text_t-45,sub_t-46,text_t-46,sub_t-47,text_t-47,sub_t-48,text_t-48
-Beth-,2016-03-18 14:12:41,"['AdviceAnimals', 'wow', 'hearthstone', 'ImGoi...","['why', 'would', 'they', 'be', 'asked', 'direc...","['CringeAnarchy', 'sadcringe', 'atheism', 'rel...","['well', 'it', 'was', 'a', 'huge', 'issue', 'a...","['cringepics', 'gaming', 'MapPorn', 'CringeAna...","['this', 'whole', 'thread', 'baffled', 'me', '...","['CringeAnarchy', 'AdviceAnimals', 'gaming', '...","['this', 'is', 'so', 'spot', 'on', 'ive', 'see...","['BlackPeopleTwitter', 'gaming', 'AdviceAnimal...",...,[],[],[],[],[],[],[],[],[],[]
-Oberlander,2016-03-17 20:18:45,"['MadeMeSmile', 'explainlikeimfive', 'battlefi...","['you', 'dont', 'know', 'what', 'uuranometria'...","['explainlikeimfive', 'Showerthoughts', 'chemi...","['both', 'tos', 'and', 'tng', 'have', 'been', ...","['chemicalreactiongifs', 'AskReddit', 'Battlef...","['the', 'coldest', 'natural', 'place', 'in', '...","['mildlyinteresting', 'tifu', 'Nordiccountries...","['so', 'these', 'stairs', 'will', 'make', 'a',...","['shittyideas', 'AskReddit', 'space', 'mildlyi...",...,[],[],[],[],[],[],[],[],[],[]
-Shank-,2016-02-28 10:45:58,"['leagueoflegends', 'news', 'FortWorth', 'Kota...","['sleeper', 'pick', 'in', 'competitive', 'has'...","['leagueoflegends', 'AskReddit', 'Conservative...","['crumbzz', 'is', 'definitely', 'playing', 'li...","['leagueoflegends', 'Games', 'Dallas', 'Kotaku...","['did', 'you', 'get', 'a', 'onehit', 'body', '...","['leagueoflegends', 'KotakuInAction', 'funny',...","['twitch', 'chat', 'knowingly', 'looks', 'for'...","['KotakuInAction', 'leagueoflegends', 'DotA2',...",...,[],[],[],[],[],[],[],[],[],[]
05banks,2016-02-17 09:46:38,"['soccer', 'food', 'AskReddit', 'gifs', 'me_ir...","['i', 'mean', 'for', 'me', 'id', 'play', 'roon...","['funny', 'Gunners', 'AdviceAnimals', 'soccer'...","['people', 'who', 'suffered', 'fgm', 'wouldnt'...","['worldnews', 'soccer', 'funny', 'BlackPeopleT...","['no', 'hes', 'not', 'not', 'in', 'this', 'con...","['AskReddit', 'BlackPeopleTwitter', 'Gunners',...","['american', 'accents', 'sound', 'camp', 'to',...","['soccer', 'unitedkingdom', 'AskReddit', 'me_i...",...,[],[],['trees'],"['its', 'awful', 'we', 'call', 'it', 'pulling'...",[],[],[],[],[],[]
14000_calories_later,2016-05-09 21:10:58,"['GetMotivated', 'ImGoingToHellForThis', 'spor...","['nope', 'billionaire', 'ufc', 'professional',...","['GetMotivated', 'television', 'gifs', 'pics',...","['then', 'to', 'hold', 'all', 'things', 'equal...","['personalfinance', 'GetMotivated', 'listentot...","['audits', 'can', 'vary', 'substantially', 'so...","['UpliftingNews', 'funny', 'movies', 'pics', '...","['the', 'internet', 'has', 'ruined', 'me', 'wh...","['movies', 'sports']",...,[],[],[],[],[],[],[],[],[],[]


In [33]:
for col in td_features_df.columns:
    if col != 'created_utc':
        td_features_df[col] = td_features_df[col].apply(lambda x: ast.literal_eval(x))
td_features_df

Unnamed: 0,created_utc,sub_t-1,text_t-1,sub_t-2,text_t-2,sub_t-3,text_t-3,sub_t-4,text_t-4,sub_t-5,...,sub_t-44,text_t-44,sub_t-45,text_t-45,sub_t-46,text_t-46,sub_t-47,text_t-47,sub_t-48,text_t-48
-Beth-,2016-03-18 14:12:41,"[AdviceAnimals, wow, hearthstone, ImGoingToHel...","[why, would, they, be, asked, directly, by, a,...","[CringeAnarchy, sadcringe, atheism, relationsh...","[well, it, was, a, huge, issue, at, the, time,...","[cringepics, gaming, MapPorn, CringeAnarchy, S...","[this, whole, thread, baffled, me, until, i, r...","[CringeAnarchy, AdviceAnimals, gaming, pcmaste...","[this, is, so, spot, on, ive, seen, so, many, ...","[BlackPeopleTwitter, gaming, AdviceAnimals, vi...",...,[],[],[],[],[],[],[],[],[],[]
-Oberlander,2016-03-17 20:18:45,"[MadeMeSmile, explainlikeimfive, battlefield_4...","[you, dont, know, what, uuranometria, has, don...","[explainlikeimfive, Showerthoughts, chemicalre...","[both, tos, and, tng, have, been, remastered, ...","[chemicalreactiongifs, AskReddit, Battlefield,...","[the, coldest, natural, place, in, the, univer...","[mildlyinteresting, tifu, Nordiccountries, shi...","[so, these, stairs, will, make, a, second, flo...","[shittyideas, AskReddit, space, mildlyinterest...",...,[],[],[],[],[],[],[],[],[],[]
-Shank-,2016-02-28 10:45:58,"[leagueoflegends, news, FortWorth, KotakuInAct...","[sleeper, pick, in, competitive, has, a, doubl...","[leagueoflegends, AskReddit, Conservative, Kot...","[crumbzz, is, definitely, playing, like, the, ...","[leagueoflegends, Games, Dallas, KotakuInActio...","[did, you, get, a, onehit, body, kill, through...","[leagueoflegends, KotakuInAction, funny, PS4, ...","[twitch, chat, knowingly, looks, for, those, l...","[KotakuInAction, leagueoflegends, DotA2, Gamin...",...,[],[],[],[],[],[],[],[],[],[]
05banks,2016-02-17 09:46:38,"[soccer, food, AskReddit, gifs, me_irl, britis...","[i, mean, for, me, id, play, rooney, as, a, a,...","[funny, Gunners, AdviceAnimals, soccer, Tumblr...","[people, who, suffered, fgm, wouldnt, know, wh...","[worldnews, soccer, funny, BlackPeopleTwitter,...","[no, hes, not, not, in, this, context, hes, no...","[AskReddit, BlackPeopleTwitter, Gunners, funny...","[american, accents, sound, camp, to, me, in, t...","[soccer, unitedkingdom, AskReddit, me_irl, vid...",...,[],[],[trees],"[its, awful, we, call, it, pulling, a, whitey,...",[],[],[],[],[],[]
14000_calories_later,2016-05-09 21:10:58,"[GetMotivated, ImGoingToHellForThis, sports, F...","[nope, billionaire, ufc, professional, wrestli...","[GetMotivated, television, gifs, pics, gaming,...","[then, to, hold, all, things, equal, a, bear, ...","[personalfinance, GetMotivated, listentothis]","[audits, can, vary, substantially, sometimes, ...","[UpliftingNews, funny, movies, pics, AskReddit...","[the, internet, has, ruined, me, why, do, i, f...","[movies, sports]",...,[],[],[],[],[],[],[],[],[],[]
1618allTheThings,2016-05-02 21:44:24,[],[],[],[],[],[],"[chomsky, pics, startrek]","[thank, you, and, yes, like, you, i, skimmed, ...",[],...,"[China, travel, Calgary, cats, malehairadvice,...","[its, long, life, btw, japan, but, asia, in, g...","[Calgary, realasians, China, nsfw, Fitness, fu...","[whats, stopping, him, of, making, all, scoote...","[ADHD, Fitness, AskReddit, IAmA, BipolarReddit]","[wear, a, blazersports, jacket, try, bcaa, pun...","[nsfw, China, pics]","[flawlessvictory, 2nd, paragraph, of, edit, 2y...",[AskReddit],"[you, sir, get, an, fine, up, vote, as, i, was..."
1c3b3rg,2016-03-15 23:04:46,"[guns, Justrolledintotheshop, KidneyStones, ca...","[better, call, saul, if, apple, made, cars, th...","[AskReddit, Glocks, worldnews, MosinNagant, ti...","[my, clients, are, full, of, them, can, you, r...","[popping, AskReddit, WTF, Glocks, cincinnati, ...","[all, popping, videos, should, have, gnarly, s...","[CowboyAction, KidneyStones, instant_regret]","[do, you, reload, no, problem, pm, me, if, you...","[MosinNagant, guns, TooMeIrlForMeIrl]",...,[],[],[],[],[],[],[],[],[],[]
1duke1522,2016-05-13 01:51:26,"[asoiaf, sex, worldnews, politics, gaming, gam...","[exactly, what, i, meant, with, rickon, it, co...","[gaming, gifs, worldnews, MMORPG, sex, movies,...","[to, see, your, penis, go, lift, weights, sorr...","[gameofthrones, politics, nsfw, gifs, blackdes...","[already, known, from, the, books, one, candid...",[worldnews],"[brainwashed, idiot, here, dont, bother]",[AdviceAnimals],...,[],[],[],[],[seduction],"[fag, drinking, in, college, is, one, of, the,...",[],[],[],[]
1lon3lycubone,2016-05-07 21:57:57,"[FiftyFifty, wifesharing, DirtySnapchat, leggi...","[ah, the, old, reddit, switcha, fuck, it, im, ...","[IAmA, wifesharing]","[can, you, show, us, the, movie, clip, that, w...",[shittyaskreddit],"[im, 20, and, im, an, orkin, man]",[hattiesburg],"[no, bernie, sanders, is, an, idiot]","[StarWars, nsfw_gifs, Blowjobs, hattiesburg, A...",...,"[WTF, fffffffuuuuuuuuuuuu, AskReddit, zelda, p...","[i, did, this, four, years, ago, and, complete...","[AskReddit, gentlemanboners, gifs, Fallout, to...","[nothing, good, is, going, to, happen, today, ...","[trees, fffffffuuuuuuuuuuuu, AskReddit, nsfw_g...","[hey, op, you, wouldnt, happen, to, be, from, ...","[zelda, pics, funny, circlejerk, Minecraft, As...","[lol, thanks, man, i, really, dont, post, much...","[WTF, AskReddit, assassinscreed, gaming, video...","[nor, did, you, take, this, picture, public, p..."
1w1w1w1w1,2016-05-13 08:19:51,"[superhostile, AskReddit, Firearms, battlefiel...","[that, awesome, thanks, vechs, you, just, did,...","[AskReddit, rickandmorty, buildapc, Competitiv...","[well, you, know, you, shouldnt, judge, someon...","[NoMansSkyTheGame, runescape, swordartonline, ...","[yeah, i, agree, with, you, hype, really, does...","[Gameboy, HaloOnline, buildapc, AskReddit, 7da...","[looks, pretty, nice, i, want, to, make, one, ...","[buildapcsales, Gameboy, buildapc, GlobalOffen...",...,[],[],[],[],[],[],[],[],[],[]


In [38]:
td_features_df.to_csv('td_gensubreddit_feats.csv')

#### Preprocess sandersforpresident user data

In [None]:
#Load Data
col_types = {'author': str, 'body':str, 'subreddit':str, 'created_utc':int, 'score': int}
sdf_df = pd.read_csv('./data/user_histories/sfp_userhistdata_000000000000', dtype = col_types)
sdf_df = sdf_df.append(pd.read_csv('./data/user_histories/sfp_userhistdata_000000000001', dtype = col_types), ignore_index=True)

In [None]:
#Change Unix Time to DateTime 
sdf_df['created_utc'] = sdf_df['created_utc'].apply(lambda x: dt.datetime.fromtimestamp(x))
sdf_df.head()

In [9]:
#Get list of authors and thier first post on r/the_donald
first_post_df = sdf_df[sdf_df['subreddit'] == 'SandersForPresident']http://localhost:8888/notebooks/Google%20Drive/2017%20-%20Fall/IS_6742_NLP/reddit_tracking/gen_subreddit_userfeatures.ipynb#
first_post_df = first_post_df.groupby(['author']).min().reset_index()

#Covert this to a dictionary for cleaner use later
first_post_dict =pd.DataFrame(first_post_df['created_utc'])
first_post_dict.index = first_post_df['author']
first_post_dict = first_post_dict.to_dict()['created_utc']

first_post_dict

{'blinktactics': Timestamp('2016-02-02 15:13:13'),
 'superfluousman1994': Timestamp('2016-02-17 11:39:54'),
 'ShieldsUp1124': Timestamp('2016-03-10 00:34:50'),
 'linkz016': Timestamp('2016-02-01 21:10:32'),
 'imperator285': Timestamp('2016-03-29 01:09:12'),
 'ActNaturally': Timestamp('2016-02-01 19:12:31'),
 'tdrules': Timestamp('2016-02-02 12:50:56'),
 'orchidelirium': Timestamp('2016-03-18 12:17:37'),
 'Jhudd5646': Timestamp('2016-06-10 19:51:01'),
 'throwawaythatisnew': Timestamp('2016-04-19 12:05:17'),
 'salad222777': Timestamp('2016-02-04 13:17:50'),
 'user0o7': Timestamp('2016-04-20 08:13:14'),
 'flapsin': Timestamp('2016-02-01 21:14:35'),
 'SammyKingwood': Timestamp('2016-02-04 19:21:00'),
 'Careless_Con': Timestamp('2016-05-10 13:09:55'),
 'mmccaskill': Timestamp('2016-02-26 12:25:38'),
 'flamesodeath69': Timestamp('2016-04-03 00:32:15'),
 'Teodorant1': Timestamp('2016-02-23 12:16:20'),
 'jack9lemmon': Timestamp('2016-02-12 11:24:19'),
 'PotentiallySarcastic': Timestamp('2016-0

In [10]:
from fighting_words_py3 import basic_sanitize 

sdf_features_df = first_post_df.copy()[['created_utc']]
sdf_features_df.index = list(first_post_df.copy()['author'])
for num_months in range(0,48): #48 months - (4 years)

    #Init columns to be filled in 
    sub_colname = 'sub_t' + str(-(num_months + 1)) 
    text_colname = 'text_t' + str(-(num_months + 1)) 

    sdf_features_df.loc[:,sub_colname] = list(first_post_df.copy()['author'])
    sdf_features_df.loc[:,text_colname] = list(first_post_df.copy()['author'])

    
    for author in tqdm(sdf_features_df.index):
        #Init author-specific time frame
        starttime = first_post_dict[author] - dt.timedelta(days = 30 * (num_months + 1))
        endtime = first_post_dict[author] - dt.timedelta(days = 30 * num_months)

        #Filter sdf_df for the above parameters
        filter_sdf_df = sdf_df.copy()[(sdf_df['created_utc'] < endtime) & (sdf_df['created_utc'] >= starttime) & (sdf_df['author'] == author)]

        #Check if filter_sdf_df is empty (no posts made in time frame)
        if len(filter_sdf_df) == 0: /
            sdf_features_df.loc[author,sub_colname] = str([])
            sdf_features_df.loc[author,text_colname] =  str([])
        
        else:
            ###Get where one participated on a subreddit
            sdf_features_df.loc[author,sub_colname] = str(list(filter_sdf_df['subreddit'].unique()))

            ###Get tokenized and preprocessed words
            filter_sdf_df.loc[:,'body'] = filter_sdf_df['body'].apply(lambda x: str(x))
            filter_sdf_df.loc[:,'body'] = filter_sdf_df['body'].apply(lambda x: x.replace(r"(","  ")) 
            filter_sdf_df.loc[:,'body'] = filter_sdf_df['body'].apply(lambda x: x.replace(r")","  ")) 

            #Group data s.t. authors/bodies are grouped together
            filter_sdf_df = filter_sdf_df.groupby('author')['body'].apply(lambda x: "%s" % ' '.join(x)).reset_index()
            #This should result in 1 combined string per person (each person posts one "super post" made up of all their posts)

            #Import and utilize a function from Jack's script which roughly sanitizes input strings
            filter_sdf_df.loc[:,'body'] = filter_sdf_df['body'].apply(lambda x: basic_sanitize(x).split()) 

            #Replace all strings which start with "http" with a marker for hyperlinks
            sdf_features_df.loc[author,text_colname] =  str(list(filter_sdf_df['body'].apply(lambda x: [i if i[0:4] != 'http' else '<HYPERLINK>' for i in x])[0]))
sdf_features_df.to_csv('sdf_gensubreddit_feats.csv')
sdf_features_df.head()

100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [18:11<00:00,  2.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [17:09<00:00,  1.85it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [18:12<00:00,  1.81it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [18:05<00:00,  1.98it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [17:00<00:00,  1.96it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [17:18<00:00,  1.80it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [18:10<00:00,  1.84it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [17:19<00:00,  1.86it/s]
100%|███████████████████████████████████

Unnamed: 0,created_utc,sub_t-1,text_t-1,sub_t-2,text_t-2,sub_t-3,text_t-3,sub_t-4,text_t-4,sub_t-5,...,sub_t-44,text_t-44,sub_t-45,text_t-45,sub_t-46,text_t-46,sub_t-47,text_t-47,sub_t-48,text_t-48
-Malachite,2016-03-16 06:01:07,"['leagueoflegends', 'smashbros']","['lots', 'of', 'people', 'if', 'not', 'everyon...","['smashbros', 'SSBPM', 'treeofsavior', 'league...","['sidebyside', 'unless', 'hbox', 'is', 'playin...","['smashbros', 'Steam']","['to', 'grab', 'in', 'brawl', 'you', 'first', ...","['treeofsavior', 'SSBM']","['should', 'have', 'been', 'named', 'shizuo', ...","['smashbros', 'treeofsavior', 'SSBPM']",...,[],[],[],[],[],[],[],[],[],[]
-SHOCKDIZZLE-,2016-02-21 16:55:34,"['nba', 'baseball', 'nfl', 'warriors', 'BlackP...","['vancouver', 'and', 'seattle', 'we', 'need', ...","['nba', 'nfl', 'baseball', 'warriors', 'gaming...","['not', 'surprised', 'at', 'all', 'that', 'wer...","['baseball', 'nba', 'warriors', 'pcmasterrace'...","['fuck', 'the', 'padres', 'brooks', 'conrads',...","['baseball', 'warriors', 'nba', 'SFGiants', 'p...","['uh', 'what', 'the', 'tomahawk', 'chop', 'is'...","['baseball', 'warriors', 'nba', 'SFGiants']",...,[],[],[],[],[],[],[],[],[],[]
-Seraph,2016-04-02 18:37:20,"['PoliticalDiscussion', 'politics', 'news', 'A...","['he', 'may', 'have', 'gotten', 'cruz', 'over'...","['PoliticalDiscussion', 'politics', 'news', 'g...","['wasnt', 'nevada', 'a', 'caucus', 'i', 'think...","['PoliticalDiscussion', 'politics', 'nottheoni...","['cause', 'you', 'know', 'the', 'man', 'man', ...","['politics', 'StarWars', 'Games', 'funny', 'pi...","['you', 'should', 'read', 'the', 'article', 'i...","['Showerthoughts', 'NoStupidQuestions', 'Crazy...",...,[],[],[],[],[],[],[],[],[],[]
-_Odd_-,2016-02-16 17:27:21,"['TumblrInAction', 'gaming', 'infp', 'Electric...","['and', 'lou', 'the', 'cop', 'shame', 'my', 'a...","['PickAnAndroidForMe', 'chemicalreactiongifs',...","['ive', 'got', 'the', 'pure', 'edition', 'i', ...","['infp', 'cringepics', 'funhaus', 'beards', 'A...","['i', 'stopped', 'being', 'turned', 'on', 'by'...","['TopGear', 'DontPanic', 'EqualAttraction', 'O...","['or', 'they', 'could', 'just', 'have', 'the',...","['WTF', 'Nootropics', 'AskReddit', 'infp', 'fu...",...,[],[],[],[],[],[],[],[],[],[]
-mattybatty-,2016-03-24 19:49:21,"['europe', 'Roadcam', 'Sneakers']","['yeah', 'i', 'mean', 'whenever', 'i', 'fly', ...","['europe', 'TreesSuckingAtThings']","['actually', 'the', 'irs', 'has', 'been', 'goi...","['offmychest', 'europe', 'travel', 'shubreddit...","['i', 'dont', 'think', 'the', 'sarcasm', 'dear...","['cocktails', 'AdviceAnimals', 'HailCorporate'...","['i', 'feel', 'like', 'everyone', 'myself', 'i...","['ChristmasMusic', 'europe', 'nfl', 'Jeopardy'...",...,[],[],[],[],[],[],[],[],[],[]


In [12]:
for col in sdf_features_df.columns:
    if col != 'created_utc':
        sdf_features_df[col] = sdf_features_df[col].apply(lambda x: ast.literal_eval(x))
sdf_features_df

ValueError: malformed node or string: ['leagueoflegends', 'smashbros']

In [16]:
for col in sdf_features_df.columns:
    if col != 'created_utc':
        print(sdf_features_df[col]['-Malachite'])


['leagueoflegends', 'smashbros']
['lots', 'of', 'people', 'if', 'not', 'everyone', 'is', 'having', 'this', 'problem', 'no', 'fix', 'yet', 'to', 'be', 'fair', 'if', 'you', 'played', 'any', 'of', 'sakurais', 'games', 'youd', 'see', 'he', 'havent', 'the', 'vaguest', 'clue', 'what', 'goes', 'into', 'game', 'design', 'especially', 'fighting', 'game', 'design', 'ahri', 'gatou', 'gozaimasu', 'i', 'had', 'this', 'problem', 'go', 'to', 'optionsgtgraphics', 'settings', 'under', 'basic', 'where', 'it', 'says', 'backend', 'theres', 'a', 'dropdown', 'menu', 'change', 'that', 'mine', 'was', 'set', 'to', 'opengl', 'setting', 'it', 'to', 'direct3d', 'fixed', 'it', 'let', 'me', 'know', 'if', 'it', 'helps', 'i', 'can', 'play', 'any', 'role', 'and', 'ive', 'found', 'champions', 'i', 'like', 'in', 'all', 'roles', 'i', 'suggest', 'you', 'do', 'the', 'same']
['smashbros', 'SSBPM', 'treeofsavior', 'leagueoflegends']
['sidebyside', 'unless', 'hbox', 'is', 'playing', 'lucario', 'hey', 'you', 'have', 'the', 've

In [14]:
sdf_features_df.to_csv('sdf_gensubreddit_feats.csv')

#### Generate features for subreddits

In [8]:
feats_df = pd.read_csv('td_gensubreddit_feats.csv', index_col=[0])
feats_df = feats_df.append(pd.read_csv('sdf_gensubreddit_feats.csv', index_col=[0]))

feats_df.index.name = 'author'

In [9]:
#Generate Labels really quickly
labels = [1] * 2000
labels.extend([0] * 2000)

labels = pd.DataFrame(labels, columns =['label'], index = feats_df.index)

labels.to_csv('labels.csv')

In [10]:
#Remove irrelevant features
for i,col in enumerate(feats_df.columns):
    if i % 2 == 0:
        del feats_df[col]
        
feats_df.head()

Unnamed: 0_level_0,sub_t-1,sub_t-2,sub_t-3,sub_t-4,sub_t-5,sub_t-6,sub_t-7,sub_t-8,sub_t-9,sub_t-10,...,sub_t-39,sub_t-40,sub_t-41,sub_t-42,sub_t-43,sub_t-44,sub_t-45,sub_t-46,sub_t-47,sub_t-48
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-Beth-,"['AdviceAnimals', 'wow', 'hearthstone', 'ImGoi...","['CringeAnarchy', 'sadcringe', 'atheism', 'rel...","['cringepics', 'gaming', 'MapPorn', 'CringeAna...","['CringeAnarchy', 'AdviceAnimals', 'gaming', '...","['BlackPeopleTwitter', 'gaming', 'AdviceAnimal...","['rickandmorty', 'news', 'funny', 'bestof']","['creepyPMs', 'videos', 'AdviceAnimals', 'pics...","['funny', 'videos', 'AskReddit', 'gaming', 'da...","['funny', 'announcements', 'AdviceAnimals', 'g...","['funny', 'pics', 'tifu', 'videos', 'TumblrInA...",...,[],[],[],[],[],[],[],[],[],[]
-Oberlander,"['MadeMeSmile', 'explainlikeimfive', 'battlefi...","['explainlikeimfive', 'Showerthoughts', 'chemi...","['chemicalreactiongifs', 'AskReddit', 'Battlef...","['mildlyinteresting', 'tifu', 'Nordiccountries...","['shittyideas', 'AskReddit', 'space', 'mildlyi...","['chemicalreactiongifs', 'battlefield_4', 'shi...","['AskReddit', 'space', 'tf2', 'shittyideas', '...","['AskReddit', 'gadgets', 'news', 'tifu', 'inte...","['KerbalSpaceProgram', 'MadeMeSmile', 'Warship...","['todayilearned', 'Ewwducational', 'shittyasks...",...,[],[],[],[],[],[],[],[],[],[]
-Shank-,"['leagueoflegends', 'news', 'FortWorth', 'Kota...","['leagueoflegends', 'AskReddit', 'Conservative...","['leagueoflegends', 'Games', 'Dallas', 'Kotaku...","['leagueoflegends', 'KotakuInAction', 'funny',...","['KotakuInAction', 'leagueoflegends', 'DotA2',...","['leagueoflegends', 'LeagueOfMeta', 'worldnews...","['DotA2', 'KotakuInAction', 'leagueoflegends',...","['leagueoflegends', 'KotakuInAction', 'nba', '...","['KotakuInAction', 'leagueoflegends', 'CircLoL...","['leagueoflegends', 'CircLoLjerk', 'RiotFreeLo...",...,[],[],[],[],[],[],[],[],[],[]
05banks,"['soccer', 'food', 'AskReddit', 'gifs', 'me_ir...","['funny', 'Gunners', 'AdviceAnimals', 'soccer'...","['worldnews', 'soccer', 'funny', 'BlackPeopleT...","['AskReddit', 'BlackPeopleTwitter', 'Gunners',...","['soccer', 'unitedkingdom', 'AskReddit', 'me_i...","['Gunners', 'todayilearned', 'BlackPeopleTwitt...","['avfc', 'Gunners', 'food', 'unitedkingdom', '...","['Gunners', 'unitedkingdom', 'avfc', 'soccer',...","['soccer', 'Gunners', 'avfc', 'unitedkingdom',...","['avfc', 'unitedkingdom', 'soccer', 'AskReddit...",...,[],[],[],[],[],[],['trees'],[],[],[]
14000_calories_later,"['GetMotivated', 'ImGoingToHellForThis', 'spor...","['GetMotivated', 'television', 'gifs', 'pics',...","['personalfinance', 'GetMotivated', 'listentot...","['UpliftingNews', 'funny', 'movies', 'pics', '...","['movies', 'sports']","['AskReddit', 'Fitness', 'videos', 'gifs', 'te...","['AskReddit', 'todayilearned', 'gifs', 'Fitnes...","['pics', 'food', 'videos', 'todayilearned', 'g...","['sports', 'justneckbeardthings']","['sports', 'movies', 'gifs', 'Showerthoughts',...",...,[],[],[],[],[],[],[],[],[],[]


In [11]:
#Transform strings back into lists

for col in feats_df.columns:
    feats_df[col] = feats_df[col].apply(lambda x: ast.literal_eval(x))
feats_df.head(2)

Unnamed: 0_level_0,sub_t-1,sub_t-2,sub_t-3,sub_t-4,sub_t-5,sub_t-6,sub_t-7,sub_t-8,sub_t-9,sub_t-10,...,sub_t-39,sub_t-40,sub_t-41,sub_t-42,sub_t-43,sub_t-44,sub_t-45,sub_t-46,sub_t-47,sub_t-48
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-Beth-,"[AdviceAnimals, wow, hearthstone, ImGoingToHel...","[CringeAnarchy, sadcringe, atheism, relationsh...","[cringepics, gaming, MapPorn, CringeAnarchy, S...","[CringeAnarchy, AdviceAnimals, gaming, pcmaste...","[BlackPeopleTwitter, gaming, AdviceAnimals, vi...","[rickandmorty, news, funny, bestof]","[creepyPMs, videos, AdviceAnimals, pics, WTF, ...","[funny, videos, AskReddit, gaming, dataisbeaut...","[funny, announcements, AdviceAnimals, gaming, ...","[funny, pics, tifu, videos, TumblrInAction, Ko...",...,[],[],[],[],[],[],[],[],[],[]
-Oberlander,"[MadeMeSmile, explainlikeimfive, battlefield_4...","[explainlikeimfive, Showerthoughts, chemicalre...","[chemicalreactiongifs, AskReddit, Battlefield,...","[mildlyinteresting, tifu, Nordiccountries, shi...","[shittyideas, AskReddit, space, mildlyinterest...","[chemicalreactiongifs, battlefield_4, shittyas...","[AskReddit, space, tf2, shittyideas, FloridaMa...","[AskReddit, gadgets, news, tifu, interestingas...","[KerbalSpaceProgram, MadeMeSmile, WarshipPorn,...","[todayilearned, Ewwducational, shittyaskscienc...",...,[],[],[],[],[],[],[],[],[],[]


In [13]:
feats_dict = dict()
for auth in tqdm(feats_df.index):
    feats_dict[auth] = dict()  
    for col in feats_df.columns:
        feats_dict[auth][col] = feats_df.loc[auth,col]
        


100%|████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:02<00:00, 1536.60it/s]


In [15]:
import pickle

pickle.dump(feats_dict, open('feats_dict.p','wb')) 

In [17]:
#Get a unique list of subreddits used 
sub_list = []
for col in feats_df.columns:
    subs = []
    for row in feats_df.index:
        subs.extend(feats_df.loc[row,col])
    sub_list.append(list(set(subs)))
    
list_lengths = np.cumsum([len(i) for i in sub_list], dtype='int64')
list_length_dict = dict()
for i,total in enumerate(list_lengths):
    list_length_dict[total] = i+1

list_length_dict

{6018: 1,
 11684: 2,
 17426: 3,
 23072: 4,
 28616: 5,
 34108: 6,
 39610: 7,
 45171: 8,
 50598: 9,
 55984: 10,
 61349: 11,
 66592: 12,
 71704: 13,
 76556: 14,
 81360: 15,
 86006: 16,
 90442: 17,
 94958: 18,
 99322: 19,
 103613: 20,
 107807: 21,
 111858: 22,
 115871: 23,
 119741: 24,
 123646: 25,
 127308: 26,
 130806: 27,
 134115: 28,
 137302: 29,
 140337: 30,
 143361: 31,
 146284: 32,
 149165: 33,
 152071: 34,
 154938: 35,
 157769: 36,
 160504: 37,
 163151: 38,
 165636: 39,
 167973: 40,
 170235: 41,
 172386: 42,
 174498: 43,
 176538: 44,
 178535: 45,
 180437: 46,
 182233: 47,
 183981: 48}

In [19]:
total_cols = sum([len(sub_list[i]) for i in range(48)])
subfeats_df = np.zeros((4000,total_cols))

sub_cols = []
for i,subreddittime in enumerate(sub_list):
    sub_cols.extend([sub + '_t-' + str(i+1) for sub in subreddittime])

#subfeats_df.columns = sub_cols
#subfeats_df.index = feats_df.index
#subfeats_df.head()

0.0

In [23]:
for j,subtime in tqdm(enumerate(sub_cols)):
    #Fix subreddit name
    sub = subtime.split('_t-')[0]
    
    #Get reference column name
    refcol = 'sub' + subtime[len(subtime.split('_t-')[0]):]
    
    column = []
    #Check if subreddit found in reference column for each author and put into subfeats_df
    for i,auth in enumerate(feats_df.index):
         subfeats_df[i][j] = feats_dict[auth][refcol].count(sub)

183981it [15:15, 200.98it/s]


In [27]:
pickle.dump(subfeats_df, open('subreddit_feats.p','wb')) 

OverflowError: cannot serialize a bytes object larger than 4 GiB

In [24]:
pickle.dump(sub_cols, open('subreddit_feats.p','wb')) 

(4000, 183981)

In [26]:
len(sub_cols)

183981