In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, AdaBoostClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBRegressor
import lightgbm as lgb

In [2]:
training_data = pd.read_csv('full_data.csv', header=None, names=['Username', 'Keywords', 'Subreddits'])
training_data = training_data.iloc[:9000]

In [3]:
training_data

Unnamed: 0,Username,Keywords,Subreddits
0,jason8001,"insurance company,insurance,company,pay,back,n...","Insurance,AskReddit,Militaryfaq,legaladvice,Ve..."
1,sergelo,"musk has huge,people,part,make,huge motivation...","funny,WTF,nba,Minecraft,pics,AdviceAnimals,tod..."
2,whitmanlands,"people,white people,white,reddit,women,black p...","ShitRedditSays,TheBluePill,justneckbeardthings..."
3,longgamma,"n’t,people,india,yeah,lot,good,show,point,fuck...","india,TrueReddit,pcmasterrace,politics,asoiaf,..."
4,nerd42,"love love love,love,day,time,lot,love love,thi...","TrollXChromosomes,AskReddit,teenagers,adventur..."
5,woodlark14,"time,make,people,supreme commander,makes,kill,...","AskReddit,whowouldwin,CharacterRant,feedthebea..."
6,decaboniized,"people,game,trump,games,shit,idea josh,fuck,ti...","politics,trees,nba,videos,pcgaming,buildapc,PU..."
7,thespoil,"fallout,power,vault,people,emperor,ncr,make,ea...","Warhammer40k,falloutlore,40kLore,Warhammer,the..."
8,pissonreddit,"wife,shit,back,day,friend,days,people,time,pho...","AskReddit,todayilearned,Showerthoughts,StonerP..."
9,bitchperfect2,"time,people,n’t,made,back,pregnant,found,year,...","AskReddit,AskWomen,antiMLM,news,Tinder,offmych..."


In [4]:
training_data = training_data.drop('Username', axis=1)

In [5]:
training_data = training_data.dropna()

In [6]:
training_data['Keywords'] = training_data['Keywords'].apply(lambda x: x.split(','))
training_data['Subreddits'] = training_data['Subreddits'].apply(lambda x: x.split(','))

In [7]:
training_data['Keywords'] = training_data['Keywords'].apply(lambda x: x[:100])

In [8]:
len(training_data['Keywords'][0])

100

In [9]:
training_data['Subreddits'] = training_data['Subreddits'].apply(lambda x: x[:10])
print(training_data)

                                               Keywords  \
0     [insurance company, insurance, company, pay, b...   
1     [musk has huge, people, part, make, huge motiv...   
2     [people, white people, white, reddit, women, b...   
3     [n’t, people, india, yeah, lot, good, show, po...   
4     [love love love, love, day, time, lot, love lo...   
5     [time, make, people, supreme commander, makes,...   
6     [people, game, trump, games, shit, idea josh, ...   
7     [fallout, power, vault, people, emperor, ncr, ...   
8     [wife, shit, back, day, friend, days, people, ...   
9     [time, people, n’t, made, back, pregnant, foun...   
10    [people, coach bruce arians, tom brady fightin...   
11    [time, people, make, things, good, reddit, lov...   
12    [play, game, theme is horrible, myself, new th...   
13    [unzipped full site, stupidly small, never hea...   
14    [god, islam, muslim, people, world, religion, ...   
15    [game, games, people, good, dark souls, souls,... 

In [10]:
features_list = []
labels_list = []
for i, row in training_data.iterrows():
    if len(row['Keywords']) == 100 and len(row['Subreddits']) == 10:
        features_list.append({'Keywords': row['Keywords']})
        labels_list.append({'Subreddits': row['Subreddits']})
training_features = pd.DataFrame(features_list)
training_labels = pd.DataFrame(labels_list)

In [11]:
training_labels = training_labels['Subreddits']

In [12]:
training_features

Unnamed: 0,Keywords
0,"[insurance company, insurance, company, pay, b..."
1,"[musk has huge, people, part, make, huge motiv..."
2,"[people, white people, white, reddit, women, b..."
3,"[n’t, people, india, yeah, lot, good, show, po..."
4,"[love love love, love, day, time, lot, love lo..."
5,"[time, make, people, supreme commander, makes,..."
6,"[people, game, trump, games, shit, idea josh, ..."
7,"[fallout, power, vault, people, emperor, ncr, ..."
8,"[wife, shit, back, day, friend, days, people, ..."
9,"[time, people, n’t, made, back, pregnant, foun..."


In [13]:
training_features.shape

(8703, 1)

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
one_hot = MultiLabelBinarizer()
training_labels = one_hot.fit_transform(training_labels)

In [14]:
training_features = training_features['Keywords']
one_hot_features = MultiLabelBinarizer()
training_features = one_hot_features.fit_transform(training_features)
training_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
train_features1, test_features1, train_labels1, test_labels1 = train_test_split(training_features, training_labels, test_size = 0.25, random_state=0, shuffle=False)

In [18]:
train_features1.shape
train_labels1.shape

(19630, 11323)

In [None]:
dtc = DecisionTreeClassifier()
dtc.fit(train_features1, train_labels1)

In [19]:
test_labels = dtc.predict(test_features1)
one_hot.inverse_transform(test_labels)

[('AskNYC',
  'CatastrophicFailure',
  'CityPorn',
  'HistoryPorn',
  'WTF',
  'baseball',
  'boston',
  'nyc',
  'pics',
  'videos'),
 ('AskReddit',
  'Guildwars2',
  'PoliticalDiscussion',
  'europe',
  'iamverysmart',
  'news',
  'nottheonion',
  'politics',
  'relationships',
  'worldnews'),
 ('ABraThatFits',
  'AskReddit',
  'HIMYM',
  'ShitCosmoSays',
  'TrollXChromosomes',
  'TrollXWeddings',
  'gamegrumps',
  'gameofthrones',
  'jobs',
  'pics'),
 ('AdviceAnimals',
  'AskReddit',
  'LateStageCapitalism',
  'WTF',
  'buildapc',
  'funny',
  'pics',
  'raisedbynarcissists',
  'relationships',
  'todayilearned'),
 ('Aquariums',
  'AskReddit',
  'Jokes',
  'funny',
  'gaming',
  'gardening',
  'lotr',
  'pics',
  'proper',
  'stopdrinking'),
 ('Music',
  'Tinder',
  'asoiaf',
  'drums',
  'gameofthrones',
  'leagueoflegends',
  'pokemongo',
  'relationships',
  'summonerschool',
  'thewalkingdead'),
 ('BleachBraveSouls',
  'DestinyFashion',
  'DestinyTheGame',
  'PokemonPlaza',
  '

In [20]:
test_features_unencoded = one_hot_features.inverse_transform(test_features1)

In [24]:
test_features_unencoded

[('agree',
  'agreement',
  'ancap',
  'apparently',
  'area',
  'arguments',
  'article',
  'authority rewiring',
  'book',
  'california',
  'california correctional supervisors',
  'california district attorneys',
  'california police chiefs',
  'california state',
  'california state sheriffs',
  'case',
  'claim',
  'colorado',
  'communists',
  'completely',
  'crime',
  'end',
  'europa league michael',
  'fact',
  'federal',
  'find',
  'free',
  'fucking',
  'fucking things',
  'god',
  'good',
  'government',
  'group',
  'historical',
  'historical revolutions',
  'huemer',
  'huge',
  'idea',
  'ideas',
  'kind',
  'land',
  'law',
  'league',
  'league michael',
  'league michael huemer',
  'leave',
  'libertarians',
  'live',
  'lives',
  'long',
  'lot',
  'love',
  'made',
  'make',
  'mentions private prisons',
  'michael huemer',
  'money',
  'nice',
  'n’t',
  'part',
  'pay',
  'people',
  'people call obama',
  'person',
  'point',
  'police',
  'power',
  'pretty'

In [21]:
test_labels_unencoded = one_hot.inverse_transform(test_labels)

In [22]:
test_labels1_unencoded = one_hot.inverse_transform(test_labels1)

In [25]:
data = {'Keywords':test_features_unencoded, 'Subreddits Commented In': test_labels1_unencoded, 'SubSuggestions':test_labels_unencoded} 
subsuggestions_csv = pd.DataFrame(data)

In [28]:
test_features_unencoded

[('',
  'animals',
  'bernie',
  'bernie sanders',
  'call',
  'clinton',
  'comments',
  'edit',
  'england',
  'fuck',
  'fucking',
  'games',
  'good',
  'gove',
  'harry potter',
  'hillary',
  'hillary clinton',
  'jeremeo',
  'life',
  'love',
  'make',
  'man',
  'mildly frustrating',
  'nove',
  'people',
  'point',
  'portillo',
  'potter',
  'south carolina',
  'star',
  'star wars',
  'til',
  'time',
  'time hillary steals',
  'win',
  'world',
  'yeah',
  'years'),
 ('',
  'american',
  'american people',
  'americans',
  'black',
  'black friday',
  'cards',
  'country',
  'deck',
  'democrats',
  'draft',
  'finally received',
  'find',
  'government',
  'great',
  'great things trump',
  'hear president trump',
  'living guildpact find',
  'make',
  'make american great',
  'matches',
  'mazda',
  'optimized for yahoo',
  'parking',
  'people',
  'playoffs bitch',
  'president',
  'president trump',
  'president trump speak',
  'republicans',
  'sacramento state',
  'sn

In [26]:
subsuggestions_csv.to_csv('SubSuggestions_multilabelencoding.csv', encoding='utf-8', index = False)

In [21]:
import pickle
filename = 'multilabelbinarizer_features.sav'
joblib.dump(one_hot_features, open(filename, 'wb'))

In [22]:
import pickle
filename = 'multilabelbinarizer_labels.sav'
joblib.dump(one_hot, open(filename, 'wb'))

In [20]:
import joblib
filename = 'decision_tree_model2.sav'
joblib.dump(dtc, open(filename, 'wb'))