# <center>The Data

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from IPython.display import Image, display
from IPython.core.display import HTML 

In [3]:
path = '../../../data/eskirk/data/'
files = !ls ../../../data/eskirk/data
pd.DataFrame(files[:6])

Unnamed: 0,0
0,Bern.csv
1,Bernie.csv
2,Clinton.csv
3,Donald.csv
4,Hillary.csv
5,Trump.csv


# <center>Parsing the Files

In [4]:
# Parser options shared by every CSV export.
read_opts = dict(encoding='ISO-8859-1', sep=',')

# Load each export by file name instead of by its position in `files`.
# The positional version bound `trump` to Donald.csv (files[3]) and `donald`
# to Trump.csv (files[5]), and silently relied on the directory sort order.
bern = pd.read_csv(path + 'Bern.csv', low_memory=True, **read_opts)
bernie = pd.read_csv(path + 'Bernie.csv', low_memory=True, **read_opts)
clinton = pd.read_csv(path + 'Clinton.csv', low_memory=True, **read_opts)
donald = pd.read_csv(path + 'Donald.csv', low_memory=True, **read_opts)
# NOTE(review): only this file is parsed with low_memory=False; presumably it
# was the one triggering a mixed-dtype warning -- confirm.
hillary = pd.read_csv(path + 'Hillary.csv', low_memory=False, **read_opts)
trump = pd.read_csv(path + 'Trump.csv', low_memory=True, **read_opts)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Fields kept from every export; all other columns are dropped here.
columns = ['timestamp', 'link', 'caption', 'network', 'likes']

# Apply the same column subset to each of the six per-file frames in one pass.
trump, donald, bernie, bern, clinton, hillary = (
    frame[columns] for frame in (trump, donald, bernie, bern, clinton, hillary)
)

# <center>Tidying the Data

In [6]:
# Stack the two source frames that belong to each candidate and tag every row
# with that candidate's label.
clinton_memes = pd.concat([hillary, clinton]).assign(candidate='clinton')
sanders_memes = pd.concat([bernie, bern]).assign(candidate='sanders')
trump_memes = pd.concat([donald, trump]).assign(candidate='trump')

In [7]:
def _flatten_captions(captions):
    """Return the captions as a list, with every literal backslash-n
    (the two characters ``\\`` and ``n``) replaced by a single space."""
    return [text.replace('\\n', ' ') for text in captions]


capts1 = _flatten_captions(clinton_memes['caption'])
capts2 = _flatten_captions(sanders_memes['caption'])
capts3 = _flatten_captions(trump_memes['caption'])

# Plain lists are assigned positionally, so row order is what matters here.
clinton_memes['caption'] = capts1
sanders_memes['caption'] = capts2
trump_memes['caption'] = capts3

In [8]:
# Stack the three per-candidate frames, then discard rows that are exact
# duplicates across all columns.
combined_memes = pd.concat([clinton_memes, sanders_memes, trump_memes])
all_memes = combined_memes.drop_duplicates()
# Narrow view with just the like count and the two categorical labels.
memes_and_likes = all_memes.loc[:, ['likes', 'candidate', 'network']]
all_memes.shape

(35382, 6)

In [9]:
# Sanity-check the column dtypes: likes must be 64-bit integers, the other
# three columns must be generic Python objects.
for column_name, expected_dtype in [('likes', 'int64'),
                                    ('candidate', 'O'),
                                    ('network', 'O'),
                                    ('caption', 'O')]:
    assert all_memes[column_name].dtype == expected_dtype

# <center>The Clean DataFrame

In [10]:
# Show only the first rows; the full frame has tens of thousands of rows and
# rendering all of it adds nothing to the narrative.
all_memes.head(10)

Unnamed: 0,timestamp,link,caption,network,likes,candidate
0,8/1/16 20:56,http://imgur.com/a/lNHED,"Son killed by country, in a in a war starte...",imgur,-20,clinton
1,9/5/16 13:51,http://imgur.com/a/Cm6ls,"TRUMP LOVES TRUMP, AND DOES WHAT HIS FOLLOWER...",imgur,-17,clinton
2,7/27/16 0:54,http://imgur.com/a/FRGrq,1h Why end racism when we can just end the c...,imgur,-16,clinton
3,7/8/16 4:12,http://imgur.com/a/zF2Bw,Do jokes about not noticing someone because t...,imgur,-16,clinton
4,10/20/16 7:34,http://imgur.com/a/giimL,HILLARY CLINTON SAYS YOU CAN KILL A BABY THE ...,imgur,-14,clinton
5,10/22/16 5:10,http://imgur.com/a/tPNT6,I Liked p Message Save More A Write a comme...,imgur,-13,clinton
6,9/24/16 21:13,http://imgur.com/6wA1BRb,Hillary couldn't even take care of her husband...,imgur,-12,clinton
7,7/26/16 19:45,http://imgur.com/a/R5Iqq,Yesterday at 09:15. There will never be a con...,imgur,-12,clinton
8,10/20/16 13:15,http://imgur.com/tz9ej1D,LIVE LIVE DOVALLO TRE IS HILLARY (Al ATO Su...,imgur,-11,clinton
9,6/8/16 18:43,http://imgur.com/IpprlC4,AMI THE ONY ONE THAT REALIZES WHAT WILL HAPPE...,imgur,-11,clinton


# <center>Create Dummies for Classification

In [11]:
# Distinct network labels, in order of first appearance, as a plain list.
networks = all_memes['network'].unique().tolist()
networks

['imgur', 'facebook', 'instagram', 'twitter']

In [12]:
# The two categorical label columns to be one-hot encoded.
network, candidate = all_memes['network'], all_memes['candidate']

# One indicator column per distinct value, named '<prefix>_<value>'.
network_dummie = pd.get_dummies(data=network, prefix='network')
candidate_dummie = pd.get_dummies(data=candidate, prefix='candidate')

In [13]:
# Keep a subset of the indicator columns. The columns that are not selected
# (e.g. network_imgur, candidate_sanders) are implied when every kept
# indicator of that group is 0.
candidate_dummie = candidate_dummie[['candidate_clinton', 'candidate_trump']]
network_dummie = network_dummie[['network_instagram', 'network_twitter', 'network_facebook']]

# Strip any indicator columns left over from a previous run of this cell, so
# re-running it does not append duplicate columns to all_memes.
dummy_columns = list(network_dummie.columns) + list(candidate_dummie.columns)
base_columns = [col for col in all_memes.columns if col not in dummy_columns]
all_memes = pd.concat([all_memes[base_columns], network_dummie, candidate_dummie], axis=1)

# Preview only the first rows instead of rendering the whole frame.
all_memes.head(10)

Unnamed: 0,timestamp,link,caption,network,likes,candidate,network_instagram,network_twitter,network_facebook,candidate_clinton,candidate_trump
0,8/1/16 20:56,http://imgur.com/a/lNHED,"Son killed by country, in a in a war starte...",imgur,-20,clinton,0,0,0,1,0
1,9/5/16 13:51,http://imgur.com/a/Cm6ls,"TRUMP LOVES TRUMP, AND DOES WHAT HIS FOLLOWER...",imgur,-17,clinton,0,0,0,1,0
2,7/27/16 0:54,http://imgur.com/a/FRGrq,1h Why end racism when we can just end the c...,imgur,-16,clinton,0,0,0,1,0
3,7/8/16 4:12,http://imgur.com/a/zF2Bw,Do jokes about not noticing someone because t...,imgur,-16,clinton,0,0,0,1,0
4,10/20/16 7:34,http://imgur.com/a/giimL,HILLARY CLINTON SAYS YOU CAN KILL A BABY THE ...,imgur,-14,clinton,0,0,0,1,0
5,10/22/16 5:10,http://imgur.com/a/tPNT6,I Liked p Message Save More A Write a comme...,imgur,-13,clinton,0,0,0,1,0
6,9/24/16 21:13,http://imgur.com/6wA1BRb,Hillary couldn't even take care of her husband...,imgur,-12,clinton,0,0,0,1,0
7,7/26/16 19:45,http://imgur.com/a/R5Iqq,Yesterday at 09:15. There will never be a con...,imgur,-12,clinton,0,0,0,1,0
8,10/20/16 13:15,http://imgur.com/tz9ej1D,LIVE LIVE DOVALLO TRE IS HILLARY (Al ATO Su...,imgur,-11,clinton,0,0,0,1,0
9,6/8/16 18:43,http://imgur.com/IpprlC4,AMI THE ONY ONE THAT REALIZES WHAT WILL HAPPE...,imgur,-11,clinton,0,0,0,1,0


# <center>Create Training and Testing Data

In [14]:
# train_test_split moved to sklearn.model_selection in scikit-learn 0.18, and
# sklearn.cross_validation was removed in 0.20. Prefer the current location
# and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

# Model input (the caption text) and the two label columns to be predicted.
captions = all_memes['caption']
network = all_memes['network']
candidate = all_memes['candidate']



> Split the data into two separate training and testing sets: one for predicting the network and one for predicting the candidate.

In [15]:
# Network training and testing sets
# commented out for presentation
#
# Both calls below split the same captions 70/30 (test_size = 0.3), once
# against the network labels and once against the candidate labels.
# NOTE(review): no random_state is passed, so un-commenting these lines would
# draw a different random split on every run.

# Xn_train, Xn_test, yn_train, yn_test = train_test_split(captions, network, test_size = 0.3)

# Candidate training and testing sets
# Xc_train, Xc_test, yc_train, yc_test = train_test_split(captions, candidate, test_size = 0.3)

In [16]:
# Disabled: writes each of the eight train/test arrays to its own file under
# terrible_memes/ (np.save appends the '.npy' extension to the given name).

# Training halves (captions X and labels y; c = candidate, n = network)
# np.save(file = 'terrible_memes/Xc_train', arr = Xc_train)
# np.save(file = 'terrible_memes/yc_train', arr = yc_train)
# np.save(file = 'terrible_memes/Xn_train', arr = Xn_train)
# np.save(file = 'terrible_memes/yn_train', arr = yn_train)

# Testing halves
# np.save(file = 'terrible_memes/Xc_test', arr = Xc_test)
# np.save(file = 'terrible_memes/yc_test', arr = yc_test)
# np.save(file = 'terrible_memes/Xn_test', arr = Xn_test)
# np.save(file = 'terrible_memes/yn_test', arr = yn_test)

In [17]:
def _load_series(npy_path):
    """Load one saved .npy array and wrap it in a pandas Series.

    allow_pickle=True is passed explicitly because NumPy >= 1.16.3 refuses to
    load object-dtype arrays without it (the saved captions/labels are
    presumably Python strings, i.e. object dtype -- confirm).
    Unpickling can execute arbitrary code, so only point this at files you
    created yourself.
    """
    return pd.Series(np.load(file=npy_path, allow_pickle=True))


# Captions (X) and labels (y) for the network (n) and candidate (c) models.
Xn_train = _load_series('terrible_memes/Xn_train.npy')
yn_train = _load_series('terrible_memes/yn_train.npy')
Xc_train = _load_series('terrible_memes/Xc_train.npy')
yc_train = _load_series('terrible_memes/yc_train.npy')
Xc_test = _load_series('terrible_memes/Xc_test.npy')
yc_test = _load_series('terrible_memes/yc_test.npy')
Xn_test = _load_series('terrible_memes/Xn_test.npy')
yn_test = _load_series('terrible_memes/yn_test.npy')




# <center>Create a Pipeline for the Model
> For this model, I will use a TF-IDF vectorizer to turn each caption into weighted word features, and a pipeline that feeds those features into a random forest classifier.

In [18]:
# Disabled model definitions. Each pipeline chains a lower-casing
# TfidfVectorizer into a RandomForestClassifier with class_weight='balanced'.
# NOTE(review): only the candidate model fixes random_state (0); the network
# model is unseeded, so refitting it would not be reproducible -- confirm
# whether that is intended.

# network_forest_model = make_pipeline(TfidfVectorizer(lowercase=True), 
#                                      RandomForestClassifier(class_weight='balanced'))

# candidate_forest_model = make_pipeline(TfidfVectorizer(lowercase=True, analyzer='word'), 
#                                        RandomForestClassifier(class_weight='balanced', 
#                                        random_state=0))

> Fit the candidate and network models to the data

In [19]:
# as exciting as it is to wait 3 minutes to fit the data to my model, let's skip over that
#
# NOTE(review): the candidate line calls fit_transform while the network line
# calls fit. fit_transform on a pipeline needs the final step to support
# transform, which a classifier normally does not -- `.fit(Xc_train, yc_train)`
# is probably what was meant; confirm before re-enabling.

# network_forest_model.fit(Xn_train, yn_train)
# candidate_forest_model.fit_transform(Xc_train, yc_train)

> Load the previously saved models (fitting the models takes a loooong time)

In [20]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Use the standalone joblib package when it is available and fall back
# to the vendored copy on older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# save the model to a file
# joblib.dump(candidate_forest_model, 'terrible_memes/candidate.pkl') 
# joblib.dump(network_forest_model, 'terrible_memes/network.pkl') 

In [21]:
# Restore both fitted pipelines from disk. joblib files are pickles, so only
# load files that come from a trusted source.
candidate_forest_model, network_forest_model = map(
    joblib.load,
    ('terrible_memes/candidate.pkl', 'terrible_memes/network.pkl'),
)

# <center>Categorize the Test Data

In [22]:
# Predict a candidate for every held-out caption, then score the predictions
# against the true labels.
yc_forest_predict = candidate_forest_model.predict(Xc_test)
candidate_accuracy = accuracy_score(yc_test, yc_forest_predict)
print('I can predict the candidate with an accuracy of:\n', candidate_accuracy)

I can predict the candidate with an accuracy of:
 0.773339613754


In [23]:
# Predict a network for every held-out caption, then score the predictions
# against the true labels.
yn_predict = network_forest_model.predict(Xn_test)
network_accuracy = accuracy_score(yn_test, yn_predict)
print('I can predict the network with an accuracy of:\n', network_accuracy)

I can predict the network with an accuracy of:
 0.766462552991


> What exactly is the output here?

In [24]:
# First ten predicted candidate labels, shown as a one-column table.
pd.DataFrame(yc_forest_predict[:10])

Unnamed: 0,0
0,trump
1,trump
2,clinton
3,trump
4,sanders
5,clinton
6,trump
7,trump
8,trump
9,trump


In [25]:
# First ten true candidate labels, for comparison with the predictions.
pd.DataFrame(yc_test.iloc[:10])

Unnamed: 0,0
0,trump
1,trump
2,clinton
3,trump
4,sanders
5,clinton
6,trump
7,trump
8,trump
9,trump


# <center>Candidate Feature Importances

> Now comes the interesting part

In [26]:
from operator import itemgetter

# Pull the two fitted pipeline steps out once instead of once per attribute.
candidate_forest = candidate_forest_model.named_steps['randomforestclassifier']
candidate_tfidf = candidate_forest_model.named_steps['tfidfvectorizer']

importance = candidate_forest.feature_importances_
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when the installed version has it, and fall back to the old name otherwise.
if hasattr(candidate_tfidf, 'get_feature_names_out'):
    feats = list(candidate_tfidf.get_feature_names_out())
else:
    feats = candidate_tfidf.get_feature_names()
vocab = candidate_tfidf.vocabulary_
# Materialise the (term, importance) pairs: a bare zip() is a one-shot
# iterator and would be left empty in the namespace after sorting.
important_features = list(zip(feats, importance))

# Pairs ordered from most to least important. Sorting ascending and then
# reversing keeps the same order among equal importances as before.
candidate_importances = sorted(important_features, key=itemgetter(1))[::-1]

> Network Feature Importances

In [27]:
# Pull the two fitted pipeline steps out once instead of once per attribute.
ntwrk_forest = network_forest_model.named_steps['randomforestclassifier']
ntwrk_tfidf = network_forest_model.named_steps['tfidfvectorizer']

ntwrk_importance = ntwrk_forest.feature_importances_
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when the installed version has it, and fall back to the old name otherwise.
if hasattr(ntwrk_tfidf, 'get_feature_names_out'):
    ntwrk_feats = list(ntwrk_tfidf.get_feature_names_out())
else:
    ntwrk_feats = ntwrk_tfidf.get_feature_names()
ntwrk_vocab = ntwrk_tfidf.vocabulary_
# Materialise the (term, importance) pairs: a bare zip() is a one-shot
# iterator and would be left empty in the namespace after sorting.
ntwrk_important_features = list(zip(ntwrk_feats, ntwrk_importance))

# Pairs ordered from most to least important. Sorting ascending and then
# reversing keeps the same order among equal importances as before.
network_importances = sorted(ntwrk_important_features, key=itemgetter(1))[::-1]

> Importance of Features per Candidate

In [29]:
# Sample phrases to run through the candidate model by hand.
phrase = ['clinton', 'trump', 'bird', 'make america great again', 'her emails']

# Drive the two pipeline steps separately: vectorize the phrases with the
# fitted TF-IDF step, then ask the forest for per-class probabilities.
vectorizer_step = candidate_forest_model.named_steps['tfidfvectorizer']
classifier_step = candidate_forest_model.named_steps['randomforestclassifier']

vectorized = vectorizer_step.transform(phrase)
classes = classifier_step.classes_
probs = classifier_step.predict_proba(vectorized)
# One row of probabilities per phrase, with columns in the order of `classes`.
print(classes, '\n', probs)

['clinton' 'sanders' 'trump'] 
 [[ 1.   0.   0. ]
 [ 0.   0.   1. ]
 [ 0.3  0.4  0.3]
 [ 0.   0.   1. ]
 [ 0.7  0.2  0.1]]


In [30]:
# Tabulate the network model's (term, importance) pairs and show the top ten.
df = pd.DataFrame(network_importances, columns=['phrase', 'importance'])
df.head(10)

Unnamed: 0,phrase,importance
0,trump,0.017002
1,donald,0.011886
2,hillary,0.009592
3,clinton,0.008638
4,the,0.008583
5,to,0.005763
6,on,0.005407
7,you,0.005343
8,for,0.005303
9,when,0.004893


In [31]:
# Tabulate the candidate model's (term, importance) pairs and show the top ten.
df2 = pd.DataFrame(candidate_importances, columns=['phrase', 'importance'])
df2.head(10)

Unnamed: 0,phrase,importance
0,bernie,0.109171
1,hillary,0.062727
2,trump,0.052319
3,clinton,0.044384
4,donald,0.031546
5,sanders,0.014464
6,bern,0.008739
7,matter,0.008017
8,the,0.0053
9,compare,0.00519


# [03-Visualizations](03-Visualizations.ipynb)