In [39]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import html5lib
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix

In [40]:
def eval_con_mat(matrix):
    #use confusion matrix to compute accuracy (good calls divided by all calls made)
    print('Accuracy of {}%'.format(round(100*
                                         (matrix[1][1]+matrix[0][0])/
                                         (matrix[1][0]+matrix[0][1]+matrix[1][1]+matrix[0][0])),2))
    

    print('Sensitivity of {}%'.format(round(100*matrix[1][1]/(matrix[1][1] + matrix[1][0]),2)))
    print('Specificity of {}%'.format(round(100*matrix[0][0]/(matrix[0][0] + matrix[0][1]),2)))
    

In [41]:
#Read in a list of 2000+ positive words
pos_words_loc = ('https://github.com/gurkpet/Thinkful-Lessons/blob/master/'
             'Thinkful%202.2.7-Naive%20Bayes/Word%20Lists/positive-words.txt')
pos_words_list = pd.read_html(pos_words_loc, skiprows=35)

#clean up the read in by removing the first column of junk data
for row in pos_words_list:
   del row[0]    

#convert list into dataframe
pos_words = pd.DataFrame(pos_words_list[0])

In [42]:
list_pos_words = list(pos_words[1])

In [43]:
#read in amazon file
amzn_file_loc = ('https://raw.githubusercontent.com/gurkpet/Thinkful-Lessons/master/'
            'Thinkful%202.2.7-Naive%20Bayes/amazon_cells_labelled.txt')

df = pd.read_csv(amzn_file_loc, delimiter= '\t', header=None)
#rename column headers
amzn_dat = df.copy()
amzn_dat.columns = ['review', 'positive_review']
amzn_target = amzn_dat['positive_review']

In [44]:
for key in list_pos_words:
    amzn_dat[str(key)] = amzn_dat.review.str.contains(str(key), case = False)
amzn_dat['positive_review'] = (amzn_dat['positive_review'] == 1)
data_amzn = amzn_dat[list_pos_words]
amzn_target = amzn_dat['positive_review']

<h1> Iteration 1 - Train and test entire dataset with positive keywords</h1>

In [45]:
bnb = BernoulliNB()
bnb.fit(data_amzn, amzn_target)
pred = bnb.predict(data_amzn)
con_mat = confusion_matrix(amzn_target, pred)
eval_con_mat(con_mat)

Accuracy of 83.0%
Sensitivity of 75.8%
Specificity of 91.0%


<h1> Iteration 2- Cross validation of positive keywords</h1>

In [46]:
#First iteration: 3 folds
amzn_fold1 = data_amzn[:333]
amzn_fold2 = data_amzn[333:666]
amzn_fold3 = data_amzn[666:]

#corresponding targets for 3 folds
amzn_fold1_targ = amzn_dat['positive_review'][:333]
amzn_fold2_targ = amzn_dat['positive_review'][333:666]
amzn_fold3_targ = amzn_dat['positive_review'][666:]

In [47]:
bnb_fold1 = bnb.fit(amzn_fold1, amzn_fold1_targ)

<h4>Using fold 1 as the training data and then test folds 2 and 3</h4>

In [48]:
fold1_pred = bnb_fold1.predict(amzn_fold1)
con_mat = confusion_matrix(amzn_fold1_targ,fold1_pred)
print('This is a test of the Training Data on itself')
eval_con_mat(con_mat)

This is a test of the Training Data on itself
Accuracy of 85.0%
Sensitivity of 92.05%
Specificity of 76.43%


In [49]:
fold2_pred = bnb_fold1.predict(amzn_fold2)
con_mat = confusion_matrix(amzn_fold2_targ,fold2_pred)
eval_con_mat(con_mat)

Accuracy of 70.0%
Sensitivity of 77.71%
Specificity of 62.28%


In [50]:
fold3_pred = bnb_fold1.predict(amzn_fold3)
con_mat = confusion_matrix(amzn_fold3_targ,fold3_pred)
eval_con_mat(con_mat)

Accuracy of 71.0%
Sensitivity of 74.05%
Specificity of 67.61%


<h4>Using fold 2 as the training data and then test folds 1 and 3</h4>

In [51]:
#Seems there is some severe overfitting, lets continue the cross validation.
bnb_fold2 = bnb.fit(amzn_fold2, amzn_fold2_targ)

In [52]:
fold2_pred = bnb_fold2.predict(amzn_fold2)
con_mat = confusion_matrix(amzn_fold2_targ,fold2_pred)
print('This is a test of the Training Data on itself')
eval_con_mat(con_mat)

This is a test of the Training Data on itself
Accuracy of 79.0%
Sensitivity of 63.25%
Specificity of 94.61%


In [53]:
fold1_pred = bnb_fold2.predict(amzn_fold1)
con_mat = confusion_matrix(amzn_fold1_targ,fold1_pred)

eval_con_mat(con_mat)

Accuracy of 72.0%
Sensitivity of 54.55%
Specificity of 91.72%


In [54]:
fold3_pred = bnb_fold2.predict(amzn_fold3)
con_mat = confusion_matrix(amzn_fold3_targ,fold3_pred)
eval_con_mat(con_mat)

Accuracy of 74.0%
Sensitivity of 49.37%
Specificity of 96.02%


<h4>Using fold 3 as the training data and then test folds 1 and 2</h4>

In [55]:
#Things look even WORSE using fold 2 for training.  Trying fold 3.
bnb_fold3 = bnb.fit(amzn_fold3, amzn_fold3_targ)

In [56]:
fold3_pred = bnb_fold3.predict(amzn_fold3)
con_mat = confusion_matrix(amzn_fold3_targ,fold3_pred)
print('This is a test of the Training Data on itself')
eval_con_mat(con_mat)

This is a test of the Training Data on itself
Accuracy of 69.0%
Sensitivity of 35.44%
Specificity of 100.0%


Well, to start off, fold 3 trained and tested against itself wasn't very good at all.

In [57]:
fold1_pred = bnb_fold3.predict(amzn_fold1)
con_mat = confusion_matrix(amzn_fold1_targ,fold1_pred)

eval_con_mat(con_mat)

Accuracy of 60.0%
Sensitivity of 27.27%
Specificity of 97.45%


In [58]:
fold2_pred = bnb_fold3.predict(amzn_fold2)
con_mat = confusion_matrix(amzn_fold2_targ,fold2_pred)

eval_con_mat(con_mat)

Accuracy of 63.0%
Sensitivity of 30.12%
Specificity of 95.81%


Fold three as a training set looks HORRENDOUS.  I think its safe to say that the features we have been using are very poor.  So maybe time to try another method to train the model.

<h1>Iteration 3- Train and test entire dataset with negative keywords</h1>

In [59]:
#Pulling in a list of negative keywords from someones github
neg_words_loc = 'https://raw.githubusercontent.com/williamgunn/SciSentiment/master/negative-words.txt'
neg_words_df = pd.read_csv(neg_words_loc, skiprows=35, header = None)
neg_words_list = neg_words_df[0]
neg_words_list= neg_words_list[neg_words_list!='bull****']
neg_words_list= neg_words_list[neg_words_list!='bull----']
neg_words_list= neg_words_list[neg_words_list!='f**k']

In [60]:
amzn_dat_neg = df.copy()
amzn_dat_neg.columns = ['review', 'positive_review']

In [61]:
for key in neg_words_list:
    amzn_dat_neg[str(key)] = amzn_dat_neg.review.str.contains(str(key), case = False)

In [62]:
amzn_dat_neg['positive_review'] = (amzn_dat_neg['positive_review'] == 1)

In [63]:
data_amzn_neg = amzn_dat_neg[neg_words_list]

In [64]:
amzn_target_neg = amzn_dat_neg['positive_review']

In [65]:
bnb = BernoulliNB()
bnb.fit(data_amzn_neg, amzn_target_neg)
pred = bnb.predict(data_amzn_neg)
con_mat = confusion_matrix(amzn_target_neg, pred)
eval_con_mat(con_mat)

Accuracy of 76.0%
Sensitivity of 97.2%
Specificity of 55.2%


Seems using negative keywords is slightly less effective when training against all the data, but lets doing some cross validation.
<h1>Iteration 4- Cross Validation using negative keywords</h1>

In [66]:
#First iteration: 3 folds
amzn_fold1_neg = data_amzn_neg[:333]
amzn_fold2_neg = data_amzn_neg[333:666]
amzn_fold3_neg = data_amzn_neg[666:]

#corresponding targets for 3 folds
amzn_fold1_targ_neg = amzn_target_neg[:333]
amzn_fold2_targ_neg = amzn_target_neg[333:666]
amzn_fold3_targ_neg = amzn_target_neg[666:]

In [67]:
bnb_fold1 = bnb.fit(amzn_fold1_neg, amzn_fold1_targ_neg)

In [68]:
fold1_pred = bnb_fold1.predict(amzn_fold1_neg)
con_mat = confusion_matrix(amzn_fold1_targ_neg,fold1_pred)
print('This is a test of the Training Data on itself')
eval_con_mat(con_mat)
print(con_mat)

This is a test of the Training Data on itself
Accuracy of 55.0%
Sensitivity of 100.0%
Specificity of 5.1%
[[  8 149]
 [  0 176]]


In [69]:
fold1_pred = bnb_fold1.predict(amzn_fold2_neg)
con_mat = confusion_matrix(amzn_fold2_targ_neg, fold2_pred)
eval_con_mat(con_mat)

Accuracy of 63.0%
Sensitivity of 30.12%
Specificity of 95.81%


In [70]:
fold1_pred = bnb_fold1.predict(amzn_fold3_neg)
con_mat = confusion_matrix(amzn_fold3_targ_neg, fold3_pred)
eval_con_mat(con_mat)

Accuracy of 69.0%
Sensitivity of 35.44%
Specificity of 100.0%


In [73]:
bnb_fold2 = bnb.fit(amzn_fold2_neg, amzn_fold2_targ_neg)

In [74]:
fold2_pred = bnb_fold2.predict(amzn_fold2_neg)
con_mat = confusion_matrix(amzn_fold2_targ_neg,fold2_pred)
print('This is a test of the Training Data on itself')
eval_con_mat(con_mat)
print(con_mat)

This is a test of the Training Data on itself
Accuracy of 78.0%
Sensitivity of 90.96%
Specificity of 64.67%
[[108  59]
 [ 15 151]]


In [78]:
fold1_pred = bnb_fold2.predict(amzn_fold1_neg)
con_mat = confusion_matrix(amzn_fold1_targ_neg, fold1_pred)
eval_con_mat(con_mat)
print(con_mat)

Accuracy of 64.0%
Sensitivity of 84.09%
Specificity of 41.4%
[[ 65  92]
 [ 28 148]]


In [79]:
fold3_pred = bnb_fold2.predict(amzn_fold3_neg)
con_mat = confusion_matrix(amzn_fold3_targ_neg,fold3_pred)
eval_con_mat(con_mat)
print(con_mat)

Accuracy of 62.0%
Sensitivity of 81.65%
Specificity of 44.89%
[[ 79  97]
 [ 29 129]]


In [80]:
bnb_fold3 = bnb.fit(amzn_fold3_neg, amzn_fold3_targ_neg)

In [81]:
fold3_pred = bnb_fold3.predict(amzn_fold3_neg)
con_mat = confusion_matrix(amzn_fold3_targ_neg,fold3_pred)
print('This is a test of the Training Data on itself')
eval_con_mat(con_mat)
print(con_mat)

This is a test of the Training Data on itself
Accuracy of 54.0%
Sensitivity of 2.53%
Specificity of 100.0%
[[176   0]
 [154   4]]


In [82]:
fold1_pred = bnb_fold3.predict(amzn_fold1_neg)
con_mat = confusion_matrix(amzn_fold1_targ_neg,fold1_pred)
eval_con_mat(con_mat)
print(con_mat)

Accuracy of 46.0%
Sensitivity of 1.7%
Specificity of 96.18%
[[151   6]
 [173   3]]


In [83]:
fold2_pred = bnb_fold3.predict(amzn_fold2_neg)
con_mat = confusion_matrix(amzn_fold2_targ_neg,fold2_pred)
eval_con_mat(con_mat)
print(con_mat)

Accuracy of 51.0%
Sensitivity of 3.01%
Specificity of 98.8%
[[165   2]
 [161   5]]


Pretty much every iteration seems like an overfit.  They are all pretty innaccurate when cross-validated and always less accurate on the test data than the training data.

<h1>I think it would be best to go over these questions with you as I don't really know how to answer.  Especially the which features seem best question</h1>
Which seem to perform the best? Why?
What features seemed to be most impactful to performance?