In [9]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import html5lib
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix

In [95]:
def eval_con_mat(matrix):
    #use confusion matrix to compute accuracy (good calls divided by all calls made)
    print('Accuracy of {}%'.format(round(100*
                                         (matrix[1][1]+matrix[0][0])/
                                         (matrix[1][0]+matrix[0][1]+matrix[1][1]+matrix[0][0])),2))
    
    print('Sensitivity of {}%'.format(round(100*matrix[1][1]/(matrix[1][1] + matrix[1][0]),2)))
    print('Specificity of {}%'.format(round(100*matrix[0][1]/(matrix[0][0] + matrix[0][1]),2)))

In [2]:
#Read in a list of 2000+ positive words
pos_words_loc = ('https://github.com/gurkpet/Thinkful-Lessons/blob/master/'
             'Thinkful%202.2.7-Naive%20Bayes/Word%20Lists/positive-words.txt')
pos_words_list = pd.read_html(pos_words_loc, skiprows=35)

#clean up the read in by removing the first column of junk data
for row in pos_words_list:
   del row[0]    

#convert list into dataframe
pos_words = pd.DataFrame(pos_words_list[0])

In [3]:
list_pos_words = list(pos_words[1])

In [5]:
#read in amazon file
amzn_file_loc = ('https://raw.githubusercontent.com/gurkpet/Thinkful-Lessons/master/'
            'Thinkful%202.2.7-Naive%20Bayes/amazon_cells_labelled.txt')

amzn_dat = pd.read_csv(amzn_file_loc, delimiter= '\t', header=None)
#rename column headers
amzn_dat.columns = ['review', 'positive_review']
amzn_target = amzn_dat['positive_review']

In [6]:
for key in list_pos_words:
    amzn_dat[str(key)] = amzn_dat.review.str.contains(str(key), case = False)
amzn_dat['positive_review'] = (amzn_dat['positive_review'] == 1)
data_amzn = amzn_dat[list_pos_words]
amzn_target = amzn_dat['positive_review']

In [92]:
bnb = BernoulliNB()
bnb.fit(data_amzn, amzn_target)
pred = bnb.predict(data_amzn)
con_mat = confusion_matrix(amzn_target, pred)
eval_con_mat(con_mat)

Accuracy of 83.0%
Sensitivity of 75.8%
Specificity of 9.0%


In [34]:
#First iteration: 3 folds
amzn_fold1 = data_amzn[:333]
amzn_fold2 = data_amzn[333:666]
amzn_fold3 = data_amzn[666:]

#corresponding targets for 3 folds
amzn_fold1_targ = amzn_dat['positive_review'][:333]
amzn_fold2_targ = amzn_dat['positive_review'][333:666]
amzn_fold3_targ = amzn_dat['positive_review'][666:]

In [35]:
bnb_fold1 = bnb.fit(amzn_fold1, amzn_fold1_targ)

<h4>Using fold 1 as the training data and then test folds 2 and 3</h4>

In [98]:
fold1_pred = bnb_fold1.predict(amzn_fold1)
con_mat = confusion_matrix(amzn_fold1_targ,fold1_pred)
print('This is a test of the Training Data on itself')
eval_con_mat(con_mat)

This is a test of the Training Data on itself
Accuracy of 60.0%
Sensitivity of 27.27%
Specificity of 2.55%


In [77]:
fold2_pred = bnb_fold1.predict(amzn_fold2)
con_mat = confusion_matrix(amzn_fold2_targ,fold2_pred)
eval_con_mat(con_mat)

Accuracy of 70.0%
Sensitivity of 77.71%
Specificity of 37.72%


In [78]:
fold3_pred = bnb_fold1.predict(amzn_fold3)
con_mat = confusion_matrix(amzn_fold3_targ,fold3_pred)
eval_con_mat(con_mat)

Accuracy of 71.0%
Sensitivity of 74.05%
Specificity of 32.39%


<h4>Using fold 2 as the training data and then test folds 1 and 3</h4>

In [80]:
#Seems there is some severe overfitting, lets continue the cross validation.
bnb_fold2 = bnb.fit(amzn_fold2, amzn_fold2_targ)

In [96]:
fold2_pred = bnb_fold2.predict(amzn_fold2)
con_mat = confusion_matrix(amzn_fold2_targ,fold2_pred)
print('This is a test of the Training Data on itself')
eval_con_mat(con_mat)

This is a test of the Training Data on itself
Accuracy of 63.0%
Sensitivity of 30.12%
Specificity of 4.19%


In [83]:
fold1_pred = bnb_fold2.predict(amzn_fold1)
con_mat = confusion_matrix(amzn_fold1_targ,fold1_pred)

eval_con_mat(con_mat)

Accuracy of 72.0%
Sensitivity of 54.55%
Specificity of 8.28%


In [84]:
fold3_pred = bnb_fold2.predict(amzn_fold3)
con_mat = confusion_matrix(amzn_fold3_targ,fold3_pred)
eval_con_mat(con_mat)

Accuracy of 74.0%
Sensitivity of 49.37%
Specificity of 3.98%


<h4>Using fold 3 as the training data and then test folds 1 and 2</h4>

In [86]:
#Things look even WORSE using fold 2 for training.  Trying fold 3.
bnb_fold3 = bnb.fit(amzn_fold3, amzn_fold3_targ)

In [97]:
fold3_pred = bnb_fold3.predict(amzn_fold3)
con_mat = confusion_matrix(amzn_fold3_targ,fold3_pred)
print('This is a test of the Training Data on itself')
eval_con_mat(con_mat)

This is a test of the Training Data on itself
Accuracy of 69.0%
Sensitivity of 35.44%
Specificity of 0.0%


Well, to start off, fold 3 trained and tested against itself wasn't very good at all.

In [87]:
fold1_pred = bnb_fold3.predict(amzn_fold1)
con_mat = confusion_matrix(amzn_fold1_targ,fold1_pred)

eval_con_mat(con_mat)

Accuracy of 60.0%
Sensitivity of 27.27%
Specificity of 2.55%


In [89]:
fold2_pred = bnb_fold3.predict(amzn_fold2)
con_mat = confusion_matrix(amzn_fold2_targ,fold2_pred)

eval_con_mat(con_mat)

Accuracy of 63.0%
Sensitivity of 30.12%
Specificity of 4.19%


Fold three as a training set looks HORRENDOUS.  I think its safe to say that the features we have been using are very poor.