In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; warnings.simplefilter('ignore')

In [2]:
# Grab and process the raw data.
data_path = ('./amazon_cells_labelled.txt')
sentiment_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sentiment_raw.columns = ['review', 'sentiment']

In [23]:
sentiment_raw.head()

Unnamed: 0,review,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [86]:
s1_df = sentiment_raw.copy() # make the first copy

In [87]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [88]:
bigram = ['do not', 'no problem', 'the best', 'loved it','the best', 'the greatest','so good','so great', 'not good', 'not bad', 'really great','really bad', 'really good', "very good", 
          'not impressed','very impressive', 'well made', 'badly made', 'not great', 'too big', 'too small', 
          'very poor', 'not working','working good', "doesn't work", "didn't work", "doesn't fit", 
          "wouldn't recommend", 'would recommend', 'not nice', 'not working','not easy', 'not happy']

unigram = ['problem','broken','good', 'great', 'garbage', 'junk', 'disappointed', 'love', 'not', 'ugly', 'beautiful', 
           'pleased', 'very', 'waste','flaw', "doesn't", 'avoid', 'back', 'fast', 'slow', 'useless', 'useful', 
           'quality', 'well', 'beware', 'recommend', 'nice','quality', 'cool', 'satisfied', 'easy', 'fine', 
           'never', 'negative', 'lose', 'awful', 'lousy', 'unsatisfactory','satisfactory', 'mistake', 'happy', 
           'sad']

* Created new single word list (unigram) and double word list (bigram)

* Let's create the first model using unigram

## First Model using Unigram ##

In [89]:
for word in unigram:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    s1_df[str(word)] = s1_df.review.str.contains(
       str(word) ,
        case=False
    )
# ' ' + str(word) + ' '

In [90]:
s1_df.head()

Unnamed: 0,review,sentiment,problem,broken,good,great,garbage,junk,disappointed,love,...,never,negative,lose,awful,lousy,unsatisfactory,satisfactory,mistake,happy,sad
0,So there is no way for me to plug it in here i...,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",1,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,1,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,1,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [91]:
data1 = s1_df[unigram]
target1 = s1_df['sentiment']

In [92]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb1 = BernoulliNB()

# Fit our model to the data.
bnb1.fit(data1, target1)

# Classify, storing the result in a new variable.
y_pred1 = bnb1.predict(data1)

num_data = data1.shape[0]
correct = (num_data - (target1 != y_pred1).sum())/num_data * 100

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data1.shape[0],
    (target1 != y_pred1).sum()
))

print("Accuracy = {:.2f}%".format(correct))

Number of mislabeled points out of a total 1000 points : 243
Accuracy = 75.70%


In [93]:
cvout1 = cross_val_score(bnb1, data1, target1, cv=5)
print(cvout1)
print("cv average is = {:.2f}%".format(cvout1.mean()*100))

[0.78  0.75  0.765 0.74  0.715]
cv average is = 75.00%


In [94]:
cm1 = confusion_matrix(target1, y_pred1)
print(cm1)
print("True Positive = {:.2f}%".format(cm1[1,1]/10))
print("False Positive = {:.2f}%".format(cm1[0,1]/10))
print("True Negative = {:.2f}%".format(cm1[0,0]/10))
print("False Negative = {:.2f}%".format(cm1[1,0]/10))
print("sensitivity or hit rate is {:.2f}%".format(cm1[1,1]/(cm1[1,0]+cm1[1,1])*100))
# sensitivity (recall) is the percentage of positives identified or TP/FN+TP
print("specificity or True Negative rate is {:.2f}%".format(cm1[0,0]/(cm1[0,0]+cm1[0,1])*100))

[[441  59]
 [184 316]]
True Positive = 31.60%
False Positive = 5.90%
True Negative = 44.10%
False Negative = 18.40%
sensitivity or hit rate is 63.20%
specificity or True Negative rate is 88.20%


## Second Model using Bigram ##

In [95]:
s2_df = sentiment_raw.copy() # make the second copy

In [96]:
for word in bigram:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    s2_df[str(word)] = s2_df.review.str.contains(
       str(word) ,
        case=False
    )
# ' ' + str(word) + ' '

In [97]:
data2 = s2_df[bigram]
target2 = s2_df['sentiment']

In [98]:


# Instantiate our model and store it in a new variable.
bnb2 = BernoulliNB()

# Fit our model to the data.
bnb2.fit(data2, target2)

# Classify, storing the result in a new variable.
y_pred2 = bnb2.predict(data2)

num_data = data2.shape[0]
correct = (num_data - (target2 != y_pred2).sum())/num_data * 100

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data2.shape[0],
    (target2 != y_pred2).sum()
))

print("Accuracy = {:.2f}%".format(correct))

Number of mislabeled points out of a total 1000 points : 457
Accuracy = 54.30%


In [99]:
cvout2 = cross_val_score(bnb2, data2, target2, cv=5)
print(cvout2)
print("cv average is = {:.2f}%".format(cvout2.mean()*100))

[0.55  0.53  0.545 0.54  0.53 ]
cv average is = 53.90%


In [101]:
cm2 = confusion_matrix(target2, y_pred2)
print(cm2)
print("True Positive = {:.2f}%".format(cm2[1,1]/10))
print("False Positive = {:.2f}%".format(cm2[0,1]/10))
print("True Negative = {:.2f}%".format(cm2[0,0]/10))
print("False Negative = {:.2f}%".format(cm2[1,0]/10))
print("sensitivity or hit rate is {:.2f}%".format(cm2[1,1]/(cm2[1,0]+cm2[1,1])*100))
# sensitivity (recall) is the percentage of positives identified or TP/FN+TP
print("specificity or True Negative rate is {:.2f}%".format(cm2[0,0]/(cm2[0,0]+cm2[0,1])*100))
# percentage of negatives identified

[[498   2]
 [455  45]]
True Positive = 4.50%
False Positive = 0.20%
True Negative = 49.80%
False Negative = 45.50%
sensitivity or hit rate is 9.00%
specificity or True Negative rate is 99.60%


## Third Model using Unigram with Bigram ##

In [102]:
s3_df = sentiment_raw.copy() # make the third copy

In [103]:
for word in bigram:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    s3_df[str(word)] = s3_df.review.str.contains(
       str(word) ,
        case=False
    )
    
for word in unigram:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    s3_df[str(word)] = s3_df.review.str.contains(
       str(word) ,
        case=False
    )

In [104]:
data3 = s3_df[unigram + bigram]
target3 = s3_df['sentiment']

In [105]:
# Instantiate our model and store it in a new variable.
bnb3 = BernoulliNB()

# Fit our model to the data.
bnb3.fit(data3, target3)

# Classify, storing the result in a new variable.
y_pred3 = bnb3.predict(data3)

num_data = data3.shape[0]
correct = (num_data - (target3 != y_pred3).sum())/num_data * 100

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data3.shape[0],
    (target3 != y_pred3).sum()
))

print("Accuracy = {:.2f}%".format(correct))

Number of mislabeled points out of a total 1000 points : 226
Accuracy = 77.40%


In [106]:
cvout3 = cross_val_score(bnb3, data3, target3, cv=5)
print(cvout3)
print("cv average is = {:.2f}%".format(cvout3.mean()*100))

[0.795 0.765 0.785 0.76  0.72 ]
cv average is = 76.50%


In [107]:
cm3 = confusion_matrix(target3, y_pred3)
print(cm3)
print("True Positive = {:.2f}%".format(cm3[1,1]/10))
print("False Positive = {:.2f}%".format(cm3[0,1]/10))
print("True Negative = {:.2f}%".format(cm3[0,0]/10))
print("False Negative = {:.2f}%".format(cm3[1,0]/10))
print("sensitivity or hit rate is {:.2f}%".format(cm3[1,1]/(cm3[1,0]+cm3[1,1])*100))
# sensitivity (recall) is the percentage of positives identified or TP/FN+TP
print("specificity or True Negative rate is {:.2f}%".format(cm3[0,0]/(cm3[0,0]+cm3[0,1])*100))
# percentage of negatives identified

[[443  57]
 [169 331]]
True Positive = 33.10%
False Positive = 5.70%
True Negative = 44.30%
False Negative = 16.90%
sensitivity or hit rate is 66.20%
specificity or True Negative rate is 88.60%


## Fourth Model with Unigrams and Multinomial Naive Bayes ##

In [115]:
s4_df = sentiment_raw.copy() # make the fourt copy

In [116]:
import re

In [117]:
for word in unigram:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    s4_df[str(word)] = s4_df.review.str.count(
       str(word) ,
        flags=re.IGNORECASE
    )

In [119]:
s4_df.head()

Unnamed: 0,review,sentiment,problem,broken,good,great,garbage,junk,disappointed,love,...,never,negative,lose,awful,lousy,unsatisfactory,satisfactory,mistake,happy,sad
0,So there is no way for me to plug it in here i...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Good case, Excellent value.",1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Great for the jawbone.,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Tied to charger for conversations lasting more...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The mic is great.,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
data4 = s4_df[unigram]
target4 = s4_df['sentiment']

In [122]:
from sklearn.naive_bayes import MultinomialNB

In [123]:
# Instantiate our model and store it in a new variable.
bnb4 = MultinomialNB()

# Fit our model to the data.
bnb4.fit(data4, target4)

# Classify, storing the result in a new variable.
y_pred4 = bnb4.predict(data4)

num_data = data4.shape[0]
correct = (num_data - (target4 != y_pred4).sum())/num_data * 100

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data4.shape[0],
    (target4 != y_pred4).sum()
))

print("Accuracy = {:.2f}%".format(correct))

Number of mislabeled points out of a total 1000 points : 242
Accuracy = 75.80%


In [124]:
cvout4 = cross_val_score(bnb4, data4, target4, cv=5)
print(cvout4)
print("cv average is = {:.2f}%".format(cvout4.mean()*100))

[0.775 0.755 0.765 0.725 0.715]
cv average is = 74.70%


In [125]:
cm4 = confusion_matrix(target4, y_pred4)
print(cm4)
print("True Positive = {:.2f}%".format(cm4[1,1]/10))
print("False Positive = {:.2f}%".format(cm4[0,1]/10))
print("True Negative = {:.2f}%".format(cm4[0,0]/10))
print("False Negative = {:.2f}%".format(cm4[1,0]/10))
print("sensitivity or hit rate is {:.2f}%".format(cm4[1,1]/(cm4[1,0]+cm4[1,1])*100))
# sensitivity (recall) is the percentage of positives identified or TP/FN+TP
print("specificity or True Negative rate is {:.2f}%".format(cm4[0,0]/(cm4[0,0]+cm4[0,1])*100))
# percentage of negatives identified

[[443  57]
 [185 315]]
True Positive = 31.50%
False Positive = 5.70%
True Negative = 44.30%
False Negative = 18.50%
sensitivity or hit rate is 63.00%
specificity or True Negative rate is 88.60%


## Fifth Model using both Unigram and Bigram using Multinomial Bayes ##

In [127]:
s5_df = sentiment_raw.copy() # make the fifth copy

In [128]:
for word in unigram:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    s5_df[str(word)] = s5_df.review.str.count(
       str(word) ,
        flags=re.IGNORECASE
    )
    
for word in bigram:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    s5_df[str(word)] = s5_df.review.str.count(
       str(word) ,
        flags=re.IGNORECASE
    )

In [129]:
data5 = s5_df[unigram + bigram]
target5 = s5_df['sentiment']

In [130]:
# Instantiate our model and store it in a new variable.
bnb5 = MultinomialNB()

# Fit our model to the data.
bnb5.fit(data5, target5)

# Classify, storing the result in a new variable.
y_pred5 = bnb5.predict(data5)

num_data = data5.shape[0]
correct = (num_data - (target5 != y_pred5).sum())/num_data * 100

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data5.shape[0],
    (target5 != y_pred5).sum()
))

print("Accuracy = {:.2f}%".format(correct))

Number of mislabeled points out of a total 1000 points : 222
Accuracy = 77.80%


In [131]:
cvout5 = cross_val_score(bnb5, data5, target5, cv=5)
print(cvout5)
print("cv average is = {:.2f}%".format(cvout5.mean()*100))

[0.79  0.765 0.785 0.74  0.73 ]
cv average is = 76.20%


In [132]:
cm5 = confusion_matrix(target5, y_pred5)
print(cm5)
print("True Positive = {:.2f}%".format(cm5[1,1]/10))
print("False Positive = {:.2f}%".format(cm5[0,1]/10))
print("True Negative = {:.2f}%".format(cm5[0,0]/10))
print("False Negative = {:.2f}%".format(cm5[1,0]/10))
print("sensitivity or hit rate is {:.2f}%".format(cm5[1,1]/(cm5[1,0]+cm5[1,1])*100))
# sensitivity (recall) is the percentage of positives identified or TP/FN+TP
print("specificity or True Negative rate is {:.2f}%".format(cm5[0,0]/(cm5[0,0]+cm5[0,1])*100))
# percentage of negatives identified

[[448  52]
 [170 330]]
True Positive = 33.00%
False Positive = 5.20%
True Negative = 44.80%
False Negative = 17.00%
sensitivity or hit rate is 66.00%
specificity or True Negative rate is 89.60%


## Conclusions ##

* Naive Bayes models do not easily suffer from Overfit

* Models will probably continue to improve with longer lists of words (Unigrams and Bigrams).

* Best model by a slim margin is model 5 (Unigrams + Bigrams with Multinomial NB).

* Multinomial NB model only slightly improves results

* Bigram NB model (model2) has extremely high True Negative Rate but low hit rate.

* We should probably vote model2 with model 5 for even better results.