# Amazon Fine Food Review Analysis using Naive Bayes

In [2]:
%matplotlib inline

import sqlite3
import pandas as pd
import seaborn as sn
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from collections import Counter
from sklearn import cross_validation



In [3]:
# using the SQLite Table to read data.
con = sqlite3.connect('final_half.sqlite') 

In [4]:
sorted_data = pd.read_sql_query("""SELECT * FROM Reviews_half""", con)
labels_count = sorted_data['Score'].value_counts()
labels = sorted_data['Score']
print(labels_count)

positive    153513
negative     28573
Name: Score, dtype: int64


In [5]:
sorted_data.shape

(182086, 11)

In [6]:
sorted_data.head()         # Time is in ascending order which means the dataset is of Time Based slicing.

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,417839,451856,B00004CXX9,AIUWLEQ1ADEG5,Elizabeth Medina,0,0,positive,944092800,Entertainingl Funny!,Beetlejuice is a well written movie ..... ever...
1,346055,374359,B00004CI84,A344SMIA5JECGM,Vincent P. Ross,1,2,positive,944438400,A modern day fairy tale,"A twist of rumplestiskin captured on film, sta..."
2,346041,374343,B00004CI84,A1B2IZU1JLZA6,Wes,19,23,negative,948240000,WARNING: CLAMSHELL EDITION IS EDITED TV VERSION,"I, myself always enjoyed this movie, it's very..."
3,70688,76882,B00002N8SM,A32DW342WBJ6BX,Buttersugar,0,0,positive,948672000,A sure death for flies,I bought a few of these after my apartment was...
4,346141,374450,B00004CI84,ACJR7EQF9S6FP,Jeremy Robertson,2,3,positive,951523200,Bettlejuice...Bettlejuice...BETTLEJUICE!,What happens when you say his name three times...


# Bag of words

In [7]:
count_vect = CountVectorizer() #in scikit-learn
final_counts = count_vect.fit_transform(sorted_data['Text'].values)

In [8]:
final_counts.shape

(182086, 82343)

In [9]:
# split the data set into train and test
X_1, X_test, y_1, y_test = cross_validation.train_test_split(final_counts, labels, test_size=0.3, random_state=0)
# split the train data set into cross validation train and cross validation test
X_tr, X_cv, y_tr, y_cv = cross_validation.train_test_split(X_1, y_1, test_size=0.3)

In [10]:
myList = list(range(1,11))

In [11]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score


cv_scores = []
for alpha in myList:
    nb = BernoulliNB(alpha=alpha)
    scores = cross_val_score(nb, X_tr, y_tr, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
print(cv_scores)

[0.86508930836408049, 0.85456495566019797, 0.84707809255023858, 0.84265083906963711, 0.84040926330852217, 0.84008421905099484, 0.8399497087527964, 0.8400618012957809, 0.84027475929029927, 0.84031961489957718]


In [12]:
# changing to misclassification error
MSE = [1 - x for x in cv_scores]

# determining best k
optimal_aplha = myList[MSE.index(min(MSE))]
print('\nThe optimal number of alpha is %d.' % optimal_aplha)


The optimal number of alpha is 1.


In [13]:
# instantiate learning model k = optimal_k
NB_optimal = BernoulliNB(alpha=optimal_aplha)

# fitting the model
NB_optimal.fit(X_tr, y_tr)

# predict the response
pred = NB_optimal.predict(X_test)

# evaluate accuracy
acc = accuracy_score(y_test, pred) * 100
print('\nThe accuracy of the NB classifier for k = %d is %f%%' % (optimal_aplha, acc))


The accuracy of the NB classifier for k = 1 is 86.726101%


In [16]:
neg_class_prob_sorted = NB_optimal.feature_log_prob_[0, :].argsort()
pos_class_prob_sorted = NB_optimal.feature_log_prob_[1, :].argsort()
print(neg_class_prob_sorted)
print(pos_class_prob_sorted)
print('top negative words \n',np.take(count_vect.get_feature_names(), neg_class_prob_sorted[:10]))
print('top positive words \n',np.take(count_vect.get_feature_names(), pos_class_prob_sorted[:10]))

[41171 49363 49364 ..., 74648  5738 73802]
[82342 35525 35526 ..., 74648  5738 73802]
top negative words 
 ['intermittently' 'mizudashi' 'mizuki' 'mj' 'mjr' 'mkae' 'mke' 'mkt'
 'mla75001' 'mlfsd']
top positive words 
 ['ît' 'goshoptnt' 'gosicknic' 'gospel' 'gossip' 'titanic' 'gota' 'gotchas'
 'gottcha' 'tisk']


In [17]:
print(y_test.value_counts())
import collections
print(collections.Counter(pred))
print(y_test.size)

positive    46169
negative     8457
Name: Score, dtype: int64
Counter({'positive': 48052, 'negative': 6574})
54626


In [36]:
from sklearn.metrics import confusion_matrix
labels_conf = ['negative', 'positive']
conf_mat = confusion_matrix(y_test, pred,labels = labels_conf)
print(conf_mat)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print('tn - True Negavite Rate',tn)
print('fp - False Positive Rate',fp)
print('fn - False Negative Rate',fn)
print('tp - True Positive Rate',tp)

[[ 3890  4567]
 [ 2684 43485]]
tn - True Negavite Rate 3890
fp - False Positive Rate 4567
fn - False Negative Rate 2684
tp - True Positive Rate 43485


In [24]:
TPR = tp/(tp+fn)
print('True Positive Rate',TPR)
TNR = tn/(tn+fp)
print('True Negative Rate',TNR)
FPR = fp/(tn+fp)
print('False Positive Rate',FPR)
FNR = fn/(tp+fn)
print('False Negative Rate',FNR)

True Positive Rate 0.941865754077
True Negative Rate 0.459973986047
False Positive Rate 0.540026013953
False Negative Rate 0.0581342459226


In [25]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print('precision_score',precision_score(y_test, pred,pos_label='positive'))
print('recall_score',recall_score(y_test, pred,pos_label='positive'))

precision_score 0.904957129776
recall_score 0.941865754077


In [99]:
f1 = f1_score(y_test, pred,pos_label='positive')
print(f1)

0.925219863573


# TF-IDF

In [26]:
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(sorted_data['Text'].values)
print(final_tf_idf.shape)

(182086, 1884182)


In [27]:
# split the data set into train and test
X_1_tf, X_test_tf, y_1_tf, y_test_tf = cross_validation.train_test_split(final_tf_idf, labels, test_size=0.3, random_state=0)
# split the train data set into cross validation train and cross validation test
X_tr_tf, X_cv_tf, y_tr_tf, y_cv_tf = cross_validation.train_test_split(X_1_tf, y_1_tf, test_size=0.3)

In [28]:
myList_tf = list(range(1,11))
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score


cv_scores_tf_idf = []
for alpha_tf in myList_tf:
    nb_tf = MultinomialNB(alpha=alpha_tf)
    scores_tf = cross_val_score(nb_tf, X_tr_tf, y_tr_tf, cv=10, scoring='accuracy')
    cv_scores_tf_idf.append(scores_tf.mean())
print(cv_scores_tf_idf)

[0.84213535130182993, 0.84213535130182993, 0.84213535130182993, 0.84213535130182993, 0.84213535130182993, 0.84213535130182993, 0.84213535130182993, 0.84213535130182993, 0.84213535130182993, 0.84213535130182993]


In [29]:
# changing to misclassification error
MSE_tf = [1 - x for x in cv_scores_tf_idf]

# determining best k
optimal_aplha_tf = myList_tf[MSE_tf.index(min(MSE_tf))]
print('\nThe optimal number of alpha is %d.' % optimal_aplha_tf)


The optimal number of alpha is 1.


In [30]:
# instantiate learning model k = optimal_k
NB_optimal_tf = MultinomialNB(alpha=optimal_aplha_tf)

# fitting the model
NB_optimal_tf.fit(X_tr_tf, y_tr_tf)

# predict the response
pred_tf = NB_optimal_tf.predict(X_test_tf)

# evaluate accuracy
acc_tf = accuracy_score(y_test_tf, pred_tf) * 100
print('\nThe accuracy of the NB classifier for k = %d is %f%%' % (optimal_aplha_tf, acc_tf))
# print(NB_optimal.feature_log_prob_)
# print(NB_optimal.class_count_ )


The accuracy of the NB classifier for k = 1 is 84.518361%


In [31]:
neg_class_prob_sorted_tf = NB_optimal_tf.feature_log_prob_[0, :].argsort()
pos_class_prob_sorted_tf = NB_optimal_tf.feature_log_prob_[1, :].argsort()
print(neg_class_prob_sorted_tf)
print(pos_class_prob_sorted_tf)
print(np.take(tf_idf_vect.get_feature_names(), neg_class_prob_sorted_tf[:10]))
print(np.take(tf_idf_vect.get_feature_names(), pos_class_prob_sorted_tf[:10]))

[ 942090 1189649 1189648 ...,   89472  860134 1620802]
[ 942090 1016312 1016324 ...,  860134   89472 1620802]
['liquid mothballs' 'overall meal' 'overall maybe' 'overall mass'
 'overall make' 'overall maintains' 'overall made' 'overall low'
 'overall loved' 'overall love']
['liquid mothballs' 'mild boring' 'mild burning' 'mild burnt' 'mild butter'
 'mild ca' 'mild candy' 'mild capuccino' 'mild category' 'mild chared']


In [32]:
from sklearn.metrics import confusion_matrix
conf_mat_tf = confusion_matrix(y_test_tf, pred_tf)
print(conf_mat_tf)
tn_tf, fp_tf, fn_tf, tp_tf = confusion_matrix(y_test_tf, pred_tf).ravel()
print('tn - True Negative Rate',tn_tf)
print('fp - False Positive Rate',fp_tf)
print('fn - False Negative Rate',fn_tf)
print('tp - True Positive Rate',tp_tf)

[[    0  8457]
 [    0 46169]]
tn - True Negative Rate 0
fp - False Positive Rate 8457
fn - False Negative Rate 0
tp - True Positive Rate 46169


In [34]:
TPR_tf = tp_tf/(tp_tf+fn_tf)
print('TPR - True Positive Rate',TPR_tf)
TNR_tf = tn_tf/(tn_tf+fp_tf)
print('TNR - True Negative Rate',TNR_tf)
FPR_tf = fp_tf/(tn_tf+fp_tf)
print('FPR - False Positive Rate',FPR_tf)
FNR_tf = fn_tf/(tp_tf+fn_tf)
print('FNR - False Negative Rate',FNR_tf)

TPR - True Positive Rate 1.0
TNR - True Negative Rate 0.0
FPR - False Positive Rate 1.0
FNR - False Negative Rate 0.0


In [110]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print('precision_score',precision_score(y_test_tf, pred_tf,pos_label='positive'))
print('recall_score',recall_score(y_test_tf, pred_tf,pos_label='positive'))

precision_score 0.845183612199
recall_score 1.0


In [35]:
f1_tf = f1_score(y_test_tf, pred_tf,pos_label='positive')
print(f1_tf)

0.916097028622
