In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

The data contains Amazon pet supply reviews and can be found here: http://jmcauley.ucsd.edu/data/amazon/

In [2]:
raw_data = pd.read_json('datafiles/reviews_Pet_Supplies_5.json',lines=True)

In [3]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157836 entries, 0 to 157835
Data columns (total 9 columns):
asin              157836 non-null object
helpful           157836 non-null object
overall           157836 non-null int64
reviewText        157836 non-null object
reviewTime        157836 non-null object
reviewerID        157836 non-null object
reviewerName      156493 non-null object
summary           157836 non-null object
unixReviewTime    157836 non-null int64
dtypes: int64(2), object(7)
memory usage: 12.0+ MB


In [4]:
raw_data.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,1223000893,"[0, 0]",3,I purchased the Trilogy with hoping my two cat...,"01 12, 2011",A14CK12J7C7JRK,Consumer in NorCal,Nice Distraction for my cats for about 15 minutes,1294790400
1,1223000893,"[0, 0]",5,There are usually one or more of my cats watch...,"09 14, 2013",A39QHP5WLON5HV,Melodee Placial,Entertaining for my cats,1379116800
2,1223000893,"[0, 0]",4,I bought the triliogy and have tested out all ...,"12 19, 2012",A2CR37UY3VR7BN,Michelle Ashbery,Entertaining,1355875200
3,1223000893,"[2, 2]",4,My female kitty could care less about these vi...,"05 12, 2011",A2A4COGL9VW2HY,Michelle P,Happy to have them,1305158400
4,1223000893,"[6, 7]",3,"If I had gotten just volume two, I would have ...","03 5, 2012",A2UBQA85NIGLHA,"Tim Isenhour ""Timbo""",You really only need vol 2,1330905600


In [5]:
raw_data['reviewText'] = raw_data['reviewText'].str.lower()
raw_data['summary'] = raw_data['summary'].str.lower()

In [6]:
raw_data['positive'] = np.where(raw_data['overall'] < 4, 0, 1)

In [7]:
keywords = ['great','good','love','recommend','nice','excellent', 'best','simple', 'lasted'
            ,'satisfied','quality','easy','well','happy', 'sturdy','pleased',
           'superb', 'value', 'must have', 'incredible','helpful','fine','deal']

for key in keywords:
    raw_data[str(key)] = raw_data.reviewText.str.contains(str(key), case=False)

In [8]:
X = raw_data[keywords]
Y = raw_data['positive']

In [9]:
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(X, Y)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(X)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    X.shape[0],
    (Y != y_pred).sum()
))

Number of mislabeled points out of a total 157836 points : 33595


The model correctly identified 78.7% of the data.

In [10]:
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, X, Y, cv=10)

array([ 0.78623923,  0.78725291,  0.7871262 ,  0.78693614,  0.78725291,
        0.78661936,  0.78718956,  0.78725291,  0.78722595,  0.78716259])

Running cross validation, the accuracy scores are pretty consistent.  This indicates that out model is not overfitting at a variable amount. Yay!

In [11]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y, y_pred)

array([[    13,  33575],
       [    20, 124228]])

In [12]:
print('Sensitivity: ', 124228/(124228 + 20) )
print('Specificity: ', 13/(35575 + 13) )

Sensitivity:  0.999839031614191
Specificity:  0.0003652916713498932


Hmm.. Very good at identifying positive reviews.  Horrendous at identifying negative reviews. Could this be due to a class imbalance?

In [13]:
raw_data['overall'].value_counts()

5    96253
4    27995
3    15933
2     8907
1     8748
Name: overall, dtype: int64

A lot more reviews with 5s than anything else.  Could account for the low specificity of the model due to class imbalance.