In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Grab and process the raw data.
df = pd.read_csv('amazon_cells_labelled.txt', sep="\t", header=None)
df.columns = ['review', 'type']

#Set up the keywords

keywords = ["won\'t", 'refund', 'disappointed', 'horrible', 'not', 'hate', 'terrible', 'bad', "don\'t", 'died', 
            'wasted', 'waste', 'return', 'useless', 'died', 'dead', 'broke', 'broken', 'excessive', 
            'problem', 'unusable', "wouldn\'t", "couldn\'t", 'garbage']

#remove: worthless, misleading, unacceptable 

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    df[str(key)] = df.review.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )
    
df['allcaps'] = df.review.str.isupper()

#This switches it so negative reviews are True, positive reviews are False

df['reviews'] = (df['type'] == 0)

data = df[keywords + ['allcaps']]
target = df['reviews']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

print("Accuracy : {}%".format(
    bnb.score(data, target) * 100
))

Number of mislabeled points out of a total 1000 points : 372
Accuracy : 62.8%


In [3]:
print(df['type'].value_counts())

1    500
0    500
Name: type, dtype: int64


In [4]:
# Create the confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[478,  22],
       [350, 150]])

 350 of our 372 errors are from failing to identify a bad review (False negative - Type II Error)
 
 22 of our 372 errors are from incorrectly identifying a good review (False positive - Type I Error)
 
 Sensitivity is the percentage of positives correctly identified, in our case 150/500 or 30%
 
 Specificity is the opposite and ours is  478/500 or 95.6%.

In [5]:
# Test your model with different holdout groups.

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.645
Testing on Sample: 0.628


In [6]:
# Cross validation

from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.64, 0.61, 0.65, 0.64, 0.64, 0.54, 0.66, 0.61, 0.65, 0.58])

## Version 2 of Classifier (Attempt to Lower Overfit)

In [7]:
# Grab and process the raw data.
df2 = pd.read_csv('amazon_cells_labelled.txt', sep="\t", header=None)
df2.columns = ['review', 'type']

#Set up the keywords

keywords2 = ["hate", "refund", "useless", "stupid", "broken", "damaged"]


for key in keywords2:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    df2[str(key)] = df2.review.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )
    
df2['allcaps'] = df2.review.str.isupper()

#This switches it so negative reviews are True, positive reviews are False

df2['reviews'] = (df2['type'] == 0)

data2 = df2[keywords2 + ['allcaps']]
target2 = df2['reviews']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb2 = BernoulliNB()

# Fit our model to the data.
bnb2.fit(data2, target2)

# Classify, storing the result in a new variable.
y_pred = bnb2.predict(data2)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data2.shape[0],
    (target2 != y_pred).sum()
))

print("Accuracy : {}%".format(
    bnb2.score(data2, target2) * 100
))

Number of mislabeled points out of a total 1000 points : 483
Accuracy : 51.7%


In [8]:
# Create the confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[495,   5],
       [478,  22]])

In [9]:
# Test your model with different holdout groups.

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.645
Testing on Sample: 0.628


In [10]:
# Cross validation

from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.64, 0.61, 0.65, 0.64, 0.64, 0.54, 0.66, 0.61, 0.65, 0.58])

__Analysis__: Worse output and no real change in the overfitting

## Version 3 of Classifier: Identifying Positives Instead of Negatives

In [11]:
# Grab and process the raw data.
df3 = pd.read_csv('amazon_cells_labelled.txt', sep="\t", header=None)
df3.columns = ['review', 'type']

#Set up the keywords

keywords3 = ["love", "perfect", "exactly", "happy", "will", "repeat", 'definitely', 'amazing', 'wonderful', "10/10",
            "10", 'awesome', 'as expected', 'pleased', 'fulfills', 'recommended', 'impressed', 'works', 
             'reasonable', 'well', 'joy', 'great', 'good']


for key in keywords3:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    df3[str(key)] = df3.review.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )
    

# Positive Reviews are True, negative reviews are False

df3['reviews'] = (df3['type'] == 1)

data3 = df3[keywords3]
target3 = df3['reviews']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb3 = BernoulliNB()

# Fit our model to the data.
bnb3.fit(data3, target3)

# Classify, storing the result in a new variable.
y_pred = bnb3.predict(data3)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data3.shape[0],
    (target3 != y_pred).sum()
))

print("Accuracy : {}%".format(
    bnb3.score(data3, target3) * 100
))

Number of mislabeled points out of a total 1000 points : 374
Accuracy : 62.6%


In [12]:
# Create the confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[354, 146],
       [480,  20]])

In [13]:
# Test your model with different holdout groups.

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.645
Testing on Sample: 0.628


In [14]:
# Cross validation

from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.64, 0.61, 0.65, 0.64, 0.64, 0.54, 0.66, 0.61, 0.65, 0.58])

No great diference between identifying by positive or negative review, so let's go back to the negatives and try to find some commonalities or look at unique words

## Version 4

In [15]:
# Grab and process the raw data.
df4 = pd.read_csv('amazon_cells_labelled.txt', sep="\t", header=None)
df4.columns = ['review', 'type']

#Set up the keywords

keywords4 = ['doesnt', 'however', 'sucks', 'mistake', 'difficult', 'instructions', 'unreliable', 'later', 
             'none', 'week', 'nothing', 'player', 'horrible', 'cant', 'disappointment', 'worst', 'first' 
             'broke', '5', 'support', 'unfortunately', 'stay', 'disappointed', 'junk', 'company', 'terrible'
             'poor', 'anything', 'completely', 'back', 'disappointing', 'old', 'went', 'talk', 'return', 'easily', 
             'useless', 'didnt', 'weak', 'buying', 'waste', 'unit', 'money', 'hate', 'crap', 'bad', 'customer']
            

for key in keywords4:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    df4[str(key)] = df4.review.str.contains(
        ' ' + str(key) + ' ',
        case=False
)
    

# Positive Reviews are True, negative reviews are False

df4['reviews'] = (df4['type'] == 1)

data4 = df4[keywords4]
target4 = df4['reviews']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb4 = BernoulliNB()

# Fit our model to the data.
bnb4.fit(data4, target4)

# Classify, storing the result in a new variable.
y_pred = bnb4.predict(data4)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data4.shape[0],
    (target4 != y_pred).sum()
))

print("Accuracy : {}%".format(
    bnb4.score(data4, target4) * 100
))

Number of mislabeled points out of a total 1000 points : 385
Accuracy : 61.5%


In [16]:
# Create the confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(target4, y_pred)

array([[122, 378],
       [  7, 493]])

So we got a lot better at identifying negative reviews but a loottttt worse about accidentally identifying positive reviews as negative