In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

ama_path = ("amazon_cells_labelled.txt")
ama = pd.read_csv(ama_path, delimiter= '\t', header=None)
ama.columns = ['message', 'rating']
print(ama)

                                               message  rating
0    So there is no way for me to plug it in here i...       0
1                          Good case, Excellent value.       1
2                               Great for the jawbone.       1
3    Tied to charger for conversations lasting more...       0
4                                    The mic is great.       1
5    I have to jiggle the plug to get it to line up...       0
6    If you have several dozen or several hundred c...       0
7          If you are Razr owner...you must have this!       1
8                  Needless to say, I wasted my money.       0
9                     What a waste of money and time!.       0
10                     And the sound quality is great.       1
11   He was very impressed when going from the orig...       1
12   If the two were seperated by a mere 5+ ft I st...       0
13                            Very good quality though       1
14   The design is very odd, as the ear "clip" is n... 

In [2]:
keywords = ['good', 'excellent', 'great', 'awesome', 'fine', 'highly', 'love']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    ama[str(key)] = ama.message.str.contains(
        str(key) + ' ',
        case=False
    )

In [20]:
newdata = ama[keywords]
target = ama['rating']
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(newdata, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(newdata)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    newdata.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 353


In [21]:
from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(newdata, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(newdata, target).score(newdata, target)))

from sklearn.model_selection import cross_val_score
cross_val_score(bnb, newdata, target, cv=10)

With 20% Holdout: 0.61
Testing on Sample: 0.647


array([0.68, 0.66, 0.72, 0.66, 0.66, 0.64, 0.62, 0.61, 0.65, 0.57])

This is the base result for one set of classifiers. As can be seen the average accuracy is around 62%. I will now try to add
additional features to see if I can make the model more accurate.

In [13]:
newkeywords = ['good', 'awesome', 'fine', 'love', 'acceptable','excellent','exceptional','favorable','great','marvelous','positive','satisfactory','satisfying','superb','valuable','wonderful','ace','boss','bully','capital','choice','crack','nice','pleasing','prime','rad','sound','spanking','sterling','super','superior','welcome','worthy','admirable','agreeable','commendable','congenial','deluxe','first-class','first-rate','gnarly','gratifying','honorable','neat','precious','recherché','reputable','select','shipshape','splendid','stupendous','super-eminent','super-excellent','tip-top','up to snuff']
for key in newkeywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    ama[str(key)] = ama.message.str.contains(
        str(key) + ' ',
        case=False
    )
    
newdata = ama[newkeywords]
target = ama['rating']
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(newdata, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(newdata)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    newdata.shape[0],
    (target != y_pred).sum()
))

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(newdata, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(newdata, target).score(newdata, target)))

from sklearn.model_selection import cross_val_score
cross_val_score(bnb, newdata, target, cv=10)

Number of mislabeled points out of a total 1000 points : 343
With 20% Holdout: 0.615
Testing on Sample: 0.657


array([0.7 , 0.69, 0.73, 0.66, 0.62, 0.65, 0.64, 0.59, 0.63, 0.58])

For this example I tried modifying the keywords to have a larger variety (by adding more words using words synonymous with good). As can be seen the amount of mislabed points only decreased slightly and the accuracy only increased slightly as well. This can be used as a indicator that first classifier set was already quite accurate with its amount of keywords it had and adding more keywords would not be the best way to attain a higher accuracy. Now lets test if having less keywords besides good 

In [14]:
newkeywords = ['good']
for key in newkeywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    ama[str(key)] = ama.message.str.contains(
        str(key) + ' ',
        case=False
    )

newdata = ama[newkeywords]
target = ama['rating']
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(newdata, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(newdata)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    newdata.shape[0],
    (target != y_pred).sum()
))

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(newdata, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(newdata, target).score(newdata, target)))

from sklearn.model_selection import cross_val_score
cross_val_score(bnb, newdata, target, cv=10)

Number of mislabeled points out of a total 1000 points : 461
With 20% Holdout: 0.53
Testing on Sample: 0.539


array([0.52, 0.56, 0.56, 0.57, 0.51, 0.56, 0.51, 0.5 , 0.58, 0.52])

For this I tried using only 'good' in the keywords which still resulted in about a 53% success rate in detecting that it was a positive review. 

In [23]:
newkeywords = ['']
for key in newkeywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    ama[str(key)] = ama.message.str.contains(
        str(key) + ' ',
        case=False
    )

newdata = ama[newkeywords]
target = ama['rating']
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(newdata, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(newdata)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    newdata.shape[0],
    (target != y_pred).sum()
))

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(newdata, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(newdata, target).score(newdata, target)))

from sklearn.model_selection import cross_val_score
cross_val_score(bnb, newdata, target, cv=10)

Number of mislabeled points out of a total 1000 points : 500
With 20% Holdout: 0.475
Testing on Sample: 0.5


array([0.5 , 0.49, 0.5 , 0.5 , 0.49, 0.5 , 0.5 , 0.5 , 0.49, 0.49])

Do any of your classifiers seem to overfit?
I'd say currently none of the classifiers seem to overfit as there are no specific features which would not be able to work with other reviews, but if one of them were to overfit it would definently be the one with the most keywords. 

Which seem to perform the best? Why?
The one that performed the best would be the one with the most keywords as it has the largest range of words to match for a positive result. 

Which features seemed to be most impactful to performance?
Name the keyword 'good', seems to be the most impactful as it was able to detect roughly 50% of all the positive reviews by itself. 