# Challenge: Iterate and evaluate your classifier

In [34]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import gutenberg, stopwords
from nltk.tokenize import word_tokenize
import spacy
from collections import Counter
from textblob import TextBlob
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
data_path = ('yelp_labelled.txt')
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['message','like']

In [3]:
stop = stopwords.words('english')

In [None]:
sms_raw.head()

In [None]:
sms_raw.describe()

In [4]:
sms_raw['message'] = sms_raw['message'].str.lower()

In [5]:
sms_raw1 = sms_raw.copy()
sms_raw2 = sms_raw.copy()
sms_raw3 = sms_raw.copy()
sms_raw4 = sms_raw.copy()
sms_raw5 = sms_raw.copy()

In [6]:
bad_reviews_df = sms_raw[sms_raw['like'] == 0]
good_reviews_df = sms_raw[sms_raw['like'] == 1]

In [7]:
# remove stop words
bad_reviews_df = sms_raw[sms_raw['like'] == 0]
bad_reviews = bad_reviews_df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
good_reviews_df = sms_raw[sms_raw['like'] == 1]
good_reviews = good_reviews_df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [8]:
# split the words
bad_reviews = bad_reviews.str.split(' ')
good_reviews = good_reviews.str.split(' ')

In [9]:
list(good_reviews)

[['wow...', 'loved', 'place.'],
 ['stopped',
  'late',
  'may',
  'bank',
  'holiday',
  'rick',
  'steve',
  'recommendation',
  'loved',
  'it.'],
 ['selection', 'menu', 'great', 'prices.'],
 ['fries', 'great', 'too.'],
 ['great', 'touch.'],
 ['service', 'prompt.'],
 ['tried', 'cape', 'cod', 'ravoli,', 'chicken,with', 'cranberry...mmmm!'],
 ['highly', 'recommended.'],
 ['food,', 'amazing.'],
 ['service', 'also', 'cute.'],
 ['could', 'care', 'less...', 'interior', 'beautiful.'],
 ['performed.'],
 ["that's",
  'right....the',
  'red',
  'velvet',
  'cake.....ohhh',
  'stuff',
  'good.'],
 ['hole',
  'wall',
  'great',
  'mexican',
  'street',
  'tacos,',
  'friendly',
  'staff.'],
 ['also',
  'combos',
  'like',
  'burger,',
  'fries,',
  'beer',
  '23',
  'decent',
  'deal.'],
 ['found', 'place', 'accident', 'could', 'happier.'],
 ['overall,', 'like', 'place', 'lot.'],
 ['redeeming', 'quality', 'restaurant', 'inexpensive.'],
 ['ample', 'portions', 'good', 'prices.'],
 ['first', 'visit

In [10]:
keywords = ['fun', 'wonderful', 'loved', 'amazing', 'delicious', 'great', 'good']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [None]:
sns.heatmap(sms_raw.corr())

In [12]:
data = sms_raw[keywords]
target = sms_raw['like']

In [13]:
# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 410


In [14]:
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.585
Testing on Sample: 0.59


In [41]:
confusion_matrix(target, y_pred)

array([[485,  15],
       [395, 105]], dtype=int64)

In [15]:
new_keywords1 = ['fun', 'wonderful', 'loved', 'amazing', 'delicious', 'great', 'good']
new_keywords2 = ['loved', 'amazing', 'delicious', 'great', 'like']
new_keywords3 = ['tasty', 'wonderful', 'delicious', 'great', 'best']
new_keywords4 = ['loved', 'inexpensive', 'bargain', 'clean', 'like']
new_keywords5 = ['fresh', 'authentic', 'bargain', 'clean', 'good']

def addkey(text, keywords):
    for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
        text[str(key)] = text.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [16]:
addkey(sms_raw1, new_keywords1)
addkey(sms_raw2, new_keywords2)
addkey(sms_raw3, new_keywords3)
addkey(sms_raw4, new_keywords4)
addkey(sms_raw5, new_keywords5)

sms_raw3.head()

In [18]:
data1 = sms_raw1[new_keywords1]
data2 = sms_raw2[new_keywords2]
data3 = sms_raw3[new_keywords3]
data4 = sms_raw4[new_keywords4]
data5 = sms_raw5[new_keywords5]

In [30]:
bnb.fit(data2, target)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data2, target).score(data2, target)))

With 20% Holdout: 0.585
Testing on Sample: 0.555


In [35]:
y_pred2 = bnb.predict(data2)
confusion_matrix(target, y_pred2)

array([[471,  29],
       [434,  66]], dtype=int64)

In [31]:
bnb.fit(data3, target)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data3, target).score(data3, target)))

With 20% Holdout: 0.585
Testing on Sample: 0.559


In [36]:
y_pred3 = bnb.predict(data3)
confusion_matrix(target, y_pred3)

array([[496,   4],
       [442,  58]], dtype=int64)

In [32]:
bnb.fit(data4, target)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data4, target).score(data4, target)))

With 20% Holdout: 0.585
Testing on Sample: 0.514


In [37]:
y_pred4 = bnb.predict(data4)
confusion_matrix(target, y_pred4)

array([[470,  30],
       [474,  26]], dtype=int64)

In [33]:
bnb.fit(data5, target)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data5, target).score(data5, target)))

With 20% Holdout: 0.585
Testing on Sample: 0.537


In [38]:
y_pred5 = bnb.predict(data5)
confusion_matrix(target, y_pred5)

array([[481,  19],
       [444,  56]], dtype=int64)

#### Do any of your classifiers seem to overfit?

data4 most was overfit with at least one of it's features

#### Which seem to perform the best? Why?

data1, I gave the first keyword group 2 more keywords, I believe that was a factor in getting better results. To get more accurate results I would likely need to add many more keyboards

#### Which features seemed to be most impactful to performance?

good was helpful, in the keywords I used it in I found very few false positives