In [None]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )

sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
)

sms_raw['allcaps'] = sms_raw.message.str.isupper()
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)

In [88]:
print("Number of properly labeled points out of a total {} points : {}.  This equates to an accuracy of {}%".format(
    data.shape[0],
    (target == y_pred).sum(),
    round((target == y_pred).sum()/data.shape[0]*100, 2)
))

Number of properly labeled points out of a total 5572 points : 4968.  This equates to an accuracy of 89.16%


On the complete set of data the accuracy was 89%.  Checking with holdout groups now.

In [78]:
#Holdout Group 1, used for training the algorithm
sms_holdout1 = sms_raw[:2000]
training_data = sms_holdout1[keywords + ['allcaps']]
target_data= sms_holdout1['spam'] 

#Holdout Group 2, used for first test of the trained algorithm
sms_holdout2 = sms_raw[2001:4000]
data2=sms_holdout2[keywords + ['allcaps']]
data2_target = sms_holdout2['spam'] 

#Holdout Group 2, used for second test of the trained algorithm
sms_holdout3 = sms_raw[4001:5572]
data3 = sms_holdout3[keywords + ['allcaps']]
data3_target = sms_holdout3['spam'] 

In [79]:
t_pred = bnb.fit(training_data, target_data).predict(training_data)

In [80]:
print("Number of properly labeled points out of a total {} points : {}.  This equates to an accuracy of {}%".format(
    training_data.shape[0],
    (target_data == t_pred).sum(),
    round((target_data == t_pred).sum()/training_data.shape[0]*100, 2)
))

Number of properly labeled points out of a total 2000 points : 1783.  This equates to an accuracy of 89.15%


<h4>Now that we have trained the algorthm and tested it on the training data lets take a look at our two holdout groups.</h4>

In [81]:
test1 = bnb.fit(training_data, target_data).predict(data2)


In [82]:
print("Number of properly labeled points out of a total {} points : {}.  This equates to an accuracy of {}%".format(
    data2.shape[0],
    (data2_target == test1).sum(),
    round((data2_target == test1).sum()/data2.shape[0]*100, 2)
))

Number of properly labeled points out of a total 1999 points : 1785.  This equates to an accuracy of 89.29%


<h4>Test group 1 looks to match the results from our training data, so lets take a look at test group 2.</h4>

In [83]:
test2 = bnb.fit(training_data, target_data).predict(data3)

In [84]:
print("Number of properly labeled points out of a total {} points : {}.  This equates to an accuracy of {}%".format(
    data3.shape[0],
    (data3_target == test2).sum(),
    round((data3_target == test2).sum()/data3.shape[0]*100, 2)
))

Number of properly labeled points out of a total 1571 points : 1395.  This equates to an accuracy of 88.8%


<h4>It seems that our trained algorithm works roughly as well on our tested data as it does on the testing data we held back.</h4>
