In [10]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [11]:
# Download the raw SMS spam collection: a tab-separated file with no header row.
data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
sms_raw = pd.read_csv(data_path, sep='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Words we treat as indicators of spam.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# Add one boolean feature column per keyword. The keyword is padded with a
# space on each side, so it only matches (case-insensitively) when it appears
# with a space directly before and after it in the message.
for key in keywords:
    pattern = ' ' + str(key) + ' '
    sms_raw[str(key)] = sms_raw.message.str.contains(pattern, case=False)

# Extra feature: True when the message is entirely upper case.
sms_raw['allcaps'] = sms_raw.message.str.isupper()

# Turn the label column into a boolean (True for 'spam').
sms_raw['spam'] = (sms_raw['spam'] == 'spam')

# Feature matrix and target vector used by the model.
feature_columns = keywords + ['allcaps']
data = sms_raw[feature_columns]
target = sms_raw['spam']

# Fit a Bernoulli naive Bayes classifier on the full data set and predict the
# same rows back (i.e. this measures training-set performance).
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [12]:
# Report how many rows the model labels correctly when it is evaluated on the
# same data it was trained on.
n_total = data.shape[0]
n_correct = (target == y_pred).sum()
print(
    "Number of properly labeled points out of a total {} points : {}.  This equates to an accuracy of {}%".format(
        n_total,
        n_correct,
        round(n_correct / n_total * 100, 2),
    )
)

Number of properly labeled points out of a total 5572 points : 4968.  This equates to an accuracy of 89.16%


On the complete data set, training and evaluating on the same rows, the accuracy was about 89%.  Next, we check this result with cross-validation.

In [105]:
# Make 3 folds of the data of equal size, each a run of consecutive rows.
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0, so the folds
# are taken with positional `.iloc` slicing instead.
features = sms_raw[keywords + ['allcaps']]
labels = sms_raw['spam']

# Integer division keeps the folds the same size; any remainder rows at the
# end of the data are left out of every fold. For the 5572-row data set this
# gives 1857 rows per fold (rows 0-1856, 1857-3713, 3714-5570), which are the
# same rows the original hard-coded bounds selected.
fold_size = len(sms_raw) // 3

fold_one = features.iloc[:fold_size]
fold_two = features.iloc[fold_size:2 * fold_size]
fold_three = features.iloc[2 * fold_size:3 * fold_size]

# Matching targets for the 3 folds of data.
fold_one_tar = labels.iloc[:fold_size]
fold_two_tar = labels.iloc[fold_size:2 * fold_size]
fold_three_tar = labels.iloc[2 * fold_size:3 * fold_size]

In [106]:
# Use fold 1 as the training fold and test on the other two folds.
# The model only needs to be fitted once; the original refitted it on the
# same data before each prediction, which produced the same model twice.
bnb.fit(fold_one, fold_one_tar)
fold1_test_2 = bnb.predict(fold_two)
fold1_test_3 = bnb.predict(fold_three)

In [108]:
def checktest(data, target, prediction):
    """Print how many predictions match the true labels, and the accuracy.

    Parameters
    ----------
    data : object with a ``shape`` attribute
        The evaluated rows; only ``data.shape[0]`` (the row count) is used.
    target : array-like
        True labels, compared element-wise against ``prediction``.
    prediction : array-like
        Predicted labels for the same rows.

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    total = data.shape[0]
    n_correct = (target == prediction).sum()
    # Accuracy as a percentage, rounded to two decimal places.
    accuracy_pct = round(n_correct / total * 100, 2)
    template = "Number of properly labeled points out of a total {} points : {}.  This equates to an accuracy of {}%"
    print(template.format(total, n_correct, accuracy_pct))

In [109]:
# Score the model trained on fold 1 against fold 2.
checktest(fold_two, fold_two_tar, fold1_test_2)

Number of properly labeled points out of a total 1857 points : 1667.  This equates to an accuracy of 89.77%


In [110]:
# Score the model trained on fold 1 against fold 3.
checktest(fold_three, fold_three_tar, fold1_test_3)

Number of properly labeled points out of a total 1857 points : 1644.  This equates to an accuracy of 88.53%


In the first round of testing, using fold one as the training data, we achieved roughly 89% accuracy on the other two folds.  Next, we use fold two as the training data.

In [115]:
# Use fold 2 as the training fold and test on the other two folds.
# The model only needs to be fitted once; the original refitted it on the
# same data before each prediction, which produced the same model twice.
bnb.fit(fold_two, fold_two_tar)
fold2_test_1 = bnb.predict(fold_one)
fold2_test_3 = bnb.predict(fold_three)

In [116]:
# Score the model trained on fold 2 against fold 1.
checktest(fold_one, fold_one_tar, fold2_test_1)

Number of properly labeled points out of a total 1857 points : 1651.  This equates to an accuracy of 88.91%


In [117]:
# Score the model trained on fold 2 against fold 3.
checktest(fold_three, fold_three_tar, fold2_test_3)

Number of properly labeled points out of a total 1857 points : 1645.  This equates to an accuracy of 88.58%


In the second round of testing we again achieved roughly 89% accuracy. Next, we cross-validate a third time, with fold three as the training data.

In [118]:
# Use fold 3 as the training fold and test on the other two folds.
# The model only needs to be fitted once; the original refitted it on the
# same data before each prediction, which produced the same model twice.
bnb.fit(fold_three, fold_three_tar)
fold3_test_1 = bnb.predict(fold_one)
fold3_test_2 = bnb.predict(fold_two)

In [119]:
# Score the model trained on fold 3 against fold 1.
checktest(fold_one, fold_one_tar, fold3_test_1)

Number of properly labeled points out of a total 1857 points : 1651.  This equates to an accuracy of 88.91%


In [120]:
# Score the model trained on fold 3 against fold 2.
checktest(fold_two, fold_two_tar, fold3_test_2)

Number of properly labeled points out of a total 1857 points : 1669.  This equates to an accuracy of 89.88%


In every cross-validation test we achieved roughly 89% accuracy (between 88.5% and 89.9%), which suggests that no particular portion of the data is skewing the results.