In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.naive_bayes import BernoulliNB

# Fetch the raw corpus: a tab-separated file with no header row, read as
# two columns (label, message text).
data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
sms_raw = pd.read_csv(data_path, sep='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Words treated as spam indicators; each becomes one boolean feature column.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    # The pattern is padded with a space on each side, so a keyword only
    # matches when it is surrounded by spaces (case-insensitively). A keyword
    # at the very start/end of a message or next to punctuation is not flagged.
    sms_raw[key] = sms_raw['message'].str.contains(
        ' {} '.format(key),
        case=False,
    )

# Extra feature: True when the message's cased characters are all uppercase.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()

# Turn the text label into a boolean outcome: True for 'spam', False otherwise.
sms_raw['spam'] = sms_raw['spam'].eq('spam')

data = sms_raw.loc[:, keywords + ['allcaps']]
target = sms_raw['spam']

# Fit a Bernoulli naive Bayes model and predict on the same rows it was
# trained on, so every metric derived from y_pred is in-sample.
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [9]:
# Tally how many messages the model labelled as not-spam (False) and spam (True).
pred_tot_No = (y_pred == False).sum()
pred_tot_Yes = (y_pred == True).sum()

In [16]:
# Tally the true class sizes: messages whose label is not-spam (False) and spam (True).
actual_tot_No = (target == False).sum()
actual_tot_Yes = (target == True).sum()

In [19]:
def manually_calculate_tp_tn_fp_fn(test_y, test_predictions):
    """Manually create a confusion matrix by comparing predictions with answers.

    Labels and predictions are paired by *position*, not by index label, so a
    pandas Series with a non-default index (e.g. the result of a train/test
    split or a filter) is handled the same way as a list or NumPy array.

    Args:
        test_y: Sized iterable of true labels. Values equal to 0/False count
            as negative, values equal to 1/True count as positive.
        test_predictions: Sized iterable of predicted labels, same length and
            same encoding as ``test_y``.

    Returns:
        A nested list ``[[TN, FP], [FN, TP]]`` of plain ints. Pairs in which
        either value is neither 0 nor 1 are not counted in any cell.

    Raises:
        ValueError: If the two inputs do not have the same length.
    """
    if len(test_y) != len(test_predictions):
        raise ValueError(
            "test_y and test_predictions must have the same length "
            "(got {} and {})".format(len(test_y), len(test_predictions))
        )

    TP = 0
    TN = 0
    FP = 0
    FN = 0

    # zip() walks both inputs positionally; iterating a pandas Series yields
    # its values, which avoids label-based lookups such as test_y[x].
    for actual, predicted in zip(test_y, test_predictions):
        if actual == 0 and predicted == 0:
            TN += 1
        elif actual == 0 and predicted == 1:
            FP += 1
        elif actual == 1 and predicted == 1:
            TP += 1
        elif actual == 1 and predicted == 0:
            FN += 1

    cm = [[TN, FP], [FN, TP]]
    return cm

In [24]:
# Confusion matrix of the in-sample predictions, laid out as [[TN, FP], [FN, TP]].
manual = manually_calculate_tp_tn_fp_fn(test_y=target, test_predictions=y_pred)
print(manual)

[[4770, 55], [549, 198]]


In [27]:
def accuracy(cm):
    """Return classification accuracy, as a percentage, from a confusion matrix.

    Args:
        cm: Square confusion matrix as a nested sequence of counts, e.g.
            ``[[TN, FP], [FN, TP]]`` for the binary case. Entry ``cm[i][j]``
            is the number of samples of true class ``i`` predicted as ``j``.

    Returns:
        ``100 * (sum of the diagonal) / (sum of all entries)`` as a float.

    Raises:
        ZeroDivisionError: If the matrix contains no samples (all counts 0).
    """
    # Use the matrix that was passed in (not a notebook-level global), so the
    # function gives the right answer for whichever matrix the caller supplies.
    correct = sum(cm[i][i] for i in range(len(cm)))
    total = sum(sum(row) for row in cm)
    return (correct / total) * 100

In [28]:
# Report the share of in-sample predictions that match the true labels.
print('Accuracy: {}%'.format(accuracy(cm=manual)))

Accuracy: 89.16008614501077%
