In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

## Load Naive Bayes Classifier Model

In [2]:
# Fetch the SMS spam corpus and derive boolean features for the classifier.
from sklearn.naive_bayes import BernoulliNB

data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
# The file is tab-separated with no header row: label first, message text second.
sms_raw = pd.read_csv(data_path, sep='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Words treated as spam indicators; each one becomes a boolean feature column.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# The pattern is padded with a space on both sides, so a keyword only counts
# when it has a space immediately before and after it; matching ignores case.
for key in keywords:
    sms_raw[key] = sms_raw['message'].str.contains(f' {key} ', case=False)

# True when str.isupper() holds for the message.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()

# Recode the label column as a boolean: True for 'spam', False otherwise.
sms_raw['spam'] = sms_raw['spam'].eq('spam')

data = sms_raw[[*keywords, 'allcaps']]
target = sms_raw['spam']

# Fit on the full dataset, then predict on the same rows (no held-out split).
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

## Accuracy
Calculate model accuracy.

In [3]:
# Accuracy = share of rows whose predicted label equals the true label,
# expressed as a percentage.
total = len(data)
correct_predictions = target.eq(y_pred).sum()
accuracy = correct_predictions / total * 100
print(f'Accuracy: {accuracy:0.2f}')

Accuracy: 89.16


## Drill
Compute the confusion matrix, specificity, and sensitivity by hand.

In [4]:
# Create confusion matrix
true_positive_count = (target[target == 0] == y_pred[target == 0]).sum()
true_negative_count = (target[target == 1] == y_pred[target == 1]).sum()
false_negative_count = (target[target == 0] != y_pred[target == 0]).sum()
false_positive_count = (target[target == 1] != y_pred[target == 1]).sum()
confusion_matrix = [[true_positive_count, false_negative_count], [false_positive_count, true_negative_count]]
print(f'confusion matrix: {confusion_matrix}')

confusion matrix: [[4770, 55], [549, 198]]


In [5]:
# Compute sensitivity
total_positives = true_negative_count + false_positive_count
sensitivity = (true_negative_count / total_positives) * 100
print(f'sensitivity: {sensitivity:0.1f}%')

sensitivity: 26.5%


In [6]:
# Compute specificity
total_negatives = true_positive_count + false_negative_count
specificity = (true_positive_count / total_negatives) * 100
print(f'specificity: {specificity:0.1f}%')

specificity: 98.9%
