In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
)

sms_raw['allcaps'] = sms_raw.message.str.isupper()
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)

In [7]:
y_pred

array([False, False, False, ..., False, False, False])

In [11]:
sms_raw['prediction'] = y_pred; sms_raw.head()

Unnamed: 0,spam,message,click,offer,winner,buy,free,cash,urgent,allcaps,prediction
0,False,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False,False
1,False,Ok lar... Joking wif u oni...,False,False,False,False,False,False,False,False,False
2,True,Free entry in 2 a wkly comp to win FA Cup fina...,False,False,False,False,False,False,False,False,False
3,False,U dun say so early hor... U c already then say...,False,False,False,False,False,False,False,False,False
4,False,"Nah I don't think he goes to usf, he lives aro...",False,False,False,False,False,False,False,False,False


##### Compare spam v. prediction for Accuracy

In [20]:
sms_raw['accuracy'] = sms_raw[['spam','prediction']].apply(
            lambda x: True if x['spam'] == x['prediction'] else False, axis=1
)

In [38]:
incorrect = sms_raw['accuracy'].value_counts()[0]
correct = sms_raw['accuracy'].value_counts()[1]

In [43]:
correct / (incorrect + correct) * 100

89.16008614501077

In [47]:
sms_raw['prediction'].value_counts()

False    5319
True      253
Name: prediction, dtype: int64

In [69]:
#type I/false positive

sms_spam = sms_raw[sms_raw['spam'] == False]
false_pos = sms_spam['accuracy'].value_counts()

#type II/false negative
sms_spam = sms_raw[sms_raw['spam'] == True]
false_neg = sms_spam['accuracy'].value_counts()

##### Create confusion matrix

In [68]:
np.array([[false_pos[1],false_pos[0]],
          [false_neg[0],false_neg[1]]])

array([[4770,   55],
       [ 549,  198]])