In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.naive_bayes import BernoulliNB

# Download the SMS spam collection: a tab-separated file without a header row.
data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
sms_raw = pd.read_csv(data_path, sep='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Words treated as spam indicators; each becomes one boolean feature column.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    # Case-insensitive match that requires a space on both sides of the keyword.
    spaced_key = ' ' + key + ' '
    sms_raw[key] = sms_raw['message'].str.contains(spaced_key, case=False)

# Extra feature: result of str.isupper() on each message.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()

# Recode the label column as boolean: True where the raw label is 'spam'.
sms_raw['spam'] = sms_raw['spam'].eq('spam')

data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

# Fit Bernoulli naive Bayes, then predict on the same rows it was fit on.
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [42]:
# Report how many rows the model mislabels, as a count and as a percentage
# of all rows.
n_points = data.shape[0]
n_mislabeled = (target != y_pred).sum()
print(
    "Number of mislabeled points out of a total {} points : {} or {:.2f}%".format(
        n_points, n_mislabeled, n_mislabeled / n_points * 100
    )
)

Number of mislabeled points out of a total 5572 points : 604 or 10.84%


In [104]:
def calc_accuracy(conf_matrix):
    """Return the share of counts that lie on the matrix diagonal.

    Parameters
    ----------
    conf_matrix : sequence of sequences of numbers
        Square matrix of counts, indexable as ``conf_matrix[i][i]``.

    Returns
    -------
    float
        Sum of the diagonal entries divided by the sum of all entries.
    """
    diagonal_sum = 0
    grand_total = 0
    for position, row in enumerate(conf_matrix):
        diagonal_sum += row[position]
        grand_total += sum(row)
    return diagonal_sum / grand_total

In [105]:
# Imported here because this is the first cell that calls `confusion_matrix`;
# without it the cell raises NameError on a fresh kernel.
from sklearn.metrics import confusion_matrix

# True labels as a NumPy array.
y_actual = np.array(sms_raw['spam'])

# Accuracy = diagonal of the confusion matrix divided by the total count.
calc_accuracy(confusion_matrix(y_actual, y_pred))

0.8916008614501076

### Confusion Matrix
- Shows four counts: ham predicted as ham (true negatives), ham predicted as spam (false positives), spam predicted as ham (false negatives), and spam predicted as spam (true positives).
- Rows are the actual labels and columns are the predicted labels.

In [147]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true labels against the model's predictions.
confusion_matrix(y_true=target, y_pred=y_pred)

array([[4770,   55],
       [ 549,  198]])

In [148]:
# Preview the first rows of the feature frame.
# NOTE(review): the saved output already lists a `Pred` column, but that column
# is only assigned in a later cell, so this output comes from an out-of-order
# run and would differ after Restart & Run All.
sms_raw.head()

Unnamed: 0,spam,message,click,offer,winner,buy,free,cash,urgent,allcaps,Pred
0,False,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False,False
1,False,Ok lar... Joking wif u oni...,False,False,False,False,False,False,False,False,False
2,True,Free entry in 2 a wkly comp to win FA Cup fina...,False,False,False,False,False,False,False,False,False
3,False,U dun say so early hor... U c already then say...,False,False,False,False,False,False,False,False,False
4,False,"Nah I don't think he goes to usf, he lives aro...",False,False,False,False,False,False,False,False,False


In [149]:
# Display the model's predictions (the array repr is abbreviated in the output).
y_pred

array([False, False, False, ..., False, False, False])

In [150]:
# Store the predictions next to the true labels so both can be filtered
# together by column name.
sms_raw = sms_raw.assign(Pred=y_pred)
sms_raw.head()

Unnamed: 0,spam,message,click,offer,winner,buy,free,cash,urgent,allcaps,Pred
0,False,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False,False
1,False,Ok lar... Joking wif u oni...,False,False,False,False,False,False,False,False,False
2,True,Free entry in 2 a wkly comp to win FA Cup fina...,False,False,False,False,False,False,False,False,False
3,False,U dun say so early hor... U c already then say...,False,False,False,False,False,False,False,False,False
4,False,"Nah I don't think he goes to usf, he lives aro...",False,False,False,False,False,False,False,False,False


## Build Confusion Matrix

In [151]:
def get_conf_matrix(df, actual, pred):
    """Build a 2x2 confusion matrix from two label columns of a frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    actual : str
        Name of the column with the true labels.
    pred : str
        Name of the column with the predicted labels.

    Returns
    -------
    list of list of int
        ``[[tn, fp], [fn, tp]]``: rows are actual (False, True) and columns
        are predicted (False, True).
    """
    actual_true = df[actual] == True
    actual_false = df[actual] == False
    pred_true = df[pred] == True
    pred_false = df[pred] == False

    def count(mask):
        # Number of rows where the mask holds, as a plain Python int.
        return int(mask.sum())

    return [
        [count(actual_false & pred_false), count(actual_false & pred_true)],
        [count(actual_true & pred_false), count(actual_true & pred_true)],
    ]
    

# Hand-built matrix for the true-label column 'spam' and prediction column 'Pred'.
get_conf_matrix(df=sms_raw, actual='spam', pred='Pred')

[[4770, 55], [549, 198]]

## Calculate Sensitivity

In [152]:
# Sensitivity: fraction of actual positives correctly identified.
# For the spam model in this notebook: 198/747.

def calc_sensitivity(df, actual, pred):
    """Return the fraction of actual positives that were predicted positive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both label columns.
    actual : str
        Name of the column with the true labels.
    pred : str
        Name of the column with the predicted labels.

    Returns
    -------
    float
        True positives divided by all actual positives.

    Raises
    ------
    ZeroDivisionError
        If no row of ``df[actual]`` equals True.
    """
    actual_positive = df[actual] == True
    # The denominator is taken from the `actual` column rather than a
    # hard-coded 'spam' column, so any label column name works.
    n_actual_positive = int(actual_positive.sum())
    n_true_positive = int((actual_positive & (df[pred] == True)).sum())
    return n_true_positive / n_actual_positive

In [153]:
# Sensitivity for the true-label column 'spam' and prediction column 'Pred'.
calc_sensitivity(df=sms_raw, actual='spam', pred='Pred')

0.26506024096385544

## Calculate Specificity

In [144]:
# Specificity: fraction of actual negatives correctly identified.
# For the spam model in this notebook: 4770/4825.

def calc_specificity(df, actual, pred):
    """Return the fraction of actual negatives that were predicted negative.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both label columns.
    actual : str
        Name of the column with the true labels.
    pred : str
        Name of the column with the predicted labels.

    Returns
    -------
    float
        True negatives divided by all actual negatives.

    Raises
    ------
    ZeroDivisionError
        If no row of ``df[actual]`` equals False.
    """
    actual_negative = df[actual] == False
    # The denominator is taken from the `actual` column rather than a
    # hard-coded 'spam' column, so any label column name works.
    n_actual_negative = int(actual_negative.sum())
    n_true_negative = int((actual_negative & (df[pred] == False)).sum())
    return n_true_negative / n_actual_negative

In [141]:
# Specificity for the true-label column 'spam' and prediction column 'Pred'.
calc_specificity(df=sms_raw, actual='spam', pred='Pred')

0.9886010362694301