In [1]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
pd.options.display.max_rows = 3000
pd.options.display.max_colwidth = 1000

def my_confusion_matrix(array_Expected,array_Predicted,colName):
    a = np.array(confusion_matrix(array_Expected, array_Predicted ))
    totalExpectedFalse = a[0,0] + a[0,1]
    totalExpectedTrue = a[1,0] + a[1,1]
    correctFalse = a[0,0] 
    correctTrue = a[1,1] 
    correctTruePct = np.round(correctTrue / totalExpectedTrue,3)
    correctFalsePct = np.round(correctFalse / totalExpectedFalse,3)
    print('Regarding {}, model correctly predicted {} Negatives out of {} expected Negatives: {}'.format(
        colName,correctFalse,totalExpectedFalse,correctFalsePct))
    print('Regarding {}, model correctly predicted {} Positives out of {} expected Positives: {}'.format(
        colName,correctTrue,totalExpectedTrue,correctTruePct))    
    
    print(np.array(confusion_matrix(target, y_pred )))


def my_clean_series(df, col):
    df[col] = df[col].str.lower().str.strip()
    df[col] = df[col].str.replace('.','')
    df[col] = df[col].str.replace(",",'')
    df[col] = df[col].str.replace("!",'')
    df[col] = df[col].str.replace("?",'')
    df[col] = df[col].str.replace(";",'')
    df[col] = df[col].str.replace(":",'')
    df[col] = df[col].str.replace("'",'')
    df[col] = df[col].str.replace("-",'')
    df[col] = df[col].str.replace('"','')
    df[col] = df[col].str.replace('(','')
    df[col] = df[col].str.replace(')','')
    return df


def my_list_positivewords(df, colEvaluate, colPosNeg, posValue,  negValue):

    list_positivewords = []
    df.loc[df[colPosNeg] ==posValue, colEvaluate].str.split(' ').apply(list_positivewords.extend)

    #Load list into dataframe so that we can get counts in next step
    df_positive = pd.DataFrame(list_positivewords, columns=['Words'])
    df_positive['Words'] = my_clean_series(df_positive, 'Words')


    #Load counts into new dataframe
    array_positive = [df_positive['Words'].value_counts().index.tolist(), df_positive['Words'].value_counts().values.tolist()]  
    df_freq_positive = pd.DataFrame(array_positive).transpose()
    df_freq_positive.columns = ['Word','Positive_Freq']



    list_negativewords = []
    df.loc[df[colPosNeg] ==negValue, colEvaluate].str.split(' ').apply(list_negativewords.extend)
    df_negative = pd.DataFrame(list_negativewords, columns=['Words'])
    df_negative = my_clean_series(df_negative, 'Words')
    array_negative = [df_negative['Words'].value_counts().index.tolist(), df_negative['Words'].value_counts().values.tolist()]  
    df_freq_negative = pd.DataFrame(array_negative).transpose()
    df_freq_negative.columns = ['Word','Negative_Freq']

    merged = pd.merge(df_freq_negative, df_freq_positive, how='outer', left_on='Word', right_on='Word')
    return merged

In [2]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
sms_raw.head()

Unnamed: 0,spam,message
0,False,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,False,Ok lar... Joking wif u oni...
2,True,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,False,U dun say so early hor... U c already then say...
4,False,"Nah I don't think he goes to usf, he lives around here though"


In [3]:
#Find common positive and negative words to build keywords list

df_freq = my_list_positivewords(sms_raw,'message','spam', True, False)

#df_freq[df_freq.Negative_Freq.isna()]

#df_freq[df_freq.Negative_Freq < df_freq.Positive_Freq].sort_values('Positive_Freq', ascending=False)
df_freq.head()

Unnamed: 0,Word,Negative_Freq,Positive_Freq
0,i,2193,45
1,you,1840,287
2,to,1562,689
3,,1488,121
4,the,1129,204


In [4]:
#Enumerate data
keywords = ['claim','prize','won','guaranteed','tone','award','entry',
            'xxx','sex','horny','nude',
            'weekly','bonus','18+','voucher','expire','win',
            'redeem','free', 'doing', 'later', 'said', 'lol', 'sure'
           ' k ', 'feel', 'went', 'morning', 'didnt', 'she ', 'he ',
            'sure', 'something', 'gonna', 'coming', 'haha', 'lunch',
            'meeting', 'yup', 'job', ' eat', 'dinner', 'watching',
            ' ok ', 'probably', 'actually', 'might', 'hes ', 'shes ',
            'forgot', 'remember', ' thx', 'thanks', 'dunno', 'sorry',
            'dogging','/min','/hr','credit','spree','pobox','po box',
            'singles', '/p', ' per', 
            '16+','£', 'wkly', ' opt', 'ntt', 'bluetooth',
            'couk',' bid', 'cash', 'pm', 'http', 'subscrib',
            'network','ntwk','calls',
            'stop','reply','mobile','&','urgent','contact','win',
            'draw','mins','apply','video',' live', 'sms', 'txt', 'text']


notused = ['claim','prize','won','guaranteed','tone','award','entry',
            'xxx','sex','horny','nude',
            'weekly','bonus','18+','voucher','expire','win',
            'redeem','free', 
            'dogging','/min','/hr','credit','spree','pobox','po box',
            'singles', '/p', ' per', 
            '16+','£', 'wkly', ' opt', 'ntt', 'bluetooth',
            'couk',' bid', 'cash', 'pm', 'http', 'subscrib',
            'network','ntwk','calls',
            'stop','reply','mobile','&','urgent','contact','win',
            'draw','mins','apply','video',' live', 'sms', 'txt', 'text']

notused = ['doing', 'later', 'said', 'lol', 'sure'
           ' k ', 'feel', 'went', 'morning', 'didnt', 'she ', 'he ',
            'sure', 'something', 'gonna', 'coming', 'haha', 'lunch',
            'meeting', 'yup', 'job', ' eat', 'dinner', 'watching',
            ' ok ', 'probably', 'actually', 'might', 'hes ', 'shes ',
            'forgot', 'remember', ' thx', 'thanks', 'dunno', 'sorry',
            ]


for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        str(key),
        case=False
)

sms_raw['allcaps'] = sms_raw.message.str.isupper()
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
model = bnb.fit(data, target)
y_pred = model.predict(data)

In [5]:
#Check cross validation with 10 different holdout groups
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.97849462, 0.96594982, 0.97491039, 0.98028674, 0.97670251,
       0.96229803, 0.97127469, 0.97661871, 0.9676259 , 0.98201439])

In [6]:
#Check performance on Type 1 vs Type 2 errors
my_confusion_matrix(target,y_pred,'Spam')

Regarding Spam, model correctly predicted 4793 Negatives out of 4825 expected Negatives: 0.993
Regarding Spam, model correctly predicted 637 Positives out of 747 expected Positives: 0.853
[[4793   32]
 [ 110  637]]


In [7]:
# Check training values that model got wrong
sms_raw['y_pred'] = y_pred

sms_raw.loc[sms_raw.spam != sms_raw.y_pred, ['message','spam','y_pred']].sort_values('spam').head(5)

Unnamed: 0,message,spam,y_pred
1290,"Hey...Great deal...Farm tour 9am to 5pm $95/pax, $50 deposit by 16 May",False,True
945,"I sent my scores to sophas and i had to do secondary application for a few schools. I think if you are thinking of applying, do a research on cost also. Contact joke ogunrinde, her school is one me the less expensive ones",False,True
4609,We live in the next &lt;#&gt; mins,False,True
1164,"New Theory: Argument wins d SITUATION, but loses the PERSON. So dont argue with ur friends just.. . . . kick them &amp; say, I'm always correct.!",False,True
4471,"Lemme know when I can swing by and pick up, I'm free basically any time after 1 all this semester",False,True


<b>FINDINGS</b>

<u>With both Spam and Ham Keywords</u><br>
Regarding Spam, model correctly predicted 4793 Negatives out of 4825 expected Negatives: 0.993
Regarding Spam,model correctly predicted 637 Positives out of 747 expected Positives: 0.853

array([0.97849462, 0.96594982, 0.97670251, 0.98028674, 0.97491039,
       0.96229803, 0.97127469, 0.97661871, 0.9676259 , 0.98201439])
       
<u>With only Spam Keywords</u><br>
Regarding Spam, model correctly predicted 4790 Negatives out of 4825 expected Negatives: 0.993
Regarding Spam,model correctly predicted 636 Positives out of 747 expected Positives: 0.851

array([0.97849462, 0.96594982, 0.97670251, 0.97849462, 0.97491039,
       0.96588869, 0.97307002, 0.97482014, 0.9676259 , 0.98201439])
       
<u>With only Ham Keywords</u><br>
Regarding Spam, model correctly predicted 4825 Negatives out of 4825 expected Negatives: 1.0
Regarding Spam,model correctly predicted 0 Positives out of 747 expected Positives: 0.0

array([0.8655914 , 0.86379928, 0.8655914 , 0.8655914 , 0.8655914 ,
       0.86535009, 0.86535009, 0.86690647, 0.86690647, 0.86690647])

