# Filterting Spam with Naive Bayes

This project will go through 5,572 SMS messages and utilize Naive Bayes to predict if a message is a spam or not.

In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

spam = pd.read_csv('SMSSpamCollection', sep = '\t', header = None, names = ['Label', 'SMS'])

print(spam.shape)
spam.head()


(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
n_spam = round(spam['Label'].value_counts(normalize=True)[0]*100,2)
n_ham = round(spam['Label'].value_counts(normalize=True)[1]*100,2)

print('Percentage of Spam: {0}% \nPercentage of Non-Spam: {1}%'.format(n_spam,n_ham))

Percentage of Spam: 86.59% 
Percentage of Non-Spam: 13.41%


# Data Split

In [3]:
random_spam = spam.sample(frac=1, random_state = 1)
split = int(round(spam.shape[0] * 0.8,0))

train_spam = random_spam[:split].reset_index(drop=True)
test_spam = random_spam[split:].reset_index(drop=True)

In [4]:
n_train_spam = round(train_spam['Label'].value_counts(normalize=True)[0]*100,2)
n_train_ham = round(train_spam['Label'].value_counts(normalize=True)[1]*100,2)

print('Number of Rows: {0}\nPercentage of Spam for the training set: {1}% \nPercentage of Non-Spam for the training set: {2}%\n'.format(train_spam.shape[0], n_train_spam,n_train_ham))

n_test_spam = round(test_spam['Label'].value_counts(normalize=True)[0]*100,2)
n_test_ham = round(test_spam['Label'].value_counts(normalize=True)[1]*100,2)

print('Number of Rows: {0}\nPercentage of Spam for the test set: {1}% \nPercentage of Non-Spam for the test set: {2}%'.format(test_spam.shape[0],n_test_spam,n_test_ham))

Number of Rows: 4458
Percentage of Spam for the training set: 86.54% 
Percentage of Non-Spam for the training set: 13.46%

Number of Rows: 1114
Percentage of Spam for the test set: 86.8% 
Percentage of Non-Spam for the test set: 13.2%


# Data Cleaning

In [5]:
#Before Cleaning 
train_spam.head()

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [6]:
import re

train_spam['SMS'] = train_spam['SMS'].apply(lambda x: re.sub('\W',' ',x.lower()))
test_spam['SMS'] = test_spam['SMS'].apply(lambda x: re.sub('\W',' ',x.lower()))

#After Cleaning
train_spam.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [7]:
train_spam['SMS'] = train_spam['SMS'].str.split()
vocabulary = []

for i in train_spam['SMS']:
    for word in i:
        vocabulary.append(word)
    
vocabulary = list(set(vocabulary))

In [8]:
print('There are {0} unique words'.format(len(vocabulary)))

There are 7783 unique words


In [9]:
word_counts_per_sms = {unique_word: [0] * len(train_spam['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(train_spam['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [10]:
df = pd.DataFrame(word_counts_per_sms)
df.head()

Unnamed: 0,polys,yo,ldnw15h,telling,i,tho,min,suddenly,supreme,hogolo,...,21st,tix,shaking,wesleys,50award,pouts,atlast,oso,ham,amanda
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
train_spam_clean = pd.concat([train_spam, df], axis=1)
train_spam_clean.head()

Unnamed: 0,Label,SMS,polys,yo,ldnw15h,telling,i,tho,min,suddenly,...,21st,tix,shaking,wesleys,50award,pouts,atlast,oso,ham,amanda
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Naive Bayes Calculation

Here, we will use multinomial Naive Baynes to calculate the probability of a spam. We will also use Laplace smoothing and set **α** = 1

The two equations we are going to use to determine if a SMS message is a spam or not are:

\begin{equation}
P(Spam | w_1,w_2, ..., w_n) \propto P(Spam) \cdot \prod_{i=1}^{n}P(w_i|Spam) \\\
P(Ham | w_1,w_2, ..., w_n) \propto P(Ham) \cdot \prod_{i=1}^{n}P(w_i|Ham)
\end{equation}

where...


\begin{equation}
P(w_i|Spam) = \frac{N_{w_i|Spam} + \alpha}{N_{Spam} + \alpha \cdot N_{Vocabulary}} \\\
P(w_i|Ham) = \frac{N_{w_i|Ham} + \alpha}{N_{Ham} + \alpha \cdot N_{Vocabulary}}
\end{equation}




In [12]:
p_spam = train_spam_clean['Label'].value_counts(normalize=True)[0]
p_ham = train_spam_clean['Label'].value_counts(normalize=True)[1]
n_spam = train_spam_clean[train_spam_clean['Label'] == 'spam'].iloc[:, 2:].sum().sum()
n_ham = train_spam_clean[train_spam_clean['Label'] == 'ham'].iloc[:, 2:].sum().sum()
n_vocabulary = len(vocabulary)

In [13]:
def calc_params(a=1):
    # Initiate parameters
    parameters_spam = {unique_word:0 for unique_word in vocabulary}
    parameters_ham = {unique_word:0 for unique_word in vocabulary}
    
    #split to spam and ham
    spam = train_spam_clean[train_spam_clean['Label'] == 'spam']
    ham = train_spam_clean[train_spam_clean['Label'] == 'ham']
    
    for v in vocabulary:
        wi_given_spam = spam[v].sum()
        parameters_spam[v] = (wi_given_spam+a)/(n_spam + a*n_vocabulary)
        
        wi_given_ham = ham[v].sum()
        parameters_ham[v] = (wi_given_ham+a)/(n_ham + a*n_vocabulary)
    
    return(parameters_spam, parameters_ham)

In [14]:
parameters_spam, parameters_ham = calc_params()

# Predicting on New Messages

In [15]:
import re

def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        else:
            next
            
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
        else:
            next
        


    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [16]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

'spam'

In [17]:
classify("Sounds good, Tom, then see u there")

'ham'

In [18]:
test_spam['predicted'] = test_spam['SMS'].apply(classify)

# Measure Accuracy

In [19]:
correct = 0
total = test_spam.shape[0]

for i,s in test_spam.iterrows():
    if s['Label'] == s['predicted']:
        correct += 1
        
print('Accuracy Rate: {0}'.format(correct/total))

Accuracy Rate: 0.952423698384201
