# Building a Spam Filter with Naive Bayes
In this project, we'll build a spam filter for SMS messages using the naive Bayes algorithm. We work with a dataset of 5,572 SMS messages that have already been classified by humans as spam or ham (non-spam). The dataset was put together by Tiago A. Almeida and José María Gómez Hidalgo.

In [1]:
# Import the dataset
import pandas as pd

# The file is tab-separated and has no header row, so the two column
# names are passed in explicitly.
data = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "SMS"],
)

# Number of rows/columns, followed by the first few messages
print(data.shape)

print(data.head())

# Relative frequency of each label (ham vs. spam) in the full dataset
print(data["label"].value_counts(normalize=True))

(5572, 2)
  label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
ham     0.865937
spam    0.134063
Name: label, dtype: float64


In [2]:
import numpy as np

# Shuffle all rows once; the fixed random_state makes the shuffle repeatable
data_sampled = data.sample(frac=1, random_state=1)

# Split into training and test sets (roughly 80% / 20%)
size = int(np.floor(len(data) * 0.8))
split_at = size + 1  # position of the first row that goes to the test set
data_training = data_sampled.iloc[:split_at].reset_index(drop=True)
data_test = data_sampled.iloc[split_at:].reset_index(drop=True)

print(data_training.shape)
print(data_test.shape)

(4458, 2)
(1114, 2)


In [3]:
# Print the ham/spam proportions of the training set, then of the test set
for subset in (data_training, data_test):
    print(subset["label"].value_counts(normalize=True))

ham     0.86541
spam    0.13459
Name: label, dtype: float64
ham     0.868043
spam    0.131957
Name: label, dtype: float64


In [4]:
# Clean the message text before it is split into words:
#   1. replace every non-word character (punctuation, symbols) with a space
#   2. convert everything to lower case
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave punctuation in place.
# The raw string keeps `\W` from being read as an (invalid) string escape.
data_training["SMS"] = (
    data_training["SMS"]
    .str.replace(r"\W", " ", regex=True)
    .str.lower()
)

data_training.head()

Unnamed: 0,label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [5]:
# Turn every message into a list of words.
# NOTE: this overwrites the column in place, so re-running this cell would
# apply `.str.split()` to lists instead of strings.
data_training["SMS"] = data_training["SMS"].str.split()

# Collect each distinct word exactly once. Sorting makes the order identical on
# every run; the iteration order of a plain set of strings can change between
# Python processes because string hashing is randomised.
vocabulary = sorted({word for sms in data_training["SMS"] for word in sms})

print(len(vocabulary))

7783


In [6]:
# Show the first five entries of the SMS column
print(data_training["SMS"].head(n=5))

0                    [yep, by, the, pretty, sculpture]
1    [yes, princess, are, you, going, to, make, me,...
2                      [welp, apparently, he, retired]
3                                             [havent]
4    [i, forgot, 2, ask, ü, all, smth, there, s, a,...
Name: SMS, dtype: object


In [7]:
# Count how often each vocabulary word occurs in each training message.
# Every word gets its own list with one zero-initialised slot per message.
n_messages = len(data_training['SMS'])
word_counts_per_sms = {}
for unique_word in vocabulary:
    word_counts_per_sms[unique_word] = [0] * n_messages

# Walk through the messages and bump the slot of every word that appears
for row_number, tokens in enumerate(data_training['SMS']):
    for token in tokens:
        word_counts_per_sms[token][row_number] += 1

# One row per message, one column per vocabulary word
word_counts_df = pd.DataFrame(word_counts_per_sms)
print(word_counts_df.shape)
word_counts_df.head()

(4458, 7783)


Unnamed: 0,0,00,000,000pes,008704050406,0089,01223585334,02,0207,02072069400,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [8]:
# Place the word-count columns to the right of the label/SMS columns
frames_side_by_side = [data_training, word_counts_df]
data_training_cleaned = pd.concat(frames_side_by_side, axis="columns")
print(data_training_cleaned.shape)
data_training_cleaned.head()

(4458, 7785)


Unnamed: 0,label,SMS,0,00,000,000pes,008704050406,0089,01223585334,02,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [9]:
# Number of rows whose label is missing
int(data_training_cleaned["label"].isna().sum())

0

In [10]:
# Prior probabilities P(Spam) and P(Ham): the share of each label in the training set.
# The shares are looked up by label rather than by position, because the position
# of a class in `value_counts` depends on which class is more frequent, and
# position-based `[]` access on a label-indexed Series is deprecated in recent pandas.
label_shares = data_training_cleaned["label"].value_counts(normalize=True)
p_spam = label_shares["spam"]
p_ham = label_shares["ham"]
print(p_spam)
print(p_ham)

0.13458950201884254
0.8654104979811574


In [11]:
# Separate the training rows by class
is_spam = data_training_cleaned["label"] == "spam"
is_ham = data_training_cleaned["label"] == "ham"
spam_rows = data_training_cleaned[is_spam]
ham_rows = data_training_cleaned[is_ham]

# Length of each message (in words), then the total word count per class
spam_words_len = spam_rows["SMS"].apply(len)
ham_words_len = ham_rows["SMS"].apply(len)
n_spam = spam_words_len.sum()
n_ham = ham_words_len.sum()

# Number of distinct words in the vocabulary
n_voc = len(vocabulary)

# Set the smoothing parameter
alpha = 1

In [12]:
# Smoothed likelihood of every vocabulary word under each class:
#   P(word | class) = (occurrences of word in class + alpha)
#                     / (total words in class + alpha * vocabulary size)
# The per-word totals are computed for all vocabulary columns in one vectorised
# column sum per class instead of selecting one column per loop iteration.
spam_word_totals = spam_rows[vocabulary].sum()
ham_word_totals = ham_rows[vocabulary].sum()

# Each dict maps word -> P(word | class), keyed in vocabulary order
dict_word_spam = ((spam_word_totals + alpha) / (n_spam + alpha * n_voc)).to_dict()
dict_word_ham = ((ham_word_totals + alpha) / (n_ham + alpha * n_voc)).to_dict()

In [13]:
# Write the function to filter the spam SMS
import re

def classify(message):
    """Print the naive Bayes scores of `message` for both classes and the chosen label.

    The message has every non-word character replaced by a space, is lower-cased
    and split on whitespace. Words that are missing from `dict_word_spam` /
    `dict_word_ham` are skipped.

    Parameters
    ----------
    message : str
        Raw SMS text.

    Returns
    -------
    None
        The two scores and the label are printed, not returned.

    Notes
    -----
    Reads the module-level `p_spam`, `p_ham`, `dict_word_spam` and `dict_word_ham`.
    The printed scores are the prior multiplied by the word likelihoods; they are
    not normalised, so they do not sum to one. For long messages these products
    can underflow to 0.0, so the label is decided on the sum of logarithms, which
    ranks the classes the same way without underflowing.
    """
    import math  # only used here; imported locally so the function is self-contained

    def log_probability(probability):
        # log(0) is undefined: give an impossible class the lowest possible score
        return math.log(probability) if probability > 0 else -math.inf

    words = re.sub(r'\W', ' ', message).lower().split()

    # Products are kept only for display; the log sums drive the decision
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    log_spam = log_probability(p_spam)
    log_ham = log_probability(p_ham)

    for word in words:
        if word in dict_word_spam:
            p_spam_given_message *= dict_word_spam[word]
            log_spam += log_probability(dict_word_spam[word])

        if word in dict_word_ham:
            p_ham_given_message *= dict_word_ham[word]
            log_ham += log_probability(dict_word_ham[word])

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if log_ham > log_spam:
        print('Label: Ham')
    elif log_ham < log_spam:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [14]:
# Try the classifier on two hand-written messages
sample_messages = (
    "WINNER!! This is the secret code to unlock the money: C3421.",
    "Sounds good, Tom, then see u there",
)
for sample in sample_messages:
    classify(sample)

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam
P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


In [15]:
# Calculate the overall accuracy

def classify_2(message):
    """Return the naive Bayes label for a raw SMS string.

    The message has every non-word character replaced by a space, is lower-cased
    and split on whitespace. Words that are missing from `dict_word_spam` /
    `dict_word_ham` are skipped.

    Parameters
    ----------
    message : str
        Raw SMS text.

    Returns
    -------
    str
        "ham", "spam", or "equal probability" when both classes score the same.

    Notes
    -----
    Reads the module-level `p_spam`, `p_ham`, `dict_word_spam` and `dict_word_ham`.
    Scores are accumulated as sums of logarithms: multiplying many small
    probabilities underflows to 0.0 for long messages, which would make both
    classes compare equal.
    """
    import math  # only used here; imported locally so the function is self-contained

    def log_probability(probability):
        # log(0) is undefined: give an impossible class the lowest possible score
        return math.log(probability) if probability > 0 else -math.inf

    words = re.sub(r'\W', ' ', message).lower().split()

    log_spam = log_probability(p_spam)
    log_ham = log_probability(p_ham)

    for word in words:
        if word in dict_word_spam:
            log_spam += log_probability(dict_word_spam[word])

        if word in dict_word_ham:
            log_ham += log_probability(dict_word_ham[word])

    if log_ham > log_spam:
        return "ham"
    elif log_ham < log_spam:
        return "spam"
    else:
        return "equal probability"

# Label every test message with classify_2 and store the result next to the human label
predicted_labels = data_test["SMS"].map(classify_2)
data_test["prediction"] = predicted_labels
data_test.head()

Unnamed: 0,label,SMS,prediction
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [16]:
# Accuracy = share of test messages whose prediction equals the human label.
# A vectorised column comparison replaces the row-by-row `iterrows` loop.
total = len(data_test)
correct = int((data_test["label"] == data_test["prediction"]).sum())
corret = correct  # the original (misspelled) name, kept so any later use still resolves

accuracy = correct / total
print(correct)
print(accuracy)

1100
0.9874326750448833
