In [None]:
import csv
import pandas as pd

# Naive Bayesian Spam Filter

This is the code to fit a Naive Bayesian Spam Filter to the "SMS Spam Collection Dataset" created by Tiago A. Almeida and José María Gómez Hidalgo.<sup>1</sup>

# Data Preparation

We load the data from a CSV file. The first and second columns are named 'v1' and 'v2', which we rename to 'LABEL' and 'SMS'. The other columns are unnecessary, so we remove them.

In [None]:
# Read the raw dataset, discard the three unused 'Unnamed' columns and give
# the two remaining columns descriptive names, all in one chained expression.
sms_data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'LABEL', 'v2': 'SMS'})
)

# Quick visual check of the first rows.
print(sms_data.head())

We sample randomly from the data to create the train and test splits; 80% of the data is used for training and the remaining 20% for testing.

In [None]:
# Draw 80% of the rows for training. The sample must keep its original row
# labels at this point: they are what identifies the sampled rows in sms_data.
train_data = sms_data.sample(frac=0.8, random_state=1)
# The test set is every row that was NOT sampled, so the splits are disjoint.
test_data = sms_data.drop(train_data.index).reset_index(drop=True)
# Only now renumber the training rows 0..n-1. Resetting before the drop above
# would remove the first n rows of sms_data instead of the sampled ones and
# leak training messages into the test set.
train_data = train_data.reset_index(drop=True)

We clean the training and test data: punctuation is replaced with whitespace (it is not considered in the spam filter), the text is lower-cased, and each message is split into a list of words.

In [None]:
def clean(data):
    """Return a copy of `data` whose 'SMS' column holds lists of lower-case words.

    Every run of non-word characters (punctuation, whitespace, ...) in a
    message is replaced by a single space, the text is lower-cased and then
    split on whitespace. All other columns are copied unchanged and the input
    frame itself is not modified.

    Args:
        data: DataFrame with a string column named 'SMS'.

    Returns:
        A new DataFrame with the same rows and columns as `data`, where each
        'SMS' entry is a list of word tokens.
    """
    data_clean = data.copy()
    data_clean['SMS'] = (
        data['SMS']
        # regex=True is required: recent pandas versions treat the pattern as
        # a literal string by default, which would leave punctuation in place.
        .str.replace(r'\W+', ' ', regex=True)
        .str.lower()
        # split() with no separator also discards leading/trailing whitespace.
        .str.split()
    )
    return data_clean

# Run the same cleaning step over both splits, training split first.
train_data_clean, test_data_clean = (
    clean(split) for split in (train_data, test_data)
)

# Show the first cleaned training rows.
print(train_data_clean.head())

Here we turn each SMS in the training data into a vector of word counts: for every word in the vocabulary there is a column holding how often that word appears in the message.

In [None]:
from collections import Counter

# Every distinct word that occurs in the training messages. Sorting makes the
# column order (and therefore the exported CSV) reproducible between runs.
vocabulary = sorted({word for sms in train_data_clean['SMS'] for word in sms})

# One row per message, one column per vocabulary word. Each message is counted
# once with a Counter; a Counter returns 0 for words the message lacks.
word_counts_per_sms = pd.DataFrame(
    [
        [counts[word] for word in vocabulary]
        for counts in map(Counter, train_data_clean['SMS'])
    ],
    columns=vocabulary,
)

# Append the count columns next to LABEL and SMS. The index is reset (and the
# old one discarded) so that rows line up positionally with the count matrix.
train_data_clean = pd.concat(
    [train_data_clean.reset_index(drop=True), word_counts_per_sms], axis=1
)

print(train_data_clean.head())

We save this to a CSV file so that the production model can use it. We also save the test data for later testing purposes.

In [None]:
# Write the word-count training frame and the raw test frame to disk. Each
# file is opened in text mode and closed as soon as its frame is written.
for csv_path, frame in (
    ('train_data.csv', train_data_clean),
    ('test_data.csv', test_data),
):
    with open(csv_path, 'w') as csv_file:
        frame.to_csv(csv_file)

## The Filter

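Now that the training data is prepared, we can classify messages. Below we compute the prior probabilities of a message being spam or ham, together with the per-class word totals, the vocabulary size and the smoothing parameter used by the filter.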

In [None]:
# Row masks selecting the spam and ham training messages.
is_spam = train_data_clean['LABEL'] == 'spam'
is_ham = train_data_clean['LABEL'] == 'ham'

# Class priors: the fraction of training messages carrying each label.
prob_spam = is_spam.mean()
prob_ham = is_ham.mean()

# Total number of word tokens per class. The cleaned frame stores each message
# as a list of words, so len() counts words; on the raw frame 'SMS' is a
# string and len() would count characters instead.
num_spam = train_data_clean.loc[is_spam, 'SMS'].apply(len).sum()
num_ham = train_data_clean.loc[is_ham, 'SMS'].apply(len).sum()
# Number of distinct words seen in training.
vocab_size = len(vocabulary)

# Additive smoothing constant used in the per-word probabilities.
alpha = 1

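With the above info, we can calculate the conditional probability of a word appearing in a message given that the message is spam or ham. Additive smoothing (`alpha`) keeps a word that was never seen in one class from getting a probability of zero; words that are not in the training data at all are ignored by returning 1.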

In [None]:
def prob_if_spam(word):
    """Return the smoothed probability of `word` given that a message is spam.

    The estimate is (occurrences of `word` in spam + alpha) divided by
    (total spam words + alpha * vocab_size).

    Args:
        word: A single cleaned (lower-case) token.

    Returns:
        The smoothed conditional probability, or 1 when `word` has no count
        column in the training data, so that unknown words leave the product
        computed by the caller unchanged.
    """
    # The word counts live in train_data_clean; the raw train_data frame has
    # no per-word columns. 'LABEL' and 'SMS' are data columns, not word counts.
    if word in ('LABEL', 'SMS') or word not in train_data_clean.columns:
        return 1
    spam_rows = train_data_clean['LABEL'] == 'spam'
    occurrences = train_data_clean.loc[spam_rows, word].sum()
    return (occurrences + alpha) / (num_spam + alpha * vocab_size)
    
def prob_if_ham(word):
    """Return the smoothed probability of `word` given that a message is ham.

    The estimate is (occurrences of `word` in ham + alpha) divided by
    (total ham words + alpha * vocab_size).

    Args:
        word: A single cleaned (lower-case) token.

    Returns:
        The smoothed conditional probability, or 1 when `word` has no count
        column in the training data, so that unknown words leave the product
        computed by the caller unchanged.
    """
    # The word counts live in train_data_clean; the raw train_data frame has
    # no per-word columns. 'LABEL' and 'SMS' are data columns, not word counts.
    if word in ('LABEL', 'SMS') or word not in train_data_clean.columns:
        return 1
    ham_rows = train_data_clean['LABEL'] == 'ham'
    occurrences = train_data_clean.loc[ham_rows, word].sum()
    return (occurrences + alpha) / (num_ham + alpha * vocab_size)

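Finally, we have the algorithm to predict whether a message is spam. It starts from the class priors, multiplies in the conditional probability of every word in the message, and picks the class with the larger result.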

In [None]:
def _log_prob(probability):
    """Return log(probability), mapping a non-positive probability to -inf."""
    import math

    return math.log(probability) if probability > 0 else float('-inf')


def classify(message):
    """Label a message as 'spam' or 'ham'.

    Args:
        message: Iterable of word tokens (a cleaned, tokenised SMS).

    Returns:
        A tuple (label, prob_message_is_spam, prob_message_is_ham). The two
        numbers are the class prior multiplied by the per-word conditional
        probabilities; they are unnormalised scores, not probabilities that
        sum to one. Ties are labelled 'ham'.
    """
    prob_message_is_spam = prob_spam
    prob_message_is_ham = prob_ham
    # The same scores are accumulated as sums of logarithms. For long messages
    # the plain products underflow to 0.0 for both classes, which would make
    # every such message 'ham'; the log sums stay comparable.
    log_score_spam = _log_prob(prob_spam)
    log_score_ham = _log_prob(prob_ham)

    for word in message:
        word_given_spam = prob_if_spam(word)
        word_given_ham = prob_if_ham(word)
        prob_message_is_spam *= word_given_spam
        prob_message_is_ham *= word_given_ham
        log_score_spam += _log_prob(word_given_spam)
        log_score_ham += _log_prob(word_given_ham)

    # Decide in log space; return the plain products as before.
    label = 'spam' if log_score_spam > log_score_ham else 'ham'
    return label, prob_message_is_spam, prob_message_is_ham

## Performance

We grade the model based on its accuracy.

In [None]:
def grade():
    """Measure the classifier's accuracy on the test split.

    Returns:
        A tuple (accuracy, correct, count): the fraction of test messages
        whose predicted label equals the true label, the number of correct
        predictions, and the number of test messages. Accuracy is NaN when
        the test split is empty.
    """
    # Use the cleaned test frame: classify() iterates over word tokens, and
    # the raw frame stores each message as one string, which would be walked
    # character by character.
    messages = test_data_clean['SMS']
    labels = test_data_clean['LABEL']

    count = len(test_data_clean)
    correct = sum(
        classify(message)[0] == label
        for message, label in zip(messages, labels)
    )

    accuracy = correct / count if count else float('nan')
    return accuracy, correct, count


print(grade())

## References

1. http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/