# Naive Bayesian Spam Filter

This is the code for a Naive Bayesian Spam Filter fit to the "SMS Spam Collection Dataset" from Kaggle<sup>1</sup>.

In [1]:
import csv
import pandas as pd

In [23]:
# def load_data(path):
#     data = []
    
#     with open(path) as csvfile:
#         reader = csv.reader(csvfile)
        
#         for row in reader:
#             if row[0] == 'spam':
#                 label = 1
#             elif row[0] == 'ham':
#                 label = 0
#             else:
#                 continue
            
#             content = row[1]
            
#             content = content.replace('\W+', ' ').replace('\s+', ' ').strip()
#             content = content.lower()
#             content = content.split()
            
#             data.append([label, content])
    
#     return data

sms_data = pd.read_csv('spam.csv', encoding='latin-1')
sms_data = sms_data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
sms_data = sms_data.rename(columns={'v1': 'LABEL', 'v2': 'SMS'})

print(sms_data_clean.head())

  LABEL                                                SMS
0   ham  [go, until, jurong, point, crazy, available, o...
1   ham                     [ok, lar, joking, wif, u, oni]
2  spam  [free, entry, in, 2, a, wkly, comp, to, win, f...
3   ham  [u, dun, say, so, early, hor, u, c, already, t...
4   ham  [nah, i, don, t, think, he, goes, to, usf, he,...


In [24]:
sms_data_clean['SMS'] = sms_data['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.lower()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.split()

print(sms_data_clean.head())

  LABEL                                                SMS
0   ham  [go, until, jurong, point, crazy, available, o...
1   ham                     [ok, lar, joking, wif, u, oni]
2  spam  [free, entry, in, 2, a, wkly, comp, to, win, f...
3   ham  [u, dun, say, so, early, hor, u, c, already, t...
4   ham  [nah, i, don, t, think, he, goes, to, usf, he,...


[^1]: UCI Machine Learning. [n.d.]. SMS Spam Collection Dataset. Kaggle: Your Machine Learning and Data Science Community https:// www.kaggle.com/uciml/sms-spam-collection-dataset ([n. d.]).

In [15]:
train_data = sms_data_clean.sample(frac=0.8,random_state=1).reset_index(drop=True)
test_data = sms_data_clean.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print(sms_data_clean['LABEL'].value_counts() / sms_data_clean.shape[0] * 100)
print(train_data['LABEL'].value_counts() / train_data.shape[0] * 100)
print(test_data['LABEL'].value_counts() / test_data.shape[0] * 100)

ham     86.593683
spam    13.406317
Name: LABEL, dtype: float64
ham     86.40646
spam    13.59354
Name: LABEL, dtype: float64
ham     86.983842
spam    13.016158
Name: LABEL, dtype: float64


In [16]:
vocabulary = list(set(train_data['SMS'].sum()))
word_counts_per_sms = pd.DataFrame([
    [row[1].count(word) for word in vocabulary]
    for _, row in train_data.iterrows()], columns=vocabulary)
train_data = pd.concat([train_data.reset_index(), word_counts_per_sms], axis=1).iloc[:,1:]

print(train_data.head())

  LABEL                                                SMS  arng  notxt  \
0   ham                     [convey, my, regards, to, him]     0      0   
1   ham       [û_, anyway, many, good, evenings, to, u, s]     0      0   
2   ham  [my, sort, code, is, and, acc, no, is, the, ba...     0      0   
3   ham                  [sorry, i, din, lock, my, keypad]     0      0   
4  spam  [hi, babe, its, chloe, how, r, u, i, was, smas...     0      0   

   christians  goodies  pages  09095350301  such  year  ...  second  flirt  \
0           0        0      0            0     0     0  ...       0      0   
1           0        0      0            0     0     0  ...       0      0   
2           0        0      0            0     0     0  ...       0      0   
3           0        0      0            0     0     0  ...       0      0   
4           0        0      0            0     0     0  ...       0      0   

   breath  nic  85  stress  09058097218  80082  mad  08704439680  
0       0    

In [29]:
prob_spam = train_data['LABEL'].value_counts()['spam'] / train_data.shape[0]
prob_ham = train_data['LABEL'].value_counts()['ham'] / train_data.shape[0]

num_spam = train_data.loc[train_data['LABEL'] == 'spam', 'SMS'].apply(len).sum()
num_ham = train_data.loc[train_data['LABEL'] == 'ham', 'SMS'].apply(len).sum()
vocab_size = len(train_data.columns) - 3

alpha = 1

In [20]:
def prob_if_spam(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['LABEL'] == 'spam', word].sum() + alpha) / (num_spam + alpha * vocab_size)
    else:
        return 1
    
def prob_if_ham(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['LABEL'] == 'ham', word].sum() + alpha) / (num_ham + alpha * vocab_size)
    else:
        return 1

In [37]:
def classify(message):
    prob_message_is_spam = prob_spam
    prob_message_is_ham = prob_ham
    
    for word in message:
        prob_message_is_spam *= prob_if_spam(word)
        prob_message_is_ham *= prob_if_ham(word)
    
    if prob_message_is_spam > prob_message_is_ham:
        return 'spam', prob_message_is_spam, prob_message_is_ham
    else:
        return 'ham', prob_message_is_spam, prob_message_is_ham

In [39]:
print(classify('Ringtone Club: Get the UK singles chart on your mobile each week and choose any top quality ringtone! This message is free of charge.'))

('ham', 5e-324, 9.373544124290576e-301)


In [52]:
def grade():
    count = 0
    correct = 0
    
    for _, row in test_data.iterrows():
        count += 1
        if count < 5:
            print(row['LABEL'])
        if classify(row['SMS'])[0] == row['LABEL']:
            correct += 1
    
    return correct / count, correct, count

In [53]:
print(grade())

spam
ham
ham
ham
(0.9919210053859964, 1105, 1114)
