# Implementing a Naive Bayes Spam Filter from Scratch

### Author: Sunwoo Choi

### Data
Reference (SMS Spam Collection dataset): https://github.com/justmarkham/DAT5/blob/master/data/SMSSpamCollection.txt

In [None]:
# Mount Google Drive into the Colab runtime so that files under
# /content/drive are readable by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import re
# Location of the tab-separated SMS file on the mounted Google Drive.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/ML/SMSSpamCollection.txt'

# Each row is read as strings split on tabs; later cells use column 0 as the
# label ("ham"/"spam") and column 1 as the raw message text.
# comments=None is required: by default np.loadtxt treats '#' as the start of
# a comment and silently drops the rest of the line, which would truncate
# every message that contains a '#'.
data = np.loadtxt(fname=DATA_PATH, dtype=str, delimiter='\t', comments=None)

# Number of rows (messages) in the file.
data_size = data.shape[0]

# Fix the seed so the row shuffle (and therefore the train/test split made
# further down) is reproducible, then shuffle the rows in place.
np.random.seed(1337)
np.random.shuffle(data)



In [None]:
# Convert the array to nested Python lists so that the text column of each
# row can be replaced by a list of tokens.
data = data.tolist()

# Text normalisation applied to column 1 of every row:
#   1. lower-case the message,
#   2. collapse every run of characters other than letters, digits and
#      apostrophes into a single space (this removes punctuation),
#   3. split the result on whitespace into a list of words.
non_token_run = re.compile(r'[^A-Za-z0-9\']+')
for row_idx in range(data_size):
  row = data[row_idx]
  lowered = row[1].lower()
  row[1] = non_token_run.sub(' ', lowered).split()

# Index of the first row that belongs to the held-out 30%.
split_idx = int(len(data) * 0.7)
print(split_idx)

3901


In [None]:
# Split the (already shuffled) data set: the first 70% of the rows become the
# training set, the remaining 30% are held out for evaluation.
train_set, test_set = data[:split_idx], data[split_idx:]

# Sanity check: the two parts together should cover every row.
print(len(train_set) + len(test_set))

5574


In [None]:
# Number of training messages per class; "total" is filled in after the loop.
entire = { "ham":0, "spam":0, "total":0 }
# Per-class word frequency tables: word -> occurrences in messages of that class.
ham_word, spam_word = {}, {}
# Total number of word tokens seen in ham / spam messages.
total_ham_word, total_spam_word = 0, 0

# Walk over every training message, counting messages per class and word
# occurrences per class. Any label other than "ham" is counted as spam.
for label, words in train_set:
  entire[label] += 1
  if label == "ham":
    for word in words:
      ham_word[word] = ham_word.get(word, 0) + 1
    total_ham_word += len(words)
  else:
    for word in words:
      spam_word[word] = spam_word.get(word, 0) + 1
    total_spam_word += len(words)

entire["total"] = len(train_set)

# Vocabulary size = number of DISTINCT words over both classes. Incrementing a
# counter whenever a word is new to ham_word or new to spam_word would count
# words that occur in both classes twice and inflate the smoothing
# denominator, so take the size of the union of the two key sets instead.
total_unique_word = len(ham_word.keys() | spam_word.keys())

print(entire)
print(ham_word)
print(spam_word)
print(total_unique_word)

{'ham': 3386, 'spam': 515, 'total': 3901}
{'double': 11, 'your': 181, 'mins': 19, 'txts': 9, 'on': 107, 'orange': 19, 'or': 127, '1': 73, '2': 132, 'price': 10, 'linerental': 3, 'motorola': 6, 'and': 87, 'sonyericsson': 2, 'with': 67, 'b': 10, 'tooth': 1, 'free': 164, 'nokia': 53, 'call': 229, 'mobileupd8': 8, '08000839402': 9, 'or2optout': 1, 'hv9d': 1, 'freemsg': 9, 'txt': 117, 'to': 497, 'no': 48, '86888': 2, 'claim': 78, 'reward': 7, 'of': 67, '3': 32, 'hours': 2, 'talk': 5, 'time': 14, 'use': 9, 'from': 85, 'phone': 37, 'now': 153, 'subscribe6gbp': 2, 'mnth': 2, 'inc': 6, '3hrs': 2, '16': 46, 'stop': 83, 'txtstop': 2, 'entry': 21, 'in': 56, 'a': 256, 'wkly': 11, 'comp': 8, 'win': 39, 'fa': 4, 'cup': 4, 'final': 12, 'tkts': 3, '21st': 2, 'may': 6, '2005': 3, 'text': 86, '87121': 4, 'receive': 21, 'question': 6, 'std': 6, 'rate': 21, 't': 53, "c's": 14, 'apply': 19, "08452810075over18's": 2, 'ur': 108, '250': 12, 'sms': 28, 'messages': 10, 'ok': 4, '84025': 2, 'web2mobile': 1, 'mate

In [None]:
# Class priors, estimated as the fraction of training messages in each class.
n_train_messages = entire['total']
p_ham = entire['ham']/n_train_messages    # P(ham): probability a message is not spam
p_spam = entire['spam']/n_train_messages  # P(spam): probability a message is spam

print(p_ham, p_spam, sep='\n')

0.8679825685721609
0.132017431427839


In [None]:
num_of_success = 0
total_test = 0
alpha = 1 # additive (Laplace) smoothing strength; alpha = 1 is add-one smoothing

# Smoothed denominators of P(word|class). They do not depend on the message,
# so compute them once. The vocabulary term is scaled by alpha so that the
# smoothed probabilities still sum to 1 when alpha != 1.
ham_denominator = total_ham_word + alpha * total_unique_word
spam_denominator = total_spam_word + alpha * total_unique_word

# Evaluate the model on the held-out messages.
for label, words in test_set:
  # Score each class in log space: log P(class) + sum of log P(word|class).
  # Multiplying the raw probabilities underflows to 0.0 for long messages,
  # which makes both scores equal and forces the prediction to 'spam'.
  log_prob_ham, log_prob_spam = np.log(p_ham), np.log(p_spam)
  for word in words:
    num_ham = ham_word.get(word, 0)    # occurrences of the word in ham training messages
    num_spam = spam_word.get(word, 0)  # occurrences of the word in spam training messages
    log_prob_ham += np.log((num_ham + alpha) / ham_denominator)     # log P(word|ham)
    log_prob_spam += np.log((num_spam + alpha) / spam_denominator)  # log P(word|spam)

  # Pick the class with the larger posterior score; ties go to 'spam'.
  pred_label = 'ham' if log_prob_ham > log_prob_spam else 'spam'
  if label == pred_label: # If the prediction is correct, count it as a success.
    num_of_success += 1
  total_test += 1

print(f"The accuracy of naive bayes classifier is {num_of_success/total_test*100}%")

The accuracy of naive bayes classifier is 98.08726838015541%
