In [1]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm

In [2]:
# Class labels, spelled the way they appear in the dataset's label column.
SPAM, HAM = 'spam', 'ham'

# Column names the dataset uses once its raw columns have been renamed.
LABEL, MESSAGE = 'label', 'message'

A storage class for the spam filter. This implementation stores its data in a Python dictionary; however,
it could equally be implemented on top of a database table or a file. The storage class keeps track 
of the number of messages that have been added as either HAM or SPAM. It also stores the count 
of each word under each category and can return, for the words in a message, their counts in a 
given category.

In [3]:
class DataStore:
    """In-memory store of word counts and message counts for the spam filter.

    Word counts are kept in a plain dict keyed by ``(label, word)``; the
    number of messages added under each label is tracked separately.
    """

    def __init__(self):
        # Maps (label, word) -> how many times `word` occurred in messages
        # added under `label`.
        self._storage = {}
        self._num_spam = 0
        self._num_ham = 0

        # Matches simple markup tags such as <b> or </b>. The `|` characters
        # are literal members of the character class (kept from the original
        # pattern so tokenisation is unchanged).
        self._tag_re = re.compile(r'</?[\w|\s=:]+>')
        # A word is a maximal run of ASCII letters; anything else separates.
        self._word_re = re.compile('[a-zA-Z]+')

    def add_spam(self, message):
        """Record ``message`` as one spam training example."""
        self._add_entry(message, SPAM)

    def add_ham(self, message):
        """Record ``message`` as one ham training example."""
        self._add_entry(message, HAM)

    def _add_entry(self, message, label):
        """Count the words of ``message`` under ``label`` and bump its message count.

        Raises:
            ValueError: if ``label`` is neither ``SPAM`` nor ``HAM``.
        """
        # Validate before touching any state: otherwise an unknown label would
        # add word counts without a matching message count.
        if label not in (SPAM, HAM):
            raise ValueError(f'unknown label: {label!r}')

        for word in self.get_words(message):
            key = (label, word)
            self._storage[key] = self._storage.get(key, 0) + 1

        if label == SPAM:
            self._num_spam += 1
        else:
            self._num_ham += 1

    def get_word_frequency(self, message, label, epsilon):
        """Return ``{word: count}`` for the words of ``message`` under ``label``.

        Words with no stored count for ``label`` map to ``epsilon``. Because
        the result is a dict, a word repeated in ``message`` appears once.
        """
        return {
            word: self._storage.get((label, word), epsilon)
            for word in self.get_words(message)
        }

    @property
    def num_spam(self):
        """Number of messages added as spam."""
        return self._num_spam

    @property
    def num_ham(self):
        """Number of messages added as ham."""
        return self._num_ham

    @property
    def num_messages(self):
        """Total number of messages added (spam plus ham)."""
        return self._num_ham + self._num_spam

    def get_words(self, message):
        """Split ``message`` into words, in order of appearance.

        Markup tags are replaced by a space first; every maximal run of ASCII
        letters in what remains is one word. Case is preserved.
        """
        without_tags = self._tag_re.sub(' ', message)
        return self._word_re.findall(without_tags)

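This implements a spam filter using the naive Bayes approach. It does so by comparing $P_{spam}$ with $P_{ham}$, where 
$$P_{spam}=p(spam)\prod_{i=1}^{n} p(word_i|spam\_message)$$ 
$$P_{ham}=p(ham)\prod_{i=1}^{n} p(word_i|ham\_message)$$ 
In practice the logarithms of these two quantities are compared, which avoids numerical underflow when many small probabilities are multiplied.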


In [4]:
class NaiveBayesFilter:
    """Naive Bayes spam classifier that reads its counts from a data store.

    A message is scored once per class as the sum of the logs of its
    per-word rates (stored word count divided by the number of messages of
    that class) plus the log of the class prior. The higher score wins.
    """

    def __init__(self, store: 'DataStore'):
        self._store = store
        # Stand-in count for words never seen under a label, so that the
        # logarithm stays finite.
        self._epsilon = 1e-6

    def _log_score(self, message, label, num_label):
        """Return the log-score of ``message`` for ``label``.

        ``num_label`` is the number of stored messages carrying ``label``.
        A label with no stored messages has zero prior probability, so its
        score is ``-inf`` instead of a division by zero or NaN.
        """
        if num_label == 0:
            return float('-inf')

        freq = self._store.get_word_frequency(message, label, self._epsilon)
        rates = np.array(list(freq.values())) / num_label
        log_prior = np.log(num_label / self._store.num_messages)
        # An empty message contributes 0.0 here, leaving only the prior.
        return float(np.log(rates).sum() + log_prior)

    def spamminess(self, message):
        """Return the log-score of ``message`` for the spam class (a float)."""
        return self._log_score(message, SPAM, self._store.num_spam)

    def hamminess(self, message):
        """Return the log-score of ``message`` for the ham class (a float)."""
        return self._log_score(message, HAM, self._store.num_ham)

    def is_spam(self, message):
        """Return True when the spam score is strictly higher than the ham score.

        Ties, including the untrained case where both scores are ``-inf``,
        are classified as ham.
        """
        return self.spamminess(message) > self.hamminess(message)

In [5]:
# Read only the two columns the notebook uses: v1 and v2.
dataset = pd.read_csv(
    'data/spam.csv',
    usecols=['v1', 'v2'],
    encoding='latin',
)

In [6]:
# Give the raw columns descriptive names. Reassigning the result (rather than
# using inplace=True) avoids mutating the loaded frame in place.
dataset = dataset.rename(columns={'v1': LABEL, 'v2': MESSAGE})

In [7]:
# Preview the first rows (head() shows five by default).
dataset.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Number of messages per label.
dataset.groupby(LABEL).count()

Unnamed: 0_level_0,message
label,Unnamed: 1_level_1
ham,4825
spam,747


In [9]:
TRAIN_SIZE = 600  # messages per class used for training

# Select each class once; the first TRAIN_SIZE rows of each go to training,
# the remainder to testing.
spam_df = dataset[dataset[LABEL] == SPAM]
ham_df = dataset[dataset[LABEL] == HAM]

train_df = pd.concat([spam_df[:TRAIN_SIZE], ham_df[:TRAIN_SIZE]])
test_df = pd.concat([spam_df[TRAIN_SIZE:], ham_df[TRAIN_SIZE:]])

# Sanity check: the two sets together account for every row. This fails if
# the dataset contains a label other than SPAM or HAM.
assert len(train_df) + len(test_df) == len(dataset)

print(f''' 
Train Stats
{train_df.groupby(by=LABEL).count()}

Test Stats
{test_df.groupby(by=LABEL).count()}
''')

 
Train Stats
       message
label         
ham        600
spam       600

Test Stats
       message
label         
ham       4225
spam       147



In [10]:
# The store accumulates the training counts; the filter reads them when scoring.
store = DataStore()
filter = NaiveBayesFilter(store)  # NOTE(review): this name shadows the built-in `filter`

In [11]:
# Feed every training message to the store under its label. Rows whose label
# is neither SPAM nor HAM are skipped.
add_message = {SPAM: store.add_spam, HAM: store.add_ham}
for label, message in tqdm(zip(train_df[LABEL], train_df[MESSAGE])):
    adder = add_message.get(label)
    if adder is not None:
        adder(message)

0it [00:00, ?it/s]

1200it [00:00, 8185.01it/s]


In [12]:
# Spot-check the trained filter on a single message.
sample_message = 'Free entry in 2 a wkly comp to win FA Cup fina.'
filter.is_spam(sample_message)

True

In [13]:
# Predict a label for every test message, then take the fraction of
# predictions that match the true label.
predicted = test_df[MESSAGE].apply(lambda text: SPAM if filter.is_spam(text) else HAM)
accuracy = (test_df[LABEL] == predicted).mean().item()

In [14]:
# Report accuracy as a percentage rounded to two decimals.
accuracy_pct = np.round(accuracy * 100, 2)
print(f'Accuracy: {accuracy_pct}%')

Accuracy: 92.34%


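### Conclusion
I am very impressed by the performance of the filter in spite of its simplicity. Note, however, that the test set is heavily imbalanced (4225 ham vs. 147 spam messages), so a classifier that always answers ham would already reach about 96.6% accuracy; per-class metrics such as precision and recall would give a clearer picture of how well spam is actually detected.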