In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [6]:
# Load the SMS spam data: tab-separated, no header row, so pandas labels the
# columns 0 and 1. Later cells read column 0 as the class ('ham' / 'spam')
# and column 1 as the message text.
SEED = 42  # fixed seed so the shuffle, the split and every metric below are reproducible
df = pd.read_csv('data/SMS_SPAM/SMSSpamCollection.csv', sep = '\t', header=None)

# Shuffle the ROWS only. The previous extra `sample(frac=1, axis=1)` shuffled
# the column order as well, which made positional lookups such as
# `iloc[:, 0]` return the message text instead of the label on about half
# of all runs.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# First 80% of the shuffled rows for training, the remainder for testing.
split_frac = 0.8
split_idx = round(split_frac * df.shape[0])

df_train = df.iloc[:split_idx, :]
df_test = df.iloc[split_idx:, :]


In [7]:
# First, we need to create the features from the text. We are interested in creating
# a dictionary of words, which requires splitting on spaces and removing punctuation.

In [8]:
import re
from collections import Counter

# Document-frequency table: doc_counter[label][word] is the number of training
# messages with that label that contain `word` at least once.
doc_counter = {'ham': Counter(), 'spam': Counter()}
punctuation = re.compile(r'[^\w\s]')

# One pass over the training set instead of one scan per label. Columns are
# selected by label (0 = class, 1 = text), so this does not depend on the
# row index being 0..n-1 or on the physical column order.
for curr_label, example in zip(df_train[0], df_train[1]):
    if curr_label not in doc_counter:
        continue  # rows carrying any other label are ignored, as before
    # Split on single spaces and strip punctuation from each token; the set
    # makes every distinct token count once per message. A token made only of
    # punctuation collapses to '' and is counted like any other token.
    words = {punctuation.sub('', word) for word in example.split(' ')}
    # In-place update: `counter = counter + Counter(...)` copied the whole
    # vocabulary for every message, which is quadratic in practice.
    doc_counter[curr_label].update(words)


In [9]:
# Class labels of the training set, selected by column label (0) so this
# agrees with the counting cell even if the physical column order changes.
y_train = df_train[0]
n = {}
n['ham'] = int((y_train == 'ham').sum())  # number of documents that are ham
n['spam'] = int((y_train == 'spam').sum())  # number of documents that are spam
n_docs = n['ham'] + n['spam']
prior = {}
prior['ham'] = n['ham'] / n_docs
prior['spam'] = n['spam'] / n_docs
# Vocabulary size: distinct tokens seen in either class.
num_words = len(doc_counter['ham'].keys() | doc_counter['spam'].keys())

labels = ('ham', 'spam')
log_prior = {label: np.log(prior[label]) for label in labels}

# Posterior P(ham | message) for every test message.
prob_ham_test = np.zeros(shape = df_test.shape[0])
for i, example in enumerate(df_test[1]):
    # Same tokenization as the training cell: split on spaces, strip
    # punctuation, keep each distinct token once.
    words = {re.sub(r'[^\w\s]', '', word) for word in example.split(' ')}

    # Work in log space: multiplying many small probabilities underflows to 0
    # for long messages, which made the normalization below evaluate 0/0.
    log_joint = {}
    for label in labels:
        # Add-one smoothed estimate of P(word | label).
        # NOTE(review): the denominator adds the vocabulary size to a
        # per-class *document* count; for document-frequency counts the usual
        # Laplace form would be n[label] + 2. Left unchanged so results stay
        # comparable - confirm which model is intended.
        log_likelihood = sum(
            np.log((doc_counter[label][word] + 1) / (n[label] + num_words))
            for word in words
        )
        log_joint[label] = log_likelihood + log_prior[label]

    # P(ham | words) = exp(log_ham - log(exp(log_ham) + exp(log_spam))),
    # evaluated stably with logaddexp.
    log_evidence = np.logaddexp(log_joint['ham'], log_joint['spam'])
    prob_ham_test[i] = np.exp(log_joint['ham'] - log_evidence)
prob_ham_test

array([1.        , 1.        , 0.99999882, ..., 1.        , 1.        ,
       0.99999685])

In [10]:
# Ground truth for the held-out set as an integer vector: 1 where the label
# is 'ham', 0 otherwise. The label column is selected by its column label (0),
# matching the training cell, rather than by position.
y_test = df_test[0]
is_ham_test = (y_test == 'ham').to_numpy().astype(np.int64)
is_ham_test

array([1, 1, 1, ..., 1, 1, 0])

In [11]:
# Confusion counts with 'ham' as the positive class.
threshold = 0.5

# Make the decision once so that every test message lands in exactly one of
# the four cells. Previously positives used `>` and negatives used `<`, so a
# score exactly equal to the threshold was counted nowhere. Now anything that
# is not strictly above the threshold (including NaN) is predicted not-ham.
pred_ham = prob_ham_test > threshold
actual_ham = np.asarray(is_ham_test).astype(bool)

TP = np.sum(actual_ham & pred_ham)     # ham predicted ham
FP = np.sum(~actual_ham & pred_ham)    # not-ham predicted ham
TN = np.sum(~actual_ham & ~pred_ham)   # not-ham predicted not-ham
FN = np.sum(actual_ham & ~pred_ham)    # ham predicted not-ham

In [12]:
# Confusion matrix for the ham-vs-not-ham decision.
#
# Row index    = true label      (0: not ham, 1: ham)
# Column index = predicted label (0: not ham, 1: ham)
#
# Laid out row by row as TN, FP / FN, TP.
C = np.array([TN, FP, FN, TP]).reshape(2, 2)
C

array([[113,  38],
       [  2, 961]])

In [13]:
# Precision for the 'ham' class: of the messages predicted ham, the share
# that really are ham. It is undefined when nothing was predicted ham, so
# report NaN instead of dividing by zero.
predicted_ham_total = TP + FP
precision = TP / predicted_ham_total if predicted_ham_total else float('nan')
precision

0.9619619619619619

In [14]:
# Recall for the 'ham' class: of the messages that really are ham, the share
# that were predicted ham. It is undefined when the test set contains no ham,
# so report NaN instead of dividing by zero.
actual_ham_total = TP + FN
recall = TP / actual_ham_total if actual_ham_total else float('nan')
recall

0.9979231568016614