# Read Data

In [1]:
import pandas as pd
import numpy as np 

data_dir = "naive_bayes/email/input/"
df = pd.read_csv(data_dir + '/spam.csv', encoding='latin-1')  

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Split Data into Training and Testing Sets

In [2]:
from sklearn.model_selection import train_test_split

data_train, data_test, labels_train, labels_test = train_test_split(
    df.v2, #text
    df.v1, #label
    test_size=0.2, 
    random_state=0) #seed number

# Transform Data to Fit Model Rrequirement

In [3]:
#create dict
def getVocab(data):
    vocab_dict = {}
    wid = 0
    
    for document in data:
        words = document.split(" ")
        for word in words:
            word = word.lower()
            if word not in vocab_dict:
                vocab_dict[word] = wid
                wid += 1
    return vocab_dict

In [10]:
#vectorize document
def doc2Vector(vocab_dict, document):
    word_vector = np.zeros(len(vocab_dict.keys()))
    words = document.split(" ")
    out_of_voc = 0
    
    for word in words:
        word = word.lower()
        if word in vocab_dict:
            word_vector[vocab_dict[word]] += 1
        else:
            out_of_voc += 1
    return word_vector, out_of_voc

In [11]:
vocab_dict = getVocab(data_train)
train_matrix = []
for document in data_train.values:
    word_vector, _ = doc2Vector(vocab_dict, document)
    train_matrix.append(word_vector)

print(len(train_matrix))

4457


# Train Model and Predict

In [15]:
def naiveBayes_train(train_matrix, labels_train):
    num_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    
    spam_word_counter = np.ones(num_words)
    ham_word_counter = np.ones(num_words)
    
    spam_total_words = 0
    ham_total_words = 0
    
    spam_count = 0
    ham_count = 0
    
    for i in range(num_docs):
        if labels_train[i] == 'ham':
            ham_word_counter += train_matrix[i]
            ham_total_words += sum(train_matrix[i])
            ham_count += 1
        else:
            spam_word_counter += train_matrix[i]
            spam_total_words += sum(train_matrix[i])
            spam_count += 1
    
    p_spam_vector = np.log(spam_word_counter / (spam_total_words + num_words))
    p_ham_vector = np.log(ham_word_counter / (ham_total_words + num_words))
    p_spam = np.log(spam_count / num_docs)
    p_ham = np.log(ham_count / num_docs)
    
    return p_spam_vector, p_ham_vector, p_spam, p_ham, spam_total_words, ham_total_words

p_spam_vector, p_ham_vector, p_spam, p_ham, spam_total_words, ham_total_words = naiveBayes_train(train_matrix, labels_train.values)

In [23]:
def predict(test_word_vector, p_spam_vector, p_spam, p_ham_vector, p_ham, spam_smoothing, ham_smoothing):
    
    spam = sum(test_word_vector * p_spam_vector) + p_spam + spam_smoothing
    ham = sum(test_word_vector * p_ham_vector) + p_ham + ham_smoothing
    
    if spam > ham:
        return 'spam'
    else:
        return 'ham'
    
num_words = len(vocab_dict.keys())
predictions = []

for doccument in data_test.values:
    test_word_vector, out_of_voc = doc2Vector(vocab_dict, doccument)
    if out_of_voc == 0:
        spam_smoothing = 0
        ham_smoothing = 0
    else:
        spam_smoothing = np.log(out_of_voc / (spam_total_words + num_words))
        ham_smoothing = np.log(out_of_voc / (ham_total_words + num_words))
    temp = predict(test_word_vector, p_spam_vector, p_spam, p_ham_vector, p_ham, spam_smoothing, ham_smoothing)
    predictions.append(temp)

# Check Accuracy

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,confusion_matrix

print(accuracy_score(labels_test, predictions))
print(classification_report(labels_test, predictions))
print(confusion_matrix(labels_test, predictions))

0.97847533632287
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       949
        spam       0.98      0.87      0.92       166

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

[[946   3]
 [ 21 145]]
