In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [3]:
# Read the SMS spam corpus from disk, decoding the CSV as Latin-1.
dataset = pd.read_csv(
    'data/spam.csv',
    encoding='latin-1',
)

In [4]:
# Display the loaded frame to inspect its columns and a sample of its rows.
print(dataset)

        v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  
1           NaN        NaN  


In [8]:
# Column 'v1' carries the class label and column 'v2' the message text;
# pull both out as plain NumPy arrays.
labels = dataset['v1'].values
messages = dataset['v2'].values

In [9]:
# Show the label array and the message array, one per line.
print(labels, messages, sep='\n')

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 ... 'Pity, * was in mood for that. So...any other suggestions?'
 "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
 'Rofl. Its true to its name']


In [11]:
# Vocabulary: every distinct whitespace-separated token found in any message.
word_set = {token for text in messages for token in text.split()}

print(word_set)



In [26]:
# Turn the vocabulary set into a NumPy array and report how many tokens it holds.
word_list = np.array(list(word_set))
print(word_list.size)

15585


In [28]:
# Map every vocabulary word to a (spam occurrences, ham occurrences) pair.
word_dict = dict.fromkeys(word_list, (0, 0))

for text, label in zip(messages, labels):
    # Booleans add as 0/1, so each token bumps exactly the counter of its message's class.
    is_spam = label == 'spam'
    is_ham = label == 'ham'
    for token in text.split():
        spam_seen, ham_seen = word_dict[token]
        word_dict[token] = (spam_seen + is_spam, ham_seen + is_ham)
print(word_dict)



In [30]:
# Persist the vocabulary to an Excel sheet: one row per word with its spam and ham
# counts, plus the total number of spam and ham messages repeated on every row.
n_spam_messages = int(np.count_nonzero(labels == 'spam'))
n_ham_messages = int(np.count_nonzero(labels == 'ham'))

df = pd.DataFrame(word_dict).transpose()
df.columns = ['spam_count', 'ham_count']
df = df.assign(total_spam=n_spam_messages, total_ham=n_ham_messages)
df.to_excel('data/word_spam_ham_counts.xlsx')


In [68]:
# Reload the per-word spam/ham counts and the class totals from the Excel file.
# keep_default_na=False stops pandas from converting tokens that look like missing-value
# markers (e.g. "NA", "null", "nan") into float NaN, which would silently drop those
# words from the vocabulary.
df = pd.read_excel('data/word_spam_ham_counts.xlsx', keep_default_na=False)
# The words were written as the unnamed index, so they come back in the 'Unnamed: 0' column.
words = df['Unnamed: 0'].values
spam_counts = df['spam_count'].values
ham_counts = df['ham_count'].values
# The totals are repeated on every row; the first row is enough.
total_spam = df['total_spam'].values[0]
total_ham = df['total_ham'].values[0]

In [69]:
# Per-word scores: occurrences of the word within a class divided by the number of
# messages of that class. Class priors: share of spam / ham among all messages.
total_messages = total_spam + total_ham
p_spam = total_spam / total_messages
p_ham = total_ham / total_messages
p_spam_word = spam_counts / total_spam
p_ham_word = ham_counts / total_ham


In [70]:
# Show the per-word score arrays followed by the two class priors, one per line.
print(p_spam_word, p_ham_word, p_spam, p_ham, sep='\n')

[0.         0.         0.00133869 ... 0.         0.         0.        ]
[0.00041451 0.00041451 0.         ... 0.00062176 0.00020725 0.00020725]
0.13406317300789664
0.8659368269921034


In [73]:
def predict_spam_or_ham(words, message , p_spam_word , p_ham_word, p_spam = 0.5, p_ham = 0.5):
    """Score a message as spam and as ham with a naive product of per-word factors.

    Parameters
    ----------
    words : array-like of str
        Vocabulary; position i corresponds to p_spam_word[i] and p_ham_word[i].
    message : str
        Text to score; it is split on whitespace and matched case-sensitively.
    p_spam_word, p_ham_word : array-like of float
        Per-word factors for the spam and ham class, aligned with `words`.
    p_spam, p_ham : float
        Starting values (class priors) for the two products.

    Returns
    -------
    tuple
        (spam score, ham score). Tokens missing from `words` are skipped.

    Notes
    -----
    The scores are raw products: a single factor of 0 zeroes the whole score for
    that class, and long messages can underflow towards 0.0.
    """
    # Accept plain lists as well as arrays so elementwise comparison and indexing work.
    words = np.asarray(words)
    p_spam_word = np.asarray(p_spam_word)
    p_ham_word = np.asarray(p_ham_word)

    p_spam_message = p_spam
    p_ham_message = p_ham
    for word in message.split():
        # One vocabulary scan per token; the first match wins if the word is duplicated.
        positions = np.flatnonzero(words == word)
        if positions.size:
            first = positions[0]
            p_spam_message *= p_spam_word[first]
            p_ham_message *= p_ham_word[first]
    return p_spam_message, p_ham_message

In [74]:
# Score one sample sentence and show its (spam score, ham score) pair.
example_scores = predict_spam_or_ham(words, 'I am going to school', p_spam_word, p_ham_word, p_spam, p_ham)
print(example_scores)

(0.0, 3.791373475744043e-07)


In [81]:
def test_spam_or_ham_model(words, messages, labels, p_spam_word, p_ham_word, p_spam = 0.5, p_ham = 0.5):
    """Classify every message and tally the confusion-matrix counts.

    A message is predicted as spam when its spam score is strictly greater than
    its ham score; ties are predicted as ham. Returns (tp, tn, fp, fn) with spam
    as the positive class.
    """
    tp = tn = fp = fn = 0
    for idx in range(len(messages)):
        spam_score, ham_score = predict_spam_or_ham(
            words, messages[idx], p_spam_word, p_ham_word, p_spam, p_ham
        )
        predicted_spam = spam_score > ham_score
        if predicted_spam and labels[idx] == 'spam':
            tp += 1
        elif predicted_spam:
            fp += 1
        elif labels[idx] == 'ham':
            tn += 1
        else:
            fn += 1
    return tp, tn, fp, fn

In [82]:
# Evaluate the classifier and derive accuracy, precision, recall and F1.
# NOTE(review): `messages` / `labels` are the same arrays the word counts were built
# from, so these figures describe fit to the training data, not unseen messages.
tp, tn, fp, fn = test_spam_or_ham_model(words, messages, labels, p_spam_word, p_ham_word, p_spam, p_ham)

# Each ratio falls back to 0.0 when its denominator is zero (e.g. nothing was
# predicted as spam) instead of raising ZeroDivisionError.
n_evaluated = tp + tn + fp + fn
accuracy = (tp + tn) / n_evaluated if n_evaluated else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(accuracy, precision, recall, f1)


0.9989231873653984 0.9920318725099602 1.0 0.996
