In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import os
import re

# Data Preparation

In [2]:
path_folder = ['train/ham', 'train/spam', 'test/ham', 'test/spam']
for path in path_folder:
    all_file_list=os.listdir(path)
    if(path == 'train/ham'):
        train_ham = pd.DataFrame({'Label':['ham'] * len(all_file_list), 'Content':[0] * len(all_file_list)})
        for i in range(len(all_file_list)):
            try:
                with open(path + '/' + all_file_list[i],'r', encoding = 'utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                pass;
            train_ham.loc[i, 'Content'] = content
    elif(path == 'train/spam'):
        train_spam = pd.DataFrame({'Label':['spam'] * len(all_file_list), 'Content':[0] * len(all_file_list)})
        for i in range(len(all_file_list)):
            try:
                with open(path + '/' + all_file_list[i],'r', encoding = 'utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                pass;
            train_spam.loc[i, 'Content'] = content
    elif(path == 'test/ham'):
        test_ham = pd.DataFrame({'Label':['ham'] * len(all_file_list), 'Content':[0] * len(all_file_list)})
        for i in range(len(all_file_list)):
            try:
                with open(path + '/' + all_file_list[i],'r', encoding = 'utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                pass;
            test_ham.loc[i, 'Content'] = content
    else:
        test_spam = pd.DataFrame({'Label':['spam'] * len(all_file_list), 'Content':[0] * len(all_file_list)})
        for i in range(len(all_file_list)):
            try:
                with open(path + '/' + all_file_list[i],'r', encoding = 'utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                pass;
            test_spam.loc[i, 'Content'] = content    

In [3]:
def process_message(message):
    message = message.lower()
    message = message.replace('subject','',1) #remove the first "subject"
    message = re.sub(r'\d+', '', message) #remove numbers
    words = word_tokenize(message)
    words = [w for w in words if len(w) > 2]
    #stop word
    sw = stopwords.words('english')
    words = [word for word in words if word not in sw]
    #stemmer
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]   
    return words

In [4]:
train_set = pd.concat([train_ham, train_spam], axis = 0).reset_index(drop = True)
train_set = shuffle(train_set)
train_set = train_set.iloc[:10000].reset_index(drop = True)
test_set = pd.concat([test_ham, test_spam], axis = 0).reset_index(drop = True)
test_set = shuffle(test_set)
test_set = test_set.iloc[:2000].reset_index(drop = True)
train_set['Content'] = train_set['Content'].apply(process_message)

In [17]:
test_set[test_set.Label == 'spam']

Unnamed: 0,Label,Content,predicted
0,spam,Subject: i can see your on dialup\nhow are you...,spam
1,spam,Subject: message subject\n- - - - 158806477142...,spam
4,spam,Subject: i want to mentor you\nthis week i sho...,spam
9,spam,"Subject: scotty\nkelly ,\ngovenment don ' t wa...",spam
10,spam,"Subject: re , your pharmacy o , rder # 845235\...",spam
...,...,...,...
1980,spam,Subject: cheap oem soft shipping worldwide\nwh...,spam
1986,spam,Subject: success in dating - written by women ...,spam
1987,spam,Subject: [ none ]\ncraving for a luxury wwatch...,spam
1991,spam,Subject: from hajia mariam abacha and the chil...,spam


### Create vocabulary

In [5]:
vocabulary = []
for content in train_set['Content']:
    for word in content:
        vocabulary.append(word)

vocabulary = list(set(vocabulary))

In [6]:
len(vocabulary)

60298

In [7]:
word_counts_per_content = {unique_word: [0] * len(train_set['Content']) for unique_word in vocabulary}

for index, content in enumerate(train_set['Content']):
    for word in content:
        word_counts_per_content[word][index] += 1

In [8]:
word_counts = pd.DataFrame(word_counts_per_content)
word_counts.head()

Unnamed: 0,brean,kio,bettor,criterionalp,cuidad,rip,product,niamh,waken,arteriol,...,woud,wyrazici,blacklin,mtg,ujnti,riskbenc,stripteas,ormoan,melendez,leopard
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
train_set_clean = pd.concat([train_set, word_counts], axis=1)
train_set_clean.head()

Unnamed: 0,Label,Content,brean,kio,bettor,criterionalp,cuidad,rip,product,niamh,...,woud,wyrazici,blacklin,mtg,ujnti,riskbenc,stripteas,ormoan,melendez,leopard
0,ham,"[california, capac, report, week, transwestern...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[remind, real, time, interview, today, jake, a...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,"[aussi, save, medlcatlon, pharma, rectangular,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,spam,"[minut, hard, rock, ever, stuck, outrag, presc...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,spam,"[ambllen, alprazzolam, aluum, llgra, caall, le...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Isolating spam and ham messages first
spam_messages = train_set_clean[train_set_clean['Label'] == 'spam']
ham_messages = train_set_clean[train_set_clean['Label'] == 'ham']

# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(train_set_clean)
p_ham = len(ham_messages) / len(train_set_clean)

# N_Spam
n_words_per_spam_message = spam_messages['Content'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['Content'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

In [11]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
    n_word_given_spam = spam_messages[word].sum() # spam_messages already defined
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
    parameters_spam[word] = p_word_given_spam

    n_word_given_ham = ham_messages[word].sum() # ham_messages already defined
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
    parameters_ham[word] = p_word_given_ham

In [12]:
def classify(message):
    '''
    message: a string
    '''

    message = process_message(message)
    
    h_nb_spam = np.log(p_spam)
    h_nb_ham = np.long(p_ham)

    for word in message:
        if word in parameters_spam:
            h_nb_spam += np.log(parameters_spam[word])
            
        if word in parameters_ham: 
            h_nb_ham += np.log(parameters_ham[word])

    if (h_nb_ham > h_nb_spam):
        return 'ham'
    elif (h_nb_ham < h_nb_spam):
        return 'spam'
    else:
        return 'needs human classification'

In [13]:
test_set['predicted'] = test_set['Content'].apply(classify)
test_set.head()

Unnamed: 0,Label,Content,predicted
0,spam,Subject: i can see your on dialup\nhow are you...,spam
1,spam,Subject: message subject\n- - - - 158806477142...,spam
2,ham,"Subject: re : credit . com cv ' s\nshirley ,\n...",ham
3,ham,Subject: enron mentions\nenron taps $ 3 billio...,ham
4,spam,Subject: i want to mentor you\nthis week i sho...,spam


In [14]:
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1947
Incorrect: 53
Accuracy: 0.9735
