In [1]:
import glob
import numpy as np
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [2]:
# Step 1. Read the corpus from disk into two parallel lists
#----------------------------------------------------------------------

emails = []  # raw text of every message
labels = []  # class of the message at the same index (0 = ham, 1 = spam)

# Ham is read first and spam second; within each folder the files are taken
# in sorted filename order, so the lists are always built in the same order.
for pattern, label in (('enron1/ham/*.txt', 0), ('enron1/spam/*.txt', 1)):
    for path in sorted(glob.glob(pattern)):
        with open(path, 'r', encoding='ISO-8859-1') as handle:
            text = handle.read()
        emails.append(text)
        labels.append(label)

len(emails), len(labels)  # how many emails and labels were loaded

(5172, 5172)

In [3]:
# Step 2. Clean the texts
#----------------------------------------------------------------------

def letters_only(astr):
    """Tell whether ``astr`` consists solely of alphabetic characters.

    Delegates to ``astr.isalpha()``, so an empty string yields False.
    """
    is_alphabetic = astr.isalpha()
    return is_alphabetic

# Lemmatizer that clean_text() uses to reduce each kept token to its base form.
lemmatizer = WordNetLemmatizer()
# Words of the NLTK names corpus, held in a set for fast membership tests.
all_names = set(names.words())

def clean_text(docs):
    """Turn each document into a space-joined string of lemmatized tokens.

    Every document is split on whitespace. A token is kept only when it is
    purely alphabetic (see ``letters_only``) and, in its original casing, is
    not a member of ``all_names``. Kept tokens are lower-cased and then
    lemmatized with the module-level ``lemmatizer``.

    Args:
        docs: iterable of strings, one per document.

    Returns:
        A new list holding one cleaned string per input document, in the
        same order as ``docs``.
    """
    cleaned_docs = []
    for doc in docs:
        kept_tokens = []
        for token in doc.split():
            # Skip anything that is not letters-only or that is a known name.
            # The name test uses the token as written, before lower-casing.
            if not letters_only(token) or token in all_names:
                continue
            kept_tokens.append(lemmatizer.lemmatize(token.lower()))
        cleaned_docs.append(' '.join(kept_tokens))
    return cleaned_docs
            
# Clean every loaded email; clean_text() builds a new list, so `emails` keeps the raw text.
cleaned_emails = clean_text(emails)

In [4]:
# First email before cleaning (raw text as read from disk)
emails[0]

'Subject: christmas tree farm pictures\n'

In [5]:
# First email after cleaning (output of clean_text for the same message)
cleaned_emails[0]

'christmas tree farm picture'

In [6]:
# Step 3. Hold out part of the data for evaluation
#----------------------------------------------------------------------
# test_size=0.33 sends a third of the cleaned emails to the test set;
# the fixed random_state makes the split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    cleaned_emails,
    labels,
    test_size=0.33,
    random_state=42,
)

In [7]:
# Step 4. Drop stop words and convert the emails into document-term matrices
#----------------------------------------------------------------------
# NOTE(review): CountVectorizer is already imported in the first cell; this
# re-import is redundant and could be dropped once the imports are consolidated.
from sklearn.feature_extraction.text import CountVectorizer

# English stop words are removed and the vocabulary is capped at 500 terms.
cv = CountVectorizer(max_features=500, stop_words='english')

# The vocabulary is learned from the training emails only; the test emails
# are then encoded with that same vocabulary.
term_docs_train = cv.fit_transform(X_train)
term_docs_test = cv.transform(X_test)

In [8]:
# Step 5 (TODO). Apply the machine learning with a MultinomialNB object
#----------------------------------------------------------------------
# To achieve the goal:
# 1. Understand all of the code above
# 2. Look up the MultinomialNB class and learn how it works
# 3. Write the correct code below this comment
# 4. The code should train a Naive Bayes model and print the model accuracy on the test dataset
# The expected model accuracy is 0.9197422378441711
#
# Hint1: Create a MultinomialNB object with the parameters: alpha=1.0 and fit_prior=True
# Hint2: Use fit() and score() method to get a result
#----------------------------------------------------------------------

# TODO: Write code here

In [9]:
# Step 5. Train a MultinomialNB classifier (alpha=1.0, fit_prior=True)
# and report its accuracy on the test dataset
#----------------------------------------------------------------------
# fit() returns the fitted estimator, so construction and training are chained.
clf = MultinomialNB(alpha=1.0, fit_prior=True).fit(term_docs_train, Y_train)
clf.score(term_docs_test, Y_test)  # mean accuracy on the test set

0.9179847685998829

In [10]:
# Predict labels for the test set and compute the confusion matrix.
# In scikit-learn's layout, rows are the true classes and columns the predicted ones.
Y_predict = clf.predict(term_docs_test)
cfs_mat = confusion_matrix(y_true=Y_test, y_pred=Y_predict)
print(cfs_mat)

[[1138   91]
 [  49  429]]
