In [1]:
# Imports
import mailbox
import csv
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import re
import pandas as pd
import numpy as np
import time
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# for cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# to make prior
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

# to run models
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import confusion_matrix 
from sklearn.ensemble import RandomForestClassifier

In [2]:
# parsing emails into a csv file, uses cleanMe() to remove unwanted data


def createCSV(name, mbox, mes_type):
    #createing SPAM csv
    writer = csv.writer(open(name, "w", encoding="utf-8"))
    writer.writerow(['Subject', 'From', 'Date', 'Message-ID', 'Payload', 'Type'])
    for message in mailbox.mbox(mbox):

        if message.is_multipart():
            content = ''.join(str(part.get_payload(decode=True)) for part in message.get_payload())
            content = cleanMe(content)
        else:
            content = str(message.get_payload(decode=True))
            content = cleanMe(content)
        content = content.split()
        print(content)
        writer.writerow([message['subject'], message['from'], message['date'], message['Message-Id'], content, mes_type])
    

In [3]:
# Cleaning emails, removing stop words
## getting stop words
stop_words = set(stopwords.words('english'))

## cleanMe: cleans body of email, removing javascript, css, blanklines, escape characters, links
## and stop words. Also changes everything to lowercase

def cleanMe(html):
    soup = bs(html,"lxml") # create a new bs4 object from the html data loaded
    for script in soup(["script", "style"]): # remove all javascript and stylesheet code
        script.extract()
    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n '.join(chunk for chunk in chunks if chunk)
    text = text.replace('\\n', ' ').replace('\\t', ' ').replace("\n", ' ').replace('\\b', ' ')
    # gets rid of escape characters
    text = bytes(text, "utf-8").decode("unicode_escape")
    # gets rid of links 
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    text = " ".join(re.findall(r'\s+[a-zA-Z]+\s+', text))
    # make text lower case
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    # remove stop words before csv was written
    text = word_tokenize(text)
    resultwords  = [word for word in text if word not in stop_words]
    result = ' '.join(resultwords)
    return result


In [4]:
# Writes CSV calling createCSV
#createCSV('./data/spam_t1.csv', './data/Spam.mbox', 'Spam')
#createCSV('./data/inbox_t1.csv', './data/Inbox.mbox', 'Ham')

In [5]:
# importing CSV into dataframe
spam = pd.read_csv("./data/spam_t1.csv")
ham = pd.read_csv("./data/inbox_t1.csv")
print('spam shape: ', spam.shape)
print('ham shape: ', ham.shape)

# ham has two years worth of emails, index starting from today and 
# ending way back to 2016. Trimming dataset to be more managable
ham = ham[0 : 1000]
print('ham new shape: ', ham.shape)

spam shape:  (707, 6)
ham shape:  (16238, 6)
ham new shape:  (1000, 6)


In [6]:
# getPrior takes data and result and returns the probablity of a single word 
# in the labeled email (hame or spam): columns are words, rows are emails
vec = TfidfVectorizer()
def getPrior(data, result):
    X = vec.fit_transform(data)
    prior = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
    prior['RESULT'] = result
    return prior

In [7]:
# 0 == ham, 1 == spam
spam_prior = getPrior(spam.Payload, 0)
ham_prior = getPrior(ham.Payload, 1)

# concat ham and spam prior to make prior feature set
# replace NaN with zero because it means that spam did not have that word, or spam did not have that word
frames = [spam_prior, ham_prior]
prior = pd.concat(frames).fillna(0)

In [8]:
# split target and data
target = prior.RESULT
data = prior.drop(["RESULT"], axis = 1)

# spliting train and test data
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.33)


In [13]:
# running Multinomial Naive Bayse, Random Forest, Linear SVC
# without cross validation

model1 = MultinomialNB()
model2 = LinearSVC()
model3 = RandomForestClassifier()

model1.fit(X_train,y_train)
model2.fit(X_train, y_train)
# for extra credit :)
model3.fit(X_train, y_train)

result1 = model1.predict(X_test)
result2 = model2.predict(X_test)
result3 = model3.predict(X_test)

# cheking results without cross validation

print("Confusion Matrix for MultinomialNB:\n", confusion_matrix(y_test,result1))
print("Confusion Matrix for SVC:\n", confusion_matrix(y_test,result2))
print("Confusion Matrix for RandomForest:\n", confusion_matrix(y_test, result3))



Confusion Matrix for MultinomialNB:
 [[122 118]
 [  2 322]]
Confusion Matrix for SVC:
 [[214  26]
 [ 33 291]]
Confusion Matrix for RandomForest:
 [[213  27]
 [ 24 300]]


In [14]:
# creating parameters for pipeline for cross validation: 
# random forest
pg_3 = {'randomforestclassifier__n_estimators': np.arange(1,10)}

# multinomial Niave Bayse
pg_1 = {'multinomialnb__alpha': np.arange(.5,1.5),
        'multinomialnb__fit_prior': [True, False]}
# svc classifier
pg_2 = {'linearsvc__dual': [True, False],
        'linearsvc__fit_intercept': [True, False]}

In [15]:
# pipline using cross validation GridSearchCV to find best parameters 
def run_model(model, param_grid, xtrain, ytrain, name, do_pca = False):
    scaler = MinMaxScaler()
    if(do_pca == True):
        pca = PCA(n_components = 10)
        pipe = make_pipeline(pca, model)
    else:
        pipe = make_pipeline(model)
    grid = GridSearchCV(pipe,param_grid)
    grid.fit(xtrain, ytrain)
    grid.best_params_
    accuracy = grid.score(xtrain, ytrain)
    print(f"In-sample accuracy: {accuracy:0.2%}", name)
    return(grid)

In [16]:
grid3 = run_model(model3, pg_3, X_train, y_train, "Random Forest")
grid1 = run_model(model1, pg_1, X_train, y_train, "Multinomial NB")
grid2 = run_model(model2, pg_2, X_train, y_train, "Linear SVC")

In-sample accuracy: 99.04% Random Forest
In-sample accuracy: 95.19% Multinomial NB
In-sample accuracy: 98.86% Linear SVC


In [17]:
# pipeline REALLY helps naive Bayse, however, I think it is overfitting 
# in linearCSV and Random Forest
preds3 = grid3.predict(X_test)
preds2 = grid2.predict(X_test)
preds1 = grid1.predict(X_test)
matrix3 = confusion_matrix(y_test, preds3)
print(matrix3)
matrix2 = confusion_matrix(y_test, preds2)
print(matrix2)
matrix1 = confusion_matrix(y_test, preds1)
print(matrix1)

[[212  28]
 [ 26 298]]
[[214  26]
 [ 33 291]]
[[183  57]
 [ 23 301]]
