In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Directory that holds the raw e-mail dump. A raw string keeps the Windows
# backslashes literal; in a normal string "\E" and "\e" are invalid escape
# sequences that newer Python versions warn about.
DATA_DIR = r'E:\EEP595EnronEmailProject\emails'

# Print the full path of every file found below DATA_DIR.
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

E:\EEP595EnronEmailProject\emails\emails.csv


In [2]:
import matplotlib.pyplot as plt
import re
import string
import time
# Cap the number of rows pandas shows when a frame or series is displayed.
pd.set_option('display.max_rows', 50)

from nltk.corpus import stopwords
# English stopwords from NLTK; a later cell drops these words from the e-mail text.
stop = stopwords.words('english')

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
# Load the cleaned e-mail table from the notebook's working directory.
# The forward-slash relative path avoids the invalid "\c" escape sequence of a
# backslash literal and also resolves on non-Windows systems.
df = pd.read_csv("./cleaned_data.csv")

# view first 5 rows of the dataframe
df.head()

Unnamed: 0,subject,X-Folder,body
0,Re:,'sent mail,Traveling to have a business meeting takes the...
1,Re: test,'sent mail,test successful. way to go!!!
2,Re: Hello,'sent mail,Let's shoot for Tuesday at 11:45.
3,Re: Hello,'sent mail,"Greg,\n\n How about either next Tuesday or Thu..."
4,Re: PRC review - phone calls,'sent mail,any morning between 10 and 11:30


In [4]:
def remove_folders(emails, n):
    """Drop every e-mail that lives in a folder holding ``n`` or fewer e-mails.

    Parameters
    ----------
    emails : pandas.DataFrame
        Frame with an 'X-Folder' column naming the folder of each e-mail.
    n : int
        Size threshold; only folders with strictly more than ``n`` e-mails
        are kept.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows of ``emails`` whose folder is not a small folder.
        The input frame is left unmodified and the original index is kept.
    """
    # Count on the frame that was passed in, not on a notebook-level global.
    email_count = emails['X-Folder'].value_counts()
    small_folders = email_count.index[email_count <= n]
    # Copy so that columns can be added to the result later without writing
    # into a view of the caller's frame.
    return emails.loc[~emails['X-Folder'].isin(small_folders)].copy()

In [5]:
# Size threshold handed to remove_folders together with the e-mail frame.
n = 150
df = remove_folders(emails=df, n=n)

In [6]:
# Number of distinct folder values (missing values counted as one, like unique()).
folder_total = df['X-Folder'].nunique(dropna=False)
print("Total folders: ", folder_total)
print("df.shape: ", df.shape)

Total folders:  82
df.shape:  (460141, 3)


In [7]:
# Join subject and body into one text field. A missing subject or body is
# treated as empty text: NaN would otherwise propagate through the
# concatenation, and preprocess() calls .lower() on every value.
df['text'] = df['subject'].fillna('') + " " + df['body'].fillna('')

In [8]:
# drop the columns 'subject' and 'body'
# Rebind instead of mutating in place: `df` is a filtered selection of the
# loaded frame, and an in-place drop on such a selection is a common source
# of SettingWithCopy warnings.
df = df.drop(columns=['subject', 'body'])

In [9]:
# Translation table that maps every ASCII punctuation character to a space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocess(x):
    """Normalise one e-mail text for bag-of-words style vectorisation.

    The text is lower-cased, every character of ``string.punctuation`` is
    replaced by a space, and each run of whitespace (including newlines) is
    collapsed to a single space. Leading/trailing spaces are not stripped.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The normalised text.
    """
    # lowercasing all the words
    x = x.lower()

    # Replace punctuation with spaces. A translation table is used instead of
    # the character class "[" + string.punctuation + "]": inside that class the
    # sequence "\]" is read as an escaped bracket, so backslashes were never
    # removed.
    x = x.translate(_PUNCT_TO_SPACE)

    # Collapse newlines and any other whitespace runs into one space.
    return re.sub(r'\s+', ' ', x)

In [10]:
start = time.time()
df.loc[:,'text'] = df.loc[:, 'text'].map(preprocess)

# remove stopwords
# Test membership against a set: the lookup is constant-time, whereas the
# `stop` list would be scanned once for every token of every e-mail.
stop_set = set(stop)
df.loc[:, 'text'] = df.loc[:, 'text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_set])
)
end = time.time()
print("Execution time (sec): ",(end - start))

Execution time (sec):  314.1476950645447


In [20]:
start = time.time()
# Order the folders by e-mail count (ascending) and keep positions 50..69.
folder_counts = df['X-Folder'].value_counts().sort_values()
folders_dict = dict(folder_counts.iloc[50:70])
# Keep only the e-mails that belong to one of the selected folders.
in_selected_folder = df['X-Folder'].isin(folders_dict.keys())
data = df[in_selected_folder]
end = time.time()
elapsed = end - start
print("Execution time (sec): ", elapsed)

Execution time (sec):  0.07878589630126953


In [21]:
# report how many e-mails were selected, then write them to disk
row_count = len(data)
print("Number of instances: ", row_count)
data.to_csv('preprocessed.csv', index=False)

Number of instances:  13586


In [22]:
data = pd.read_csv("preprocessed.csv")
# An e-mail whose text is empty after preprocessing is stored as an empty CSV
# field, which read_csv turns into NaN. Restore the empty string so the text
# vectorizers only ever receive strings.
data['text'] = data['text'].fillna('')

In [23]:
# number of e-mails per folder in the selected subset
folder_sizes = data['X-Folder'].value_counts()
folder_sizes

logistics              1170
tw-commercial group    1150
california             1014
bill williams iii      1004
deal discrepancies      878
management              799
calendar                700
esvl                    663
tufco                   604
resumes                 599
e-mail bin              592
ces                     572
online trading          567
junk                    544
junk file               494
ooc                     473
genco-jv_ipo            465
projects                459
corporate               420
archives                419
Name: X-Folder, dtype: int64

In [24]:
def label_encoder(data):
    """Encode the 'X-Folder' column of ``data`` as integer class labels.

    A fresh ``LabelEncoder`` is fitted on the column; only the encoded array
    is returned, the fitted encoder itself is discarded.
    """
    folder_names = data['X-Folder']
    return LabelEncoder().fit_transform(folder_names)

In [25]:
# model input: the preprocessed text; target: integer-encoded folder names
input_data = data['text']
y = label_encoder(data=data)

In [26]:
# Unigram bag-of-words features (min_df=5, at most 5000 features).
start = time.time()
vectorizer = CountVectorizer(max_features=5000, min_df=5)
X = vectorizer.fit_transform(input_data)
end = time.time()
elapsed = end - start
print("Execution time (sec): ", elapsed)

Execution time (sec):  1.8430697917938232


In [27]:
start = time.time()
# Convert the vectorizer output into a dense array.
dense_X = X.toarray()
X = dense_X
print("X.shape: ", X.shape)
end = time.time()
elapsed = end - start
print("Execution time (sec): ", elapsed)

X.shape:  (13586, 5000)
Execution time (sec):  0.18051910400390625


In [28]:
# Result tables: one row per algorithm and an empty 'BoW' placeholder column.
# Three separate frames hold the F1, Jaccard and accuracy scores.
algorithm_names = ['Gaussian NB', 'Multinomial NB', 'Decision Tree', 'SVM', 'AdaBoost', 'ANN']

f1_data = dict(Algorithm=list(algorithm_names), BoW='')
jaccard_data = dict(Algorithm=list(algorithm_names), BoW='')
acc_data = dict(Algorithm=list(algorithm_names), BoW='')

f1_df = pd.DataFrame(f1_data)
jacc_df = pd.DataFrame(jaccard_data)
acc_df = pd.DataFrame(acc_data)
acc_df

Unnamed: 0,Algorithm,BoW
0,Gaussian NB,
1,Multinomial NB,
2,Decision Tree,
3,SVM,
4,AdaBoost,
5,ANN,


In [29]:
# Fixed seed so the stochastic estimators give repeatable scores across runs.
SEED = 42

models = [
    GaussianNB(),
    MultinomialNB(),
    DecisionTreeClassifier(random_state=SEED),
    LinearSVC(random_state=SEED),
    # The base estimator is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases.
    AdaBoostClassifier(DecisionTreeClassifier(random_state=SEED), n_estimators=5,
                       random_state=SEED),
    MLPClassifier(hidden_layer_sizes=(10,), random_state=SEED),
]

names = ["Gaussian NB", "Multinomial NB", "Decision Tree", "SVM", "AdaBoost", "ANN"]

# Metrics requested from cross_validate; each comes back under 'test_<key>'.
# Defined once because it does not change between models.
scoring = {
    'acc': 'accuracy',
    'f1_mac': 'f1_macro',
    'jacc_mac': 'jaccard_macro'
}

jacc_scores = []
acc_scores = []
f1_scores = []
exec_times = []

for model, name in zip(models, names):
    print(name)
    start = time.time()
    scores = cross_validate(model, X, y, cv=10, n_jobs=4, scoring=scoring)
    # Wall-clock time of the whole 10-fold cross-validation call.
    training_time = (time.time() - start)

    # Average each metric over the folds once and reuse the value.
    mean_acc = scores['test_acc'].mean()
    mean_f1 = scores['test_f1_mac'].mean()
    mean_jacc = scores['test_jacc_mac'].mean()

    print("accuracy: ", mean_acc)
    print("f1_score: ", mean_f1)
    print("Jaccard_index: ", mean_jacc)
    print("time (sec): ", training_time)
    print("\n")

    jacc_scores.append(mean_jacc)
    acc_scores.append(mean_acc)
    f1_scores.append(mean_f1)
    exec_times.append(training_time)

# Store the unigram bag-of-words results; timings go into the accuracy table.
acc_df['BoW'] = acc_scores
jacc_df['BoW'] = jacc_scores
f1_df['BoW'] = f1_scores
acc_df['time'] = exec_times
acc_df

Gaussian NB
accuracy:  0.5852325249983473
f1_score:  0.5621318716847321
Jaccard_index:  0.4130840479833916
time (sec):  272.79186511039734


Multinomial NB
accuracy:  0.7377434135166094
f1_score:  0.7038215928245647
Jaccard_index:  0.5770619060081098
time (sec):  282.37522053718567


Decision Tree
accuracy:  0.657735697542484
f1_score:  0.6389372213085271
Jaccard_index:  0.49339469800975644
time (sec):  173.0333297252655


SVM
accuracy:  0.7371561541937728
f1_score:  0.718565266331326
Jaccard_index:  0.587187559123094
time (sec):  63.77345585823059


AdaBoost
accuracy:  0.6657573846315568
f1_score:  0.6453542894393619
Jaccard_index:  0.5034223140112702
time (sec):  744.1982271671295


ANN
accuracy:  0.7367147072752317
f1_score:  0.7164593631485092
Jaccard_index:  0.5848075641734207
time (sec):  559.2389423847198




Unnamed: 0,Algorithm,BoW,time
0,Gaussian NB,0.585233,272.791865
1,Multinomial NB,0.737743,282.375221
2,Decision Tree,0.657736,173.03333
3,SVM,0.737156,63.773456
4,AdaBoost,0.665757,744.198227
5,ANN,0.736715,559.238942


In [30]:
# save the results: write each score table to its own CSV file, without the index
for frame, path in ((acc_df, "accuracy.csv"),
                    (f1_df, "f1_score.csv"),
                    (jacc_df, "jacc_score.csv")):
    frame.to_csv(path, index=False)

In [31]:
# Bag-of-words over bigrams only (ngram_range=(2,2)), converted to a dense array.
start = time.time()
vectorizer = CountVectorizer(ngram_range=(2,2), max_features=5000, min_df=5)
X = vectorizer.fit_transform(input_data).toarray()
print("X.shape: ", X.shape)

end = time.time()
elapsed = end - start
print("Execution time (sec): ", elapsed)

X.shape:  (13586, 5000)
Execution time (sec):  13.650284767150879


In [32]:
# Fixed seed so the stochastic estimators give repeatable scores across runs.
SEED = 42

models = [
    GaussianNB(),
    MultinomialNB(),
    DecisionTreeClassifier(random_state=SEED),
    LinearSVC(random_state=SEED),
    # The base estimator is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases.
    AdaBoostClassifier(DecisionTreeClassifier(random_state=SEED), n_estimators=5,
                       random_state=SEED),
    MLPClassifier(hidden_layer_sizes=(10,), random_state=SEED),
]

names = ["Gaussian NB", "Multinomial NB", "Decision Tree", "SVM", "AdaBoost", "ANN"]

# Metrics requested from cross_validate; each comes back under 'test_<key>'.
# Defined once because it does not change between models.
scoring = {
    'acc': 'accuracy',
    'f1_mac': 'f1_macro',
    'jacc_mac': 'jaccard_macro'
}

jacc_scores = []
acc_scores = []
f1_scores = []
exec_times = []

for model, name in zip(models, names):
    print(name)
    start = time.time()
    scores = cross_validate(model, X, y, cv=10, n_jobs=4, scoring=scoring)
    # Wall-clock time of the whole 10-fold cross-validation call.
    training_time = (time.time() - start)

    # Average each metric over the folds once and reuse the value.
    mean_acc = scores['test_acc'].mean()
    mean_f1 = scores['test_f1_mac'].mean()
    mean_jacc = scores['test_jacc_mac'].mean()

    print("accuracy: ", mean_acc)
    print("f1_score: ", mean_f1)
    print("Jaccard_index: ", mean_jacc)
    print("time (sec): ", training_time)
    print("\n")

    jacc_scores.append(mean_jacc)
    acc_scores.append(mean_acc)
    f1_scores.append(mean_f1)
    exec_times.append(training_time)

# Store the bigram bag-of-words results; timings go into the accuracy table.
acc_df['BoWBi'] = acc_scores
jacc_df['BoWBi'] = jacc_scores
f1_df['BoWBi'] = f1_scores
acc_df['BoWBi_time'] = exec_times
acc_df

Gaussian NB
accuracy:  0.5833930454364673
f1_score:  0.5621651556732388
Jaccard_index:  0.4068105548950894
time (sec):  29.256702184677124


Multinomial NB
accuracy:  0.6374178145803735
f1_score:  0.6170933752131809
Jaccard_index:  0.4707424107547659
time (sec):  118.23734021186829


Decision Tree
accuracy:  0.590826822980165
f1_score:  0.5787776354705647
Jaccard_index:  0.4305232441708373
time (sec):  383.3654010295868


SVM
accuracy:  0.6322652344431547
f1_score:  0.6189001484588367
Jaccard_index:  0.47175795478690324
time (sec):  36.95018434524536


AdaBoost
accuracy:  0.5791974303205272
f1_score:  0.5659581642731262
Jaccard_index:  0.41918704137092505
time (sec):  736.5310423374176


ANN
accuracy:  0.6151167528753382
f1_score:  0.601077031643569
Jaccard_index:  0.4530523549573748
time (sec):  777.6779088973999




Unnamed: 0,Algorithm,BoW,time,BoWBi,BoWBi_time
0,Gaussian NB,0.585233,272.791865,0.583393,29.256702
1,Multinomial NB,0.737743,282.375221,0.637418,118.23734
2,Decision Tree,0.657736,173.03333,0.590827,383.365401
3,SVM,0.737156,63.773456,0.632265,36.950184
4,AdaBoost,0.665757,744.198227,0.579197,736.531042
5,ANN,0.736715,559.238942,0.615117,777.677909


In [33]:
# save the results: write each score table to its own CSV file, without the index
for frame, path in ((acc_df, "accuracy.csv"),
                    (f1_df, "f1_score.csv"),
                    (jacc_df, "jacc_score.csv")):
    frame.to_csv(path, index=False)

In [34]:
# TF-IDF features (min_df=5, at most 5000 features), converted to a dense array.
start = time.time()
vectorizer = TfidfVectorizer(max_features=5000, min_df=5)
X = vectorizer.fit_transform(input_data).toarray()
print("X.shape: ", X.shape)

end = time.time()
elapsed = end - start
print("Execution time (sec): ", elapsed)

X.shape:  (13586, 5000)
Execution time (sec):  4.879719495773315


In [35]:
# Fixed seed so the stochastic estimators give repeatable scores across runs.
SEED = 42

models = [
    GaussianNB(),
    MultinomialNB(),
    DecisionTreeClassifier(random_state=SEED),
    LinearSVC(random_state=SEED),
    # The base estimator is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases.
    AdaBoostClassifier(DecisionTreeClassifier(random_state=SEED), n_estimators=5,
                       random_state=SEED),
    MLPClassifier(hidden_layer_sizes=(10,), random_state=SEED),
]

names = ["Gaussian NB", "Multinomial NB", "Decision Tree", "SVM", "AdaBoost", "ANN"]

# Metrics requested from cross_validate; each comes back under 'test_<key>'.
# Defined once because it does not change between models.
scoring = {
    'acc': 'accuracy',
    'f1_mac': 'f1_macro',
    'jacc_mac': 'jaccard_macro'
}

jacc_scores = []
acc_scores = []
f1_scores = []
exec_times = []

for model, name in zip(models, names):
    print(name)
    start = time.time()
    scores = cross_validate(model, X, y, cv=10, n_jobs=4, scoring=scoring)
    # Wall-clock time of the whole 10-fold cross-validation call.
    training_time = (time.time() - start)

    # Average each metric over the folds once and reuse the value.
    mean_acc = scores['test_acc'].mean()
    mean_f1 = scores['test_f1_mac'].mean()
    mean_jacc = scores['test_jacc_mac'].mean()

    print("accuracy: ", mean_acc)
    print("f1_score: ", mean_f1)
    print("Jaccard_index: ", mean_jacc)
    print("time (sec): ", training_time)
    print("\n")

    jacc_scores.append(mean_jacc)
    acc_scores.append(mean_acc)
    f1_scores.append(mean_f1)
    exec_times.append(training_time)

# Store the TF-IDF results; timings go into the accuracy table.
acc_df['TfIdf'] = acc_scores
jacc_df['TfIdf'] = jacc_scores
f1_df['TfIdf'] = f1_scores
acc_df['TfIdf_time'] = exec_times
acc_df

Gaussian NB
accuracy:  0.6093018127120674
f1_score:  0.5877402363957523
Jaccard_index:  0.44084640698807825
time (sec):  73.84895420074463


Multinomial NB
accuracy:  0.7368567808999297
f1_score:  0.6967070564788325
Jaccard_index:  0.5701299709091912
time (sec):  7.48159122467041


Decision Tree
accuracy:  0.6461798884001384
f1_score:  0.6305718598295987
Jaccard_index:  0.48501956205184377
time (sec):  176.99774599075317


SVM
accuracy:  0.7947884663526091
f1_score:  0.7771822256420796
Jaccard_index:  0.6613918628186176
time (sec):  13.632520914077759


AdaBoost
accuracy:  0.6581030190916175
f1_score:  0.6384217127299128
Jaccard_index:  0.4969775044000473
time (sec):  744.375152349472


ANN
accuracy:  0.749522465730563
f1_score:  0.7306690801729734
Jaccard_index:  0.6033280521106238
time (sec):  801.2466208934784




Unnamed: 0,Algorithm,BoW,time,BoWBi,BoWBi_time,TfIdf,TfIdf_time
0,Gaussian NB,0.585233,272.791865,0.583393,29.256702,0.609302,73.848954
1,Multinomial NB,0.737743,282.375221,0.637418,118.23734,0.736857,7.481591
2,Decision Tree,0.657736,173.03333,0.590827,383.365401,0.64618,176.997746
3,SVM,0.737156,63.773456,0.632265,36.950184,0.794788,13.632521
4,AdaBoost,0.665757,744.198227,0.579197,736.531042,0.658103,744.375152
5,ANN,0.736715,559.238942,0.615117,777.677909,0.749522,801.246621


In [36]:
# save the results: write each score table to its own CSV file, without the index
for frame, path in ((acc_df, "accuracy.csv"),
                    (f1_df, "f1_score.csv"),
                    (jacc_df, "jacc_score.csv")):
    frame.to_csv(path, index=False)

In [37]:
# Display the table of mean macro-Jaccard scores, one column per feature representation.
jacc_df

Unnamed: 0,Algorithm,BoW,BoWBi,TfIdf
0,Gaussian NB,0.413084,0.406811,0.440846
1,Multinomial NB,0.577062,0.470742,0.57013
2,Decision Tree,0.493395,0.430523,0.48502
3,SVM,0.587188,0.471758,0.661392
4,AdaBoost,0.503422,0.419187,0.496978
5,ANN,0.584808,0.453052,0.603328
