# DS7333 Case Study 
##  Naive Bayes and Clustering

#### John Girard, Shijo Joseph, Douglas Yip

In [46]:
%load_ext pycodestyle_magic
%pycodestyle_on

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import email
from bs4 import BeautifulSoup as BS4

In [48]:
# Tally the corpus: print the number of files in each sub-folder, then the
# grand total found by walking the whole tree.
# The path is built with os.path.join instead of a ".\S..." literal, which
# contained an invalid escape sequence (W605) and only resolved on Windows.
corpus_root = os.path.join(".", "SpamAssassinMessages")

count = 0
for root_dir, cur_dir, files in os.walk(corpus_root):
    count += len(files)
    for names in cur_dir:
        print(names, len(os.listdir(os.path.join(root_dir, names))))
print('file count:', count)

3:43: W605 invalid escape sequence '\S'


easy_ham 1000
easy_ham_2 1000
hard_ham 501
spam 1000
spam_2 1000
file count: 4501


In [49]:
# List the corpus sub-folders. os.path.join avoids the invalid "\S" escape
# (W605) and keeps the path portable across operating systems.
os.listdir(os.path.join(".", "SpamAssassinMessages"))

['easy_ham', 'easy_ham_2', 'hard_ham', 'spam', 'spam_2']

1:14: W605 invalid escape sequence '\S'


In [50]:
# Parse every email in the corpus into five parallel lists (one entry per
# email): file path, extracted body text, MIME content type, numeric label
# and label name. Labels are encoded ham=1, spam=0.
file_name = []
contents = []
types = []
labels = []
labelnames = []

# Built with os.path.join: the old ".\S..." literal contained an invalid
# escape sequence (W605) and only resolved on Windows.
corpus_root = os.path.join(".", "SpamAssassinMessages")

for root, dirs, files in os.walk(corpus_root):
    # Label from the top-level corpus folder (easy_ham, spam_2, ...) rather
    # than the whole path, so a parent directory whose name happens to
    # contain "ham" or "spam" cannot mislabel every email.
    folder = os.path.relpath(root, corpus_root).split(os.sep)[0]
    if "ham" in folder:
        labelname, label = 'ham', 1
    elif "spam" in folder:
        labelname, label = 'spam', 0
    else:
        # Skip unlabeled files completely; appending to only some of the
        # lists would leave them with different lengths.
        for name in files:
            print("YOU HAVE A BIG PROBLEM--LABEL NOT FOUND:",
                  os.path.join(root, name))
        continue

    for name in files:
        path = os.path.join(root, name)
        with open(path, 'r', encoding='latin1') as f:
            try:
                x = email.message_from_file(f)
            except UnicodeDecodeError:
                # Skip this file; falling through would silently reuse the
                # message parsed on the previous iteration.
                print("Error in file:", path)
                continue

        content_type = x.get_content_type()
        if "multipart" in content_type:
            # Concatenate the serialized sub-parts; a message that claims
            # to be multipart but has no parts yields an empty body.
            body = ''
            if x.is_multipart():
                body = ''.join(part.as_string()
                               for part in x.get_payload())
        elif "text/plain" in content_type:
            body = x.get_payload().replace("\n", " ")
        elif "text/html" in content_type:
            # NOTE(review): no parser is named, so bs4 picks one from what
            # is installed -- consider pinning one (e.g. "html.parser") for
            # reproducible text extraction.
            body = BS4(x.get_payload()).get_text()
        else:
            # Any other content type still gets a body entry so that the
            # lists stay aligned: the raw payload when it is a string.
            payload = x.get_payload()
            body = payload if isinstance(payload, str) else ''

        file_name.append(path)
        contents.append(body)
        types.append(content_type)
        labelnames.append(labelname)
        labels.append(label)

8:36: W605 invalid escape sequence '\S'


In [51]:
# Tabulate how many emails carry each MIME content type.
# `types` is rebound from the list of content-type strings to a one-column
# DataFrame here; the name is kept because the frame-building cell reads it.
types = pd.DataFrame(types)
# A bare `types.shape` in the middle of a cell is discarded, so print it.
print(types.shape)
types.value_counts()

text/plain                     2996
text/html                      1008
multipart/alternative           265
multipart/mixed                 141
multipart/related                45
multipart/signed                 45
text/plain charset=us-ascii       1
dtype: int64

In [52]:
# Assemble the modelling frame from the parallel per-email sequences,
# adding the columns one at a time in their display order.
df_NB = pd.DataFrame()
for column, values in (
        ('Filename', file_name),
        ('types', types),
        ('email_body', contents),
        ('labelnames', labelnames),
        ('labels', labels),
):
    df_NB[column] = values

In [53]:
# Preview the assembled frame: one row per email with its path, content
# type, extracted body text and ham/spam label (ham=1, spam=0).
df_NB

Unnamed: 0,Filename,types,email_body,labelnames,labels
0,.\SpamAssassinMessages\easy_ham\00001.7c53336b...,text/plain,"Date: Wed, 21 Aug 2002 10:54:46 -05...",ham,1
1,.\SpamAssassinMessages\easy_ham\00002.9c4069e2...,text/plain,"Martin A posted: Tassos Papadopoulos, the Gree...",ham,1
2,.\SpamAssassinMessages\easy_ham\00003.860e3c3c...,text/plain,Man Threatens Explosion In Moscow Thursday A...,ham,1
3,.\SpamAssassinMessages\easy_ham\00004.864220c5...,text/plain,Klez: The Virus That Won't Die Already the m...,ham,1
4,.\SpamAssassinMessages\easy_ham\00005.bf27cdea...,text/plain,"> in adding cream to spaghetti carbonara, whi...",ham,1
...,...,...,...,...,...
4496,.\SpamAssassinMessages\spam_2\00999.f46c3f4b40...,text/html,"\nHello, \nPremium Phone Qualified \nBusiness ...",spam,0
4497,.\SpamAssassinMessages\spam_2\01000.a6f2693762...,text/plain,Below is the result of your feedback form. It...,spam,0
4498,.\SpamAssassinMessages\spam_2\01001.742869a142...,text/plain,"Hi, I'm a college dropout. I work about two ...",spam,0
4499,.\SpamAssassinMessages\spam_2\01002.406c1c709e...,text/html,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spam,0


### Naive Bayes Portion

##### Using Count Vectorizer

In [73]:
# Import Naive Bayes

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [74]:
# Hold out 20% of the emails for evaluation; the fixed random_state makes
# the split repeatable from run to run.
training_data, testing_data = train_test_split(
    df_NB,
    test_size=0.2,
    random_state=25,
)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 3600
No. of testing examples: 901


In [75]:
# Bag-of-words features: the vectorizer is fitted on the training emails
# only and the same fitted object is reused later to encode the test set.
the_count = CountVectorizer()

Xtrain = the_count.fit_transform(training_data['email_body'])

In [76]:
# Keep the document-term matrix sparse. MultinomialNB accepts SciPy sparse
# input directly, whereas .toarray() allocated a dense
# (emails x vocabulary) array and made this cell fail when run twice,
# because the resulting ndarray has no .toarray() method.
Xtrain = Xtrain.tocsr()

In [77]:
# Fit a multinomial Naive Bayes model on the training term counts, using
# the numeric `labels` column (ham=1, spam=0) as the target.
nb = MultinomialNB()
nb.fit(Xtrain, training_data['labels'])

In [78]:
# Encode the held-out emails with the vocabulary learned on the training
# split, then classify them. The sparse matrix is passed straight to
# predict(); densifying it first only costs memory.
Xtest = the_count.transform(testing_data['email_body'])
preds = nb.predict(Xtest)

In [79]:
# Sanity check: there should be one prediction per test email.
preds.shape

(901,)

In [80]:
# The loader encodes ham=1 and spam=0, so name the rows explicitly instead
# of leaving bare 0/1 that are easy to read the wrong way round.
print(classification_report(testing_data['labels'], preds,
                            labels=[0, 1],
                            target_names=['spam', 'ham']))

              precision    recall  f1-score   support

           0       0.99      0.94      0.96       412
           1       0.95      0.99      0.97       489

    accuracy                           0.97       901
   macro avg       0.97      0.96      0.97       901
weighted avg       0.97      0.97      0.97       901



In [81]:
# Compute, print and plot the confusion matrix for the count-vector model.
# Rows are true classes and columns predicted classes, ordered
# spam (0) then ham (1) to match the label encoding.
cnf_matrix = confusion_matrix(testing_data['labels'], preds,
                              labels=[0, 1])

print(cnf_matrix)

ConfusionMatrixDisplay(confusion_matrix=cnf_matrix,
                       display_labels=['spam', 'ham']).plot()
plt.show()

[[386  26]
 [  4 485]]


##### Using TF-IDF Vectorizer

In [82]:
# TF-IDF features: the vectorizer is fitted on the training emails only
# and the same fitted object is reused later to encode the test set.
# Note that this rebinds Xtrain, replacing the count-based matrix.
tfidf = TfidfVectorizer()

Xtrain = tfidf.fit_transform(training_data['email_body'])

In [83]:
# Keep the TF-IDF matrix sparse. MultinomialNB accepts SciPy sparse input
# directly, whereas .toarray() allocated a dense (emails x vocabulary)
# array and made this cell fail when run twice, because the resulting
# ndarray has no .toarray() method.
Xtrain = Xtrain.tocsr()

In [84]:
# Fit a fresh multinomial Naive Bayes model on the TF-IDF training
# features; this rebinds `nb`, replacing the count-based model.
nb = MultinomialNB()
nb.fit(Xtrain, training_data['labels'])

In [85]:
# Encode the held-out emails with the TF-IDF vectorizer fitted on the
# training split, then classify them. The sparse matrix is passed straight
# to predict(); densifying it first only costs memory.
Xtest = tfidf.transform(testing_data['email_body'])
preds = nb.predict(Xtest)

In [86]:
# Sanity check: there should be one prediction per test email.
preds.shape

(901,)

In [87]:
# The loader encodes ham=1 and spam=0, so name the rows explicitly instead
# of leaving bare 0/1 that are easy to read the wrong way round.
print(classification_report(testing_data['labels'], preds,
                            labels=[0, 1],
                            target_names=['spam', 'ham']))

              precision    recall  f1-score   support

           0       0.97      0.90      0.94       412
           1       0.92      0.98      0.95       489

    accuracy                           0.94       901
   macro avg       0.95      0.94      0.94       901
weighted avg       0.95      0.94      0.94       901



In [88]:
# Compute, print and plot the confusion matrix for the TF-IDF model.
# Rows are true classes and columns predicted classes, ordered
# spam (0) then ham (1) to match the label encoding.
cnf_matrix = confusion_matrix(testing_data['labels'], preds,
                              labels=[0, 1])

print(cnf_matrix)

ConfusionMatrixDisplay(confusion_matrix=cnf_matrix,
                       display_labels=['spam', 'ham']).plot()
plt.show()

[[371  41]
 [ 10 479]]


In [70]:
# Jointly tune the n-gram range and the Naive Bayes smoothing strength
# with 5-fold cross-validation on the training split. The vectorizer lives
# inside the pipeline, so it is refitted within every fold.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'clf__alpha': [0.1, 1, 10],
}

clf2 = GridSearchCV(text_clf, parameters, cv=5)
clf2.fit(training_data['email_body'], training_data['labels'])

print("Best parameters set found on development set:")
print(clf2.best_params_)

# Evaluate the selected configuration on the held-out emails. Classes are
# named explicitly because the loader encodes ham=1 and spam=0.
preds = clf2.predict(testing_data['email_body'])
print(classification_report(testing_data['labels'], preds,
                            labels=[0, 1],
                            target_names=['spam', 'ham']))

# Rows are true classes, columns predicted; order is spam (0), ham (1).
cnf2_matrix = confusion_matrix(testing_data['labels'], preds,
                               labels=[0, 1])
print(cnf2_matrix)

Best parameters set found on development set:
{'clf__alpha': 0.1, 'vect__ngram_range': (1, 2)}
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       412
           1       0.97      1.00      0.98       489

    accuracy                           0.98       901
   macro avg       0.98      0.98      0.98       901
weighted avg       0.98      0.98      0.98       901

[[396  16]
 [  1 488]]
