# DS7333 Case Study 
##  Naive Bayes and Clustering

#### John Girard, Shijo Joseph, Douglas Yip

In [100]:
%load_ext pycodestyle_magic
%pycodestyle_on

The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import email
from bs4 import BeautifulSoup as BS4

In [102]:
os.listdir(".\SpamAssassinMessages")

['easy_ham', 'easy_ham_2', 'hard_ham', 'spam', 'spam_2']

1:14: W605 invalid escape sequence '\S'


In [103]:
file_name = []
contents = []
types = []
labels = []
labelnames = []
message = ''

for root, dirs, files in os.walk(".\SpamAssassinMessages"):
    for name in files:
        with open(os.path.join(root, name),
                  'r', encoding='latin1') as f:
            message = ''
            try:
                x = email.message_from_file(f)
            except UnicodeDecodeError:
                print("Error in file: Unknown Error")
            if "multipart" in x.get_content_type():
                if x.is_multipart():
                    for part in x.get_payload():
                        message = message + part.as_string()
                contents.append(message)
            elif "text/plain" in x.get_content_type():
                contents.append(x.get_payload()
                                .replace("\n", " "))
            elif "text/html" in x.get_content_type():
                contents.append(BS4(x.get_payload()).get_text())
            types.append(x.get_content_type())
            if "ham" in root:
                labelnames.append('ham')
                labels.append(1)
            elif "spam" in root:
                labelnames.append('spam')
                labels.append(0)
            else:
                print("YOU HAVE A BIG PROBLEM--LABEL NOT FOUND")
            file_name.append(os.path.join(root, name))

8:36: W605 invalid escape sequence '\S'


In [104]:
types = pd.DataFrame(types)
types.shape
types.value_counts()

text/plain                     7413
text/html                      1193
multipart/alternative           326
multipart/signed                180
multipart/mixed                 179
multipart/related                56
multipart/report                  5
text/plain charset=us-ascii       1
dtype: int64

In [105]:
df_NB = pd.DataFrame()
df_NB['Filename'] = file_name
df_NB['types'] = types
df_NB['email_body'] = contents
df_NB['labelnames'] = labelnames
df_NB['labels'] = labels

In [106]:
df_NB

Unnamed: 0,Filename,types,email_body,labelnames,labels
0,.\SpamAssassinMessages\easy_ham\00001.7c53336b...,text/plain,"Date: Wed, 21 Aug 2002 10:54:46 -05...",ham,1
1,.\SpamAssassinMessages\easy_ham\00002.9c4069e2...,text/plain,"Martin A posted: Tassos Papadopoulos, the Gree...",ham,1
2,.\SpamAssassinMessages\easy_ham\00003.860e3c3c...,text/plain,Man Threatens Explosion In Moscow Thursday A...,ham,1
3,.\SpamAssassinMessages\easy_ham\00004.864220c5...,text/plain,Klez: The Virus That Won't Die Already the m...,ham,1
4,.\SpamAssassinMessages\easy_ham\00005.bf27cdea...,text/plain,"> in adding cream to spaghetti carbonara, whi...",ham,1
...,...,...,...,...,...
9348,.\SpamAssassinMessages\spam_2\01397.f75f0dd0dd...,multipart/alternative,"Content-Type: text/plain;\n\tcharset=""Windows-...",spam,0
9349,.\SpamAssassinMessages\spam_2\01398.8ca7045aae...,text/plain,"Dear Subscriber, If I could show you a way to...",spam,0
9350,.\SpamAssassinMessages\spam_2\01399.2319643317...,text/plain,****Mid-Summer Customer Appreciation SALE!****...,spam,0
9351,.\SpamAssassinMessages\spam_2\01400.b444b69845...,text/plain,ATTN:SIR/MADAN ST...,spam,0


### Naive Bayes Portion

##### Using Count Vectorizer

In [107]:
# Import Naive Bayes

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [108]:
training_data, testing_data = \
    train_test_split(df_NB, test_size=0.2, random_state=25)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

In [109]:
the_count = CountVectorizer()

Xtrain = the_count.fit_transform(training_data['email_body'])

In [110]:
Xtrain = Xtrain.toarray()

In [111]:
# Fit Naive Bayes model
nb = MultinomialNB()
nb.fit(Xtrain, training_data['labels'])

In [112]:
Xtest = the_count.transform(testing_data['email_body'])
preds = nb.predict(Xtest.toarray())

In [113]:
preds.shape

(1871,)

In [114]:
print(classification_report(testing_data['labels'], preds))

In [115]:
# plot the confusion matrix

cnf_matrix = confusion_matrix(testing_data['labels'], preds)

print(cnf_matrix)

##### Using TF-IDF Vectorizor

In [116]:
tfidf = TfidfVectorizer()

Xtrain = tfidf.fit_transform(training_data['email_body'])

In [117]:
Xtrain = Xtrain.toarray()

In [118]:
# Fit Naive Bayes model
nb = MultinomialNB()
nb.fit(Xtrain, training_data['labels'])

In [119]:
Xtest = tfidf.transform(testing_data['email_body'])
preds = nb.predict(Xtest.toarray())

In [120]:
preds.shape

(1871,)

In [121]:
print(classification_report(testing_data['labels'], preds))

In [122]:
# plot the confusion matrix

cnf_matrix = confusion_matrix(testing_data['labels'], preds)

print(cnf_matrix)