# DS7333 Case Study 
##  Naive Bayes and Clustering

#### John Girard, Shijo Joseph, Douglas Yip

In [None]:
# Load the pycodestyle_magic IPython extension and switch its style
# checking on (presumably for every cell run afterwards; confirm against
# the extension's documentation).
%load_ext pycodestyle_magic
%pycodestyle_on

In [None]:
import email
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as BS4
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Count the files in the corpus: print the number of entries found in each
# sub-directory, then the overall number of files.
# os.path.join keeps the path portable; a hard-coded ".\\" prefix only
# resolves on Windows.
corpus_dir = os.path.join(".", "SpamAssassinMessages")
count = 0
for root_dir, cur_dir, files in os.walk(corpus_dir):
    count += len(files)
    for names in cur_dir:
        print(names, len(os.listdir(os.path.join(root_dir, names))))
print('Total Files:', count)

In [None]:
# List the top-level entries of the corpus directory.  The path is built
# with os.path.join so the cell also works outside Windows.
os.listdir(os.path.join(".", "SpamAssassinMessages"))

In [None]:
# Parse every labelled message under the corpus directory into parallel
# lists: file path, MIME content type, cleaned body text and ham/spam label
# (ham = 1, spam = 0).  Every recorded message adds exactly one entry to
# each list, so the lists always have equal length.
file_name = []
contents = []
types = []
labels = []
labelnames = []

# http:// or https:// followed by everything up to the next whitespace.
URL_PATTERN = re.compile(r'https?://\S+')
# One-pass table: drop tabs, turn newlines and hyphens into spaces.
CHAR_TABLE = str.maketrans({"\t": None, "\n": " ", "-": " "})


def clean_text(text):
    """Blank out URLs, then normalize tabs, newlines and hyphens.

    URLs are removed with a real regular expression; passing a pattern
    such as "^https?://" to ``str.replace`` only matches that literal
    text and therefore removes nothing.  URLs are stripped first because
    they may contain hyphens, which are rewritten to spaces afterwards.
    """
    return URL_PATTERN.sub(' ', text).translate(CHAR_TABLE)


def extract_body(msg):
    """Return the cleaned text of a parsed e-mail message.

    Only ``text/plain`` and ``text/html`` content is used.  For a
    multipart message the matching top-level parts are concatenated in
    order; nested multipart containers are not descended into.  Any other
    content type yields ``''`` so each message still produces one body.
    """
    if "multipart" not in msg.get_content_type():
        parts = [msg]
    elif msg.is_multipart():
        parts = msg.get_payload()
    else:
        parts = []
    pieces = []
    for part in parts:
        part_type = part.get_content_type()
        if "text/plain" in part_type:
            pieces.append(clean_text(part.get_payload()))
        elif "text/html" in part_type:
            # Name the parser explicitly so the extracted text does not
            # depend on which optional parsers happen to be installed.
            soup = BS4(part.get_payload(), "html.parser")
            pieces.append(clean_text(soup.get_text()))
    return ''.join(pieces)


corpus_dir = os.path.join(".", "SpamAssassinMessages")
for root, dirs, files in os.walk(corpus_dir):
    # The label is derived from the directory path.
    if "ham" in root:
        labelname, label = 'ham', 1
    elif "spam" in root:
        labelname, label = 'spam', 0
    else:
        # Files outside a ham/spam directory carry no label; recording
        # them would leave the label lists shorter than the others.
        continue
    for name in files:
        path = os.path.join(root, name)
        try:
            with open(path, 'r', encoding='latin1') as f:
                x = email.message_from_file(f)
        except UnicodeDecodeError:
            # Skip the file rather than continue with a stale or
            # undefined message object.
            print("Error in file:", path)
            continue
        file_name.append(path)
        types.append(x.get_content_type())
        contents.append(extract_body(x))
        labelnames.append(labelname)
        labels.append(label)

In [None]:
# Wrap the collected MIME content types in a one-column DataFrame and
# tally how often each type occurs.
# NOTE: this rebinds `types` from a list to a DataFrame.
types = pd.DataFrame(types)
# Not displayed: a notebook cell only echoes its last expression.
types.shape
types.value_counts()

In [None]:
# Assemble the working DataFrame one column at a time: file path, MIME
# type, cleaned body, label name and numeric label.
df_NB = pd.DataFrame()
df_NB['Filename'] = file_name
# `types` is a one-column DataFrame at this point, not a plain list.
df_NB['types'] = types
df_NB['email_body'] = contents
df_NB['labelnames'] = labelnames
df_NB['labels'] = labels

In [None]:
# Display the assembled DataFrame.
df_NB

In [None]:
# Bag-of-words token counts for every message body, converted from the
# sparse result to a dense NumPy array.
the_count = CountVectorizer()
Xtrain = the_count.fit_transform(df_NB['email_body']).toarray()

In [None]:
# Calculate the distortion (K-means inertia) for 1 to 29 clusters and
# remember the cluster count that gives the lowest value.
distortions = []
# Start from infinity so the first fit always becomes the running best;
# a finite sentinel such as 1e12 would be wrong for larger inertias.
best_distortion = float("inf")
best_i = None
for i in range(1, 30):
    km = KMeans(n_clusters=i,
                n_init='auto',
                random_state=0)
    km.fit(Xtrain)
    distortions.append(km.inertia_)
    if km.inertia_ < best_distortion:
        best_distortion = km.inertia_
        best_i = i
    # Progress indicator: each fit can take a while.
    print(i)

In [None]:
# Report the cluster count with the lowest distortion, then the distortion
# itself; each label and its value go on separate lines.
print("Best i  found:", best_i, sep="\n")
print("Best distortion  found:", best_distortion, sep="\n")

In [None]:
# Elbow plot: distortion against the number of clusters.  The x range must
# match the range used when `distortions` was filled (1 to 29).
plt.plot(range(1, 30), distortions, marker='o')
plt.title("Elbow Graph")
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

In [None]:
# Final clustering with 9 clusters (fixed seed); `clusters` receives one
# cluster label per row of Xtrain.
km = KMeans(n_clusters=9,
            n_init='auto',
            random_state=0)
clusters = km.fit_predict(Xtrain)

In [None]:
# Store each message's cluster label alongside its other columns.
df_NB['clusters'] = clusters



In [None]:
# Display the DataFrame again, now including the `clusters` column.
df_NB

In [None]:
# Number of messages per label name.
df_NB['labelnames'].value_counts()

In [None]:
# Show the bodies of the messages whose MIME type is multipart/alternative.
is_alternative = df_NB['types'] == 'multipart/alternative'
df_NB.loc[is_alternative, 'email_body']

In [None]:
# Check the container type of the feature matrix.
type(Xtrain)

In [None]:
# Dimensions of the feature matrix.
Xtrain.shape

In [None]:
# Append the cluster labels to the feature matrix as one extra column;
# column_stack turns the 1-D label vector into that column.
New_Xtrain = np.column_stack((Xtrain, clusters))
New_Xtrain.shape

In [None]:
# Copy the cluster labels out of the DataFrame into a NumPy array and
# show its shape.
cluster_array = np.array(df_NB['clusters'])
cluster_array.shape

### Naive Bayes Portion

##### Using Count Vectorizer

In [None]:
# Import Naive Bayes

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is
# repeatable.
training_data, testing_data = train_test_split(
    df_NB, test_size=0.2, random_state=25)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

In [None]:
# Fit a fresh count vectorizer on the training bodies only, so the
# vocabulary is learned from the training split.
the_count = CountVectorizer()

Xtrain = the_count.fit_transform(training_data['email_body'])

In [None]:
# Densify the count matrix and append the cluster label as an extra
# feature column.  toarray() has no n-gram option (n-grams are configured
# on the vectorizer), so the stray ngram_range keyword, which raised
# TypeError, is dropped.
Xtrain = Xtrain.toarray()
cluster_array = np.array(training_data['clusters'])
Xtrain2 = np.hstack((Xtrain, cluster_array.reshape(-1, 1)))

In [None]:
# Fit a multinomial Naive Bayes model on the count features plus cluster
# column; alpha is the additive smoothing parameter.
nb = MultinomialNB(alpha=0.1)
nb.fit(Xtrain2, training_data['labels'])

In [None]:
# Build the test features with the vectorizer fitted on the training data,
# append the cluster column, and predict the labels.
Xtest = the_count.transform(testing_data['email_body']).toarray()
cluster_array = np.array(testing_data['clusters'])
Xtest2 = np.column_stack((Xtest, cluster_array))
preds = nb.predict(Xtest2)

In [None]:
# Number of predictions produced (one per test row).
preds.shape

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(testing_data['labels'], preds))

In [None]:
# Compute and print the confusion matrix (true labels vs. predictions).

cnf_matrix = confusion_matrix(testing_data['labels'], preds)

print(cnf_matrix)

In [None]:
# Draw the confusion matrix as a heatmap, annotating each cell with its
# integer count.
from matplotlib import pyplot as plt
import seaborn as sn
sn.heatmap(cnf_matrix, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')

##### Using TF-IDF Vectorizer

In [None]:
# Fit a TF-IDF vectorizer over unigrams and bigrams on the training bodies.
tfidf = TfidfVectorizer(ngram_range=(1, 2))

Xtrain = tfidf.fit_transform(training_data['email_body'])

In [None]:
# Densify the TF-IDF matrix and append the cluster label as one extra
# feature column.
Xtrain = Xtrain.toarray()
cluster_array = np.array(training_data['clusters'])
Xtrain2 = np.column_stack((Xtrain, cluster_array))

In [None]:
# Fit a multinomial Naive Bayes model on the TF-IDF features plus cluster
# column; alpha is the additive smoothing parameter.
nb = MultinomialNB(alpha=0.1)
nb.fit(Xtrain2, training_data['labels'])

In [None]:
# Build the test features with the same TF-IDF vectorizer the model was
# trained on.  Transforming with the count vectorizer instead produces a
# different vocabulary and weighting, so the columns would not match the
# fitted model.
Xtest = tfidf.transform(testing_data['email_body'])
Xtest = Xtest.toarray()
cluster_array = np.array(testing_data['clusters'])
Xtest2 = np.hstack((Xtest, cluster_array.reshape(-1, 1)))
preds = nb.predict(Xtest2)

In [None]:
# Number of predictions produced (one per test row).
preds.shape

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(testing_data['labels'], preds))

In [None]:
# Compute and print the confusion matrix (true labels vs. predictions).

cnf_matrix = confusion_matrix(testing_data['labels'], preds)

print(cnf_matrix)

In [None]:
# Grid-search a CountVectorizer + MultinomialNB pipeline over the n-gram
# range and the smoothing strength with 5-fold cross-validation, then
# score the refitted best model on the held-out test split.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'clf__alpha': [0.1, 1, 10],
}
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

clf2 = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=5)
clf2.fit(training_data['email_body'], training_data['labels'])

print("Best parameters set found on development set:")
print(clf2.best_params_)

y_true = testing_data['labels']
preds = clf2.predict(testing_data['email_body'])
print(classification_report(y_true, preds))

cnf2_matrix = confusion_matrix(y_true, preds)
print(cnf2_matrix)

In [None]:
# Same grid search as for the count features, but with a TF-IDF
# vectorizer as the first pipeline step.
pipeline_steps = [
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

parameters = dict([
    ('vect__ngram_range', [(1, 1), (1, 2)]),
    ('clf__alpha', [0.1, 1, 10]),
])

clf2 = GridSearchCV(text_clf, parameters, cv=5)
clf2.fit(training_data['email_body'], training_data['labels'])

print("Best parameters set found on development set:")
print(clf2.best_params_)

# Evaluate the best estimator on the held-out messages.
test_bodies = testing_data['email_body']
test_labels = testing_data['labels']
preds = clf2.predict(test_bodies)
print(classification_report(test_labels, preds))

cnf2_matrix = confusion_matrix(test_labels, preds)
print(cnf2_matrix)

In [None]:
def prediction(df, test_size=0.2, random_state=25, alpha=0.1):
    """Train and evaluate a Naive Bayes classifier on ``df``.

    The frame is split into training and test rows, message bodies are
    turned into token counts, the cluster label is appended as one extra
    feature column and a ``MultinomialNB`` model is fitted.  The
    classification report and the confusion matrix for the test rows are
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``email_body``, ``clusters`` and
        ``labels``.
    test_size : float, default 0.2
        Fraction of rows held out for testing.
    random_state : int, default 25
        Seed for the train/test split.
    alpha : float, default 0.1
        Additive smoothing parameter of the Naive Bayes model.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    vectorizer = CountVectorizer()
    clf = MultinomialNB(alpha=alpha)
    training_data, testing_data = train_test_split(
        df, test_size=test_size, random_state=random_state)

    # The vocabulary is learned from the training rows only.
    Xtrain = vectorizer.fit_transform(training_data['email_body']).toarray()
    cluster_array = np.array(training_data['clusters'])
    Xtrain2 = np.hstack((Xtrain, cluster_array.reshape(-1, 1)))
    clf.fit(Xtrain2, training_data['labels'])

    # Test rows reuse the fitted vocabulary so the columns line up.
    Xtest = vectorizer.transform(testing_data['email_body']).toarray()
    cluster_array = np.array(testing_data['clusters'])
    Xtest2 = np.hstack((Xtest, cluster_array.reshape(-1, 1)))
    preds = clf.predict(Xtest2)

    cnf2_matrix = confusion_matrix(testing_data['labels'], preds)
    # Print explicitly rather than `return print(...)`, which only ever
    # returned None.
    print(classification_report(testing_data['labels'], preds),
          cnf2_matrix)

In [None]:
# http:// or https:// followed by everything up to the next whitespace.
_URL_RE = re.compile(r'https?://\S+')
# One-pass table: drop tabs, turn newlines and hyphens into spaces.
_CHAR_MAP = str.maketrans({"\t": None, "\n": " ", "-": " "})


def _clean_body(text):
    """Blank out URLs, then normalize tabs, newlines and hyphens."""
    # URLs go first: they may contain hyphens, which are rewritten below.
    return _URL_RE.sub(' ', text).translate(_CHAR_MAP)


def _message_text(msg):
    """Return the cleaned text/plain and text/html content of ``msg``."""
    if "multipart" not in msg.get_content_type():
        parts = [msg]
    elif msg.is_multipart():
        # Only top-level parts are read; nested containers are skipped.
        parts = msg.get_payload()
    else:
        parts = []
    chunks = []
    for part in parts:
        part_type = part.get_content_type()
        if "text/plain" in part_type:
            chunks.append(_clean_body(part.get_payload()))
        elif "text/html" in part_type:
            # Name the parser so results do not depend on which optional
            # parsers are installed.
            soup = BS4(part.get_payload(), "html.parser")
            chunks.append(_clean_body(soup.get_text()))
    return ''.join(chunks)


def _label_for(directory):
    """Return ('ham', 1) or ('spam', 0) for a path, or None if neither."""
    if "ham" in directory:
        return 'ham', 1
    if "spam" in directory:
        return 'spam', 0
    return None


def create_df_from_file(path, n_clusters=9, random_state=0):
    """Load a ham/spam corpus into a DataFrame with K-means cluster labels.

    Every file below ``path`` whose directory path contains "ham" or
    "spam" is parsed as an e-mail.  Its text is cleaned (URLs blanked,
    tabs removed, newlines and hyphens turned into spaces) in the same
    way for every content type.  Messages with an unsupported content
    type get an empty body, and files in unlabelled directories are
    skipped, so all columns always have the same length.

    Parameters
    ----------
    path : str
        Root directory of the corpus.
    n_clusters : int, default 9
        Number of K-means clusters.
    random_state : int, default 0
        Seed for K-means.

    Returns
    -------
    pandas.DataFrame
        Columns ``Filename``, ``types``, ``email_body``, ``labelnames``,
        ``labels`` (ham = 1, spam = 0) and ``clusters``.

    Raises
    ------
    ValueError
        If no labelled message is found under ``path``.
    """
    records = []
    for root, dirs, files in os.walk(path):
        label_info = _label_for(root)
        if label_info is None:
            continue
        labelname, label = label_info
        for name in files:
            full_path = os.path.join(root, name)
            try:
                with open(full_path, 'r', encoding='latin1') as f:
                    msg = email.message_from_file(f)
            except UnicodeDecodeError:
                # Skip the file rather than reuse a stale message object.
                print("Error in file:", full_path)
                continue
            records.append((full_path, msg.get_content_type(),
                            _message_text(msg), labelname, label))
    if not records:
        raise ValueError(f"no labelled messages found under {path!r}")

    df_NB = pd.DataFrame(records, columns=['Filename', 'types',
                                           'email_body', 'labelnames',
                                           'labels'])
    # Cluster the messages on their bag-of-words counts.
    the_count = CountVectorizer()
    Xtrain = the_count.fit_transform(df_NB['email_body']).toarray()
    km = KMeans(n_clusters=n_clusters,
                n_init='auto',
                random_state=random_state)
    df_NB['clusters'] = km.fit_predict(Xtrain)
    return df_NB

In [None]:
# Rebuild the labelled, clustered DataFrame through the helper function.
# The path is built with os.path.join so it also resolves outside Windows.
New_df = create_df_from_file(os.path.join(".", "SpamAssassinMessages"))
New_df

In [None]:
# Train and evaluate the classifier on the rebuilt DataFrame; the report
# and confusion matrix are printed by the function.
prediction(New_df)