In [0]:
import csv
import nltk
import numpy as np
import pandas as pd
import random as rnd
import string
import zipfile

from bs4 import BeautifulSoup
from google.colab import files
from io import StringIO
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, isdir, join

In [0]:
def get_file_paths(directory_path):
    """
    Return the paths of the files located directly inside a directory.

    Entries for which ``isfile`` is false (for example sub-directories) are
    skipped, and sub-directories are not descended into. The result is sorted
    so that it does not depend on the order in which the operating system
    happens to list the directory entries.

    Args:
        directory_path: Path of the directory to scan.

    Returns:
        A sorted list of ``join(directory_path, name)`` strings, one per file.

    Raises:
        NotADirectoryError: If ``directory_path`` is not an existing directory.
    """

    if not isdir(directory_path):
        # NotADirectoryError is still an Exception, so existing
        # `except Exception` handlers keep working.
        raise NotADirectoryError("Given path does not refer to a valid directory.")

    candidate_paths = (
        join(directory_path, entry) for entry in sorted(listdir(directory_path))
    )

    return [path for path in candidate_paths if isfile(path)]

In [0]:
def get_file_content(file_path, encoding="ISO-8859-1"):
    """
    Read a text file and return its whole content as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ISO-8859-1, which assigns a character to every possible byte
            value, so decoding with the default cannot fail on any input.

    Returns:
        The decoded content of the file.
    """

    with open(file_path, "r", encoding=encoding) as f:
        return f.read()

In [0]:
def export_to_csv(content):
    """
    Serialise rows to CSV inside an in-memory text buffer.

    Args:
        content: Iterable of rows, each row being an iterable of field
            values, as accepted by ``csv.writer(...).writerows``.

    Returns:
        A ``StringIO`` holding the CSV text, rewound to position 0 so that it
        can be read from the beginning.
    """

    buffer = StringIO()
    csv.writer(buffer).writerows(content)

    # Rewind: after writing, the stream position sits at the end of the text.
    buffer.seek(0)
    return buffer

In [5]:
# Upload a file through google.colab's `files` helper. A later cell opens
# "Lot-01.zip" from the working directory, so that is the archive to upload.
uploaded_files = files.upload()

Saving Lot-01.zip to Lot-01.zip


In [0]:
# Extract the uploaded archive into the current working directory. The
# context manager closes the archive's file handle once extraction finishes.
with zipfile.ZipFile("Lot-01.zip") as zf:
    zf.extractall()

In [0]:
# Seed for the row shuffle below, so the row order is the same on every run.
SHUFFLE_SEED = 0

# Sort the paths so the pre-shuffle order does not depend on the order in
# which the operating system lists the directories.
clean_emails_file_paths = sorted(get_file_paths("Lot-01/Clean"))
spam_emails_file_paths = sorted(get_file_paths("Lot-01/Spam"))

# Each row is [email text, label]; label 0 marks a clean email, 1 marks spam.
emails = []

emails += [[get_file_content(file_path), 0] for file_path in clean_emails_file_paths]
emails += [[get_file_content(file_path), 1] for file_path in spam_emails_file_paths]

# A dedicated seeded generator keeps the shuffle reproducible without
# touching the global state of the `random` module.
rnd.Random(SHUFFLE_SEED).shuffle(emails)

# Header row: read_csv turns it into the column names.
emails.insert(0, ["content", "spam"])

emails_csv = export_to_csv(emails)

data_frame = pd.read_csv(emails_csv)

In [8]:
# Preview the first five rows of the assembled frame.
data_frame.head(5)

Unnamed: 0,content,spam
0,Subject:\t=?ISO-8859-1?Q?Re=3A_C=F3mo_responde...,0
1,Subject:krimoo Blast 300 Million Free Ads Ever...,1
2,Subject:Re: Socket unix\nOu plut=C3=B4t de Con...,0
3,"Subject:Re: [Flexradio] RFI on 10 meter\nUlf,\...",0
4,Subject:=?ISO-2022-JP?B?GyRCIVozWkU3JWolNSE8JU...,1


In [9]:
# (rows, columns) of the frame as loaded, before any de-duplication.
data_frame.shape

(6231, 2)

In [10]:
# Column labels taken from the header row of the CSV buffer.
data_frame.columns

Index(['content', 'spam'], dtype='object')

In [11]:
# Discard rows that exactly repeat an earlier row, then show the new shape.
data_frame = data_frame.drop_duplicates()
data_frame.shape

(6229, 2)

In [12]:
# Count the missing values in each column.
data_frame.isnull().sum()

content    0
spam       0
dtype: int64

In [13]:
# Fetch the NLTK stopword corpus that clean_text reads through
# stopwords.words("english").
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# Lazily filled cache of NLTK's English stopwords (see _english_stopwords).
_ENGLISH_STOPWORDS = None

# Translation table that maps every ASCII punctuation character to a space.
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use only."""
    global _ENGLISH_STOPWORDS

    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

    return _ENGLISH_STOPWORDS


def clean_text(content):
    """
    Turn the raw text of an email into a list of word tokens.

    Steps, in order:
      1. If the text contains a literal ``<html>`` tag (case-insensitive),
         keep only the text that BeautifulSoup extracts from the markup.
      2. Replace every character of ``string.punctuation`` with a space.
      3. Split on whitespace and drop English stopwords (compared in lower
         case). The surviving words keep their original casing.

    Args:
        content: Raw email text.

    Returns:
        A list of word strings.
    """

    # NOTE(review): only an attribute-less "<html>" tag triggers the markup
    # stripping; "<html lang=...>" does not match. Confirm whether intended.
    if "<html>" in content.lower():
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        content = BeautifulSoup(content, "html.parser").text

    content = content.translate(_PUNCTUATION_TO_SPACE)

    # Fetch the stopword set once per call instead of once per word.
    stop_words = _english_stopwords()

    return [word for word in content.split() if word.lower() not in stop_words]

In [15]:
# Try clean_text on the first few emails and inspect the resulting token lists.
data_frame["content"].head().apply(clean_text)

0    [Subject, ISO, 8859, 1, Q, 3A, C, F3mo, respon...
1    [Subject, krimoo, Blast, 300, Million, Free, A...
2    [Subject, Socket, unix, Ou, plut, C3, B4t, de,...
3    [Subject, Flexradio, RFI, 10, meter, Ulf, curr...
4    [Subject, ISO, 2022, JP, B, GyRCIVozWkU3JWolNS...
Name: content, dtype: object

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep the fitted CountVectorizer so that unseen emails can later be mapped
# onto the same vocabulary with count_vectorizer.transform(...).
count_vectorizer = CountVectorizer(analyzer=clean_text)

# Despite its name, `vectorizer` holds the document-term matrix returned by
# fit_transform, not the vectorizer object. The name is kept because later
# cells refer to it.
# NOTE(review): the vocabulary is learned from every row, including the rows
# that are later held out for testing.
vectorizer = count_vectorizer.fit_transform(data_frame["content"])

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(
    vectorizer,
    data_frame["spam"],
    test_size=0.2,
    random_state=0,
)

In [18]:
# `vectorizer` is the matrix produced by fit_transform, so this is its
# (number of emails, number of distinct tokens) shape.
vectorizer.shape

(6229, 210760)

In [19]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with its default settings, trained on the training split.
classifier = MultinomialNB()

classifier.fit(X=x_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# Eyeball check on the training split: predicted labels, then the true labels.
print(classifier.predict(x_train))
print(y_train.values)

[1 1 1 ... 0 1 0]
[1 1 1 ... 0 1 0]


In [21]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


def print_evaluation(y_true, y_pred):
    """
    Print the classification report, confusion matrix and accuracy score.

    Each section is printed as its title, a blank line, the value, and a
    trailing blank line.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    sections = (
        ("Classification Report", classification_report(y_true, y_pred)),
        ("Confusion Matrix", confusion_matrix(y_true, y_pred)),
        ("Accuracy Score", accuracy_score(y_true, y_pred)),
    )

    for title, value in sections:
        print(title + "\n")
        print(value)
        print()


# Score the classifier on the rows it was trained on.
prediction = classifier.predict(x_train)

print_evaluation(y_train, prediction)

Classification Report

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2333
           1       0.99      0.99      0.99      2650

    accuracy                           0.99      4983
   macro avg       0.99      0.99      0.99      4983
weighted avg       0.99      0.99      0.99      4983


Confusion Matrix

[[2309   24]
 [  35 2615]]

Accuracy Score

0.9881597431266306



In [22]:
# Eyeball check on the held-out test split: predicted labels, then the true labels.
print(classifier.predict(x_test))
print(y_test.values)

[0 1 0 ... 0 0 0]
[0 1 0 ... 0 0 0]


In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the classifier on the held-out test split.
prediction = classifier.predict(x_test)

# Every section prints its heading, the metric value, then a blank line.
for heading, metric in (
    ("Classification Report\n", classification_report),
    ("Confusion Matrix\n", confusion_matrix),
    ("Accuracy Score\n", accuracy_score),
):
    print(heading)
    print(metric(y_test, prediction))
    print()

Classification Report

              precision    recall  f1-score   support

           0       0.97      0.91      0.94       547
           1       0.93      0.98      0.95       699

    accuracy                           0.95      1246
   macro avg       0.95      0.94      0.94      1246
weighted avg       0.95      0.95      0.95      1246


Confusion Matrix

[[496  51]
 [ 17 682]]

Accuracy Score

0.9454253611556982

