In [0]:
import csv
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import random as rnd
import string
import zipfile

from bs4 import BeautifulSoup
from google.colab import files
from io import StringIO
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, isdir, join

In [0]:
def get_file_paths(directory_path):
    """
    List the regular files located directly inside a directory.

    Sub-directories are skipped and the listing is not recursive. The result
    is sorted by file name so the order is the same on every run;
    ``os.listdir`` itself returns entries in arbitrary order.

    Args:
        directory_path: Path of the directory to scan.

    Returns:
        A sorted list with one path per file, each built by joining
        ``directory_path`` with the file name.

    Raises:
        NotADirectoryError: If ``directory_path`` is not an existing
            directory. This is a subclass of ``Exception``, so handlers
            written for the previous generic exception keep working.
    """

    if not isdir(directory_path):
        raise NotADirectoryError("Given path does not refer to a valid directory.")

    candidate_paths = (join(directory_path, name) for name in sorted(listdir(directory_path)))

    return [path for path in candidate_paths if isfile(path)]

In [0]:
def get_file_content(file_path):
    """
    Read a whole text file and return it as a single string.

    The file is opened in text mode and decoded as ISO-8859-1.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded content of the file.
    """

    with open(file_path, encoding="ISO-8859-1") as source:
        text = source.read()

    return text

In [0]:
FILE_ENCODING = "utf-8"

def set_file_content(file_path, content, append=False):
    """
    Write text to a file, encoded with FILE_ENCODING.

    Args:
        file_path: Path of the destination file.
        content: Text to write.
        append: When true, the text is added at the end of the file;
            otherwise the file is opened for writing and any previous
            content is replaced.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    with open(file_path, mode=mode, encoding=FILE_ENCODING) as target:
        target.write(content)

In [0]:
def export_to_csv(content):
    """
    Serialise rows to CSV text held in memory.

    Args:
        content: Iterable of rows, each row being an iterable of field values.

    Returns:
        A ``StringIO`` holding the CSV data, rewound to the start so it can
        be read straight away.
    """

    buffer = StringIO()
    csv.writer(buffer).writerows(content)
    buffer.seek(0)

    return buffer

In [0]:
def clean_text(content):
  """
  Turn the raw text of an e-mail into a list of word tokens.

  Steps:
    1. If the text contains an ``<html>`` tag (case-insensitive), keep only
       the text that BeautifulSoup extracts from the markup.
    2. Replace every character of ``string.punctuation`` with a space.
    3. Split on whitespace and drop English stop words. Words are compared
       in lower case; the returned tokens keep their original case.

  The NLTK "stopwords" corpus must be available when this is called.

  Args:
    content: Text of one e-mail.

  Returns:
    List of the remaining tokens, in their original order.
  """

  # Load the stop words once per call and keep them in a set. Asking
  # stopwords.words() inside the filter below rebuilt the word list, and
  # scanned it linearly, for every single token.
  english_stop_words = set(stopwords.words("english"))

  if "<html>" in content.lower():
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser is installed (and warns), so tokens could differ between machines.
    content = BeautifulSoup(content, "html.parser").text

  # Map each punctuation character to a space in a single pass.
  punctuation_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
  content = content.translate(punctuation_to_space)

  return [word for word in content.split() if word.lower() not in english_stop_words]

In [7]:
# Upload the training archive (Lot-01.zip) through the Colab file helper.
uploaded_files = files.upload()

Saving Lot-01.zip to Lot-01.zip


In [0]:
# Unpack the uploaded archive into the current working directory. The context
# manager closes the archive handle as soon as extraction is finished.
with zipfile.ZipFile("Lot-01.zip") as zf:
  zf.extractall()

In [0]:
clean_emails_file_paths = get_file_paths("Lot-01/Clean")
spam_emails_file_paths = get_file_paths("Lot-01/Spam")

# One [content, label] row per e-mail; label 0 = clean, 1 = spam.
emails = []

emails += [[get_file_content(file_path), 0] for file_path in clean_emails_file_paths]
emails += [[get_file_content(file_path), 1] for file_path in spam_emails_file_paths]

# Shuffle with a dedicated seeded generator: for the same input order the
# rows come out in the same order on every run, and the global random state
# is left untouched.
rnd.Random(0).shuffle(emails)

# Header row for the CSV that is parsed below.
emails.insert(0, ["content", "spam"])

emails_csv = export_to_csv(emails)

# keep_default_na=False: an e-mail whose text is empty or looks like "NA" or
# "null" must stay a string. With the default it becomes NaN, and clean_text
# then fails because a float has no .lower().
data_frame = pd.read_csv(emails_csv, keep_default_na=False)

In [10]:
# Preview the first five rows of the labelled e-mail frame.
data_frame.head(5)

Unnamed: 0,content,spam
0,Subject:=?GB2312?B?mEmE1bf+hNU=?= u\nPCFET0NUW...,1
1,"Subject:TRULY SCARY: Is FEMA Storing 500,000+ ...",1
2,Subject:Try America's #1 Fat Burning Energy Dr...,1
3,Subject:Re: [geeks] Good computer fiction book...,0
4,Subject:[CT Birds] Ellington rufous\nCgoxMC8yN...,0


In [11]:
# (rows, columns) of the frame before de-duplication.
data_frame.shape

(6231, 2)

In [12]:
# Column labels taken from the CSV header row.
data_frame.columns

Index(['content', 'spam'], dtype='object')

In [13]:
# Drop rows that are exact duplicates (same content and same label), then
# show the remaining (rows, columns).
data_frame = data_frame.drop_duplicates()
data_frame.shape

(6229, 2)

In [14]:
# Count missing values per column.
data_frame.isnull().sum()

content    0
spam       0
dtype: int64

In [15]:
# Local folder that receives the NLTK corpora used by clean_text.
RESOURCES_DIRECTORY_PATH = ".resources"

# exist_ok=True replaces the separate existence check, so rerunning the cell
# is harmless.
os.makedirs(RESOURCES_DIRECTORY_PATH, exist_ok=True)

nltk.download("stopwords", download_dir=RESOURCES_DIRECTORY_PATH)

# Register the folder on NLTK's search path only once; appending on every
# rerun would pile up duplicate entries.
if RESOURCES_DIRECTORY_PATH not in nltk.data.path:
  nltk.data.path.append(RESOURCES_DIRECTORY_PATH)

[nltk_data] Downloading package stopwords to .resources...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
# Sanity check: tokenise the first five e-mails with clean_text.
data_frame["content"].head().apply(clean_text)

0    [Subject, GB2312, B, mEmE1bf, hNU, u, PCFET0NU...
1    [Subject, TRULY, SCARY, FEMA, Storing, 500, 00...
2    [Subject, Try, America, 1, Fat, Burning, Energ...
3    [Subject, geeks, Good, computer, fiction, book...
4    [Subject, CT, Birds, Ellington, rufous, CgoxMC...
Name: content, dtype: object

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# clean_text is passed as the analyzer, so it is what turns each raw e-mail
# into the tokens that get counted.
# NOTE(review): the vocabulary is fitted on every row, before the train/test
# split performed in a later cell.
vectorizer = CountVectorizer(analyzer=clean_text)
bag_of_words = vectorizer.fit_transform(data_frame["content"])

In [0]:
from sklearn.model_selection import train_test_split

# NOTE(review): with 6229 rows, test_size=0.0001 holds out a single sample
# (the test report further down shows support 1), so the test-set metrics say
# nothing about generalisation. If the aim is to train on nearly everything,
# consider evaluating on a real hold-out first and refitting on the full data
# afterwards.
x_train, x_test, y_train, y_test = train_test_split(bag_of_words, data_frame["spam"], test_size=0.0001, random_state=0)

In [19]:
# (number of e-mails, vocabulary size) of the count matrix.
bag_of_words.shape

(6229, 210760)

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, fitted on the word counts of
# the training split.
classifier = MultinomialNB()

classifier.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
# Predicted labels next to the true labels for the training split.
print(classifier.predict(x_train))
print(y_train.values)

[1 1 1 ... 0 0 1]
[1 1 1 ... 0 0 1]


In [22]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the classifier on the data it was trained on.
prediction = classifier.predict(x_train)

# Each metric is printed as: title, blank line, value, blank line.
for title, metric in (
  ("Classification Report\n", classification_report),
  ("Confusion Matrix\n", confusion_matrix),
  ("Accuracy Score\n", accuracy_score),
):
  print(title)
  print(metric(y_train, prediction))
  print()

Classification Report

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2879
           1       0.99      0.99      0.99      3349

    accuracy                           0.99      6228
   macro avg       0.99      0.99      0.99      6228
weighted avg       0.99      0.99      0.99      6228


Confusion Matrix

[[2851   28]
 [  37 3312]]

Accuracy Score

0.98956326268465



In [23]:
# Predicted labels next to the true labels for the held-out split.
print(classifier.predict(x_test))
print(y_test.values)

[0]
[0]


In [24]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the classifier on the held-out split.
prediction = classifier.predict(x_test)

# Each metric is printed as: title, blank line, value, blank line.
for title, metric in (
  ("Classification Report\n", classification_report),
  ("Confusion Matrix\n", confusion_matrix),
  ("Accuracy Score\n", accuracy_score),
):
  print(title)
  print(metric(y_test, prediction))
  print()

Classification Report

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1


Confusion Matrix

[[1]]

Accuracy Score

1.0



In [0]:
# Folder that stores the trained artefacts.
KNOWLEDGE_DIRECTORY_PATH = ".knowledge"

# exist_ok=True replaces the separate existence check, so rerunning the cell
# is harmless.
os.makedirs(KNOWLEDGE_DIRECTORY_PATH, exist_ok=True)

# Persist the classifier and the vectorizer it was trained with; both are
# needed together to classify new e-mails.
for artifact_name, artifact in (
  ("classifier_naive_bayes_v01", classifier),
  ("count_vectorizer_v01", vectorizer),
):
  with open(os.path.join(KNOWLEDGE_DIRECTORY_PATH, artifact_name), "wb") as f:
    pickle.dump(artifact, f)

In [26]:
# Visual separator: the cells above train and save the model, the cells below
# load it and classify a new lot.
print("===")

===


In [30]:
# Upload the archive of e-mails to classify (Lot-02.zip) through the Colab
# file helper.
uploaded_files = files.upload()

Saving Lot-02.zip to Lot-02.zip


In [0]:
# Unpack the uploaded archive into the current working directory. The context
# manager closes the archive handle as soon as extraction is finished.
with zipfile.ZipFile("Lot-02.zip") as zf:
  zf.extractall()

In [0]:
test_emails_file_paths = get_file_paths("Lot-02/Test")

# One single-column row per e-mail, in the same order as
# test_emails_file_paths.
emails = []

emails += [[get_file_content(file_path)] for file_path in test_emails_file_paths]

# Header row for the CSV that is parsed below.
emails.insert(0, ["content"])

emails_csv = export_to_csv(emails)

# keep_default_na=False: an e-mail whose text is empty or looks like "NA" or
# "null" must stay a string. With the default it becomes NaN, and clean_text
# then fails because a float has no .lower().
data_frame = pd.read_csv(emails_csv, keep_default_na=False)

In [33]:
# Local folder that receives the NLTK corpora used by clean_text.
RESOURCES_DIRECTORY_PATH = ".resources"

# exist_ok=True replaces the separate existence check, so rerunning the cell
# is harmless.
os.makedirs(RESOURCES_DIRECTORY_PATH, exist_ok=True)

nltk.download("stopwords", download_dir=RESOURCES_DIRECTORY_PATH)

# Register the folder on NLTK's search path only once; appending on every
# rerun would pile up duplicate entries.
if RESOURCES_DIRECTORY_PATH not in nltk.data.path:
  nltk.data.path.append(RESOURCES_DIRECTORY_PATH)

[nltk_data] Downloading package stopwords to .resources...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
from sklearn.naive_bayes import MultinomialNB

# Reload the classifier and the vectorizer saved under KNOWLEDGE_DIRECTORY_PATH.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that this notebook (or another trusted source) produced.
# NOTE(review): the vectorizer was built with analyzer=clean_text; presumably
# clean_text must already be defined in the session for the load to succeed -
# confirm when running this part on a fresh kernel.
with open(os.path.join(KNOWLEDGE_DIRECTORY_PATH, "classifier_naive_bayes_v01"), "rb") as f:
  classifier = pickle.load(f)

with open(os.path.join(KNOWLEDGE_DIRECTORY_PATH, "count_vectorizer_v01"), "rb") as f:
  vectorizer = pickle.load(f)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# transform (not fit_transform): the loaded vectorizer is applied as-is, so
# the new e-mails are counted against the vocabulary it already holds.
bag_of_words = vectorizer.transform(data_frame["content"])

In [0]:
# One predicted label per row of bag_of_words.
prediction = classifier.predict(bag_of_words)

In [0]:
CLEAN_KEYWORD = "cln"
INFECTED_KEYWORD = "inf"

# Maps the classifier's numeric label to the keyword written to the report.
LABEL_TRANSLATOR = {
    0: CLEAN_KEYWORD,
    1: INFECTED_KEYWORD
}

# The report pairs files and predictions by position, so the two sequences
# must have the same length.
assert len(prediction) == len(test_emails_file_paths), "prediction count does not match file count"

# One "<file name>|<keyword>" line per classified e-mail.
status_lines = [
  "%s|%s\n" % (os.path.basename(file_path), LABEL_TRANSLATOR[label])
  for file_path, label in zip(test_emails_file_paths, prediction)
]

# Write the whole report in one go, replacing any previous file. Appending
# line by line reopened the file for every e-mail and, because the file was
# never truncated, duplicated every line each time the cell was rerun.
set_file_content("output_file", "".join(status_lines))