In [92]:
import os
import glob
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import feature_extraction, model_selection, naive_bayes, metrics
%matplotlib inline

In [8]:
# Root of the e-mail corpus; each class lives in its own sub-directory.
data_dir = os.path.join("datasets", "spam_emails")
spam_dir, hardham_dir, easyham_dir = (
    os.path.join(data_dir, subdir) for subdir in ("spam", "hard_ham", "easy_ham")
)

In [64]:
def _read_email_text(path):
    """Return the whitespace-normalised text of the file at ``path``.

    The raw bytes are decoded as strict ASCII first and, if that fails, as
    windows-1252. All runs of whitespace (including newlines) are collapsed
    to single spaces.

    Raises:
        UnicodeDecodeError: if neither codec can decode the file.
    """
    with open(path, "rb") as f_obj:
        raw = f_obj.read()
    try:
        # Strict decoding on purpose: with errors="surrogateescape" the ASCII
        # attempt can never fail, so the windows-1252 fallback would be
        # unreachable and non-ASCII bytes would end up as lone surrogates.
        txt = raw.decode("ascii")
    except UnicodeDecodeError:
        txt = raw.decode("windows-1252")
    return " ".join(txt.split())


# Per-class configuration and storage: the directories to read, the list the
# e-mail texts are collected into, and the numeric label of the class.
acc = {
    "spam": {
        "datadirs": [spam_dir],
        "text_accumulator": [],
        "response": 1
    },
    "ham": {
        "datadirs": [hardham_dir, easyham_dir],
        "text_accumulator": [],
        "response": 0
    }
}
# Paths of the files that could not be decoded, per class. Paths are stored
# rather than per-directory indices because "ham" spans several directories,
# which makes a bare index ambiguous.
undecoded = {"spam": [], "ham": []}

for datatype, container in acc.items():
    for ddir in container["datadirs"]:
        # glob returns matches in arbitrary order; sorting keeps the corpus
        # order stable between runs.
        for filename in sorted(glob.glob(os.path.join(ddir, "*"))):
            try:
                txt = _read_email_text(filename)
            except UnicodeDecodeError:
                # Record and skip the file, so that a stale ``txt`` from the
                # previous iteration is never appended in its place.
                undecoded[datatype].append(filename)
                continue
            container["text_accumulator"].append(txt)

In [68]:
# Concatenate the per-class texts (spam first, then ham) and build the
# parallel label list. Labels are read from each class's "response" entry in
# ``acc`` so they are defined in exactly one place.
all_emails = [
    txt
    for name in ("spam", "ham")
    for txt in acc[name]["text_accumulator"]
]
targets = [
    acc[name]["response"]
    for name in ("spam", "ham")
    for _ in acc[name]["text_accumulator"]
]

In [112]:
test_size = 0.2
# Fixed seed so the train/test partition, and therefore the accuracy reported
# further down, is the same after every Restart & Run All.
split_seed = 42

# One stratified shuffle split: the spam/ham proportions given by ``targets``
# are preserved in both the training and the test partition.
stratshuf = model_selection.StratifiedShuffleSplit(
    n_splits=1, test_size=test_size, random_state=split_seed
)
train_ix, test_ix = next(stratshuf.split(all_emails, targets))

# Keep the raw texts under their own names so X_train / X_test only ever
# refer to the numeric feature matrices.
train_texts = [all_emails[ix] for ix in train_ix]
test_texts = [all_emails[ix] for ix in test_ix]

# The vocabulary is learned from the training texts only; the test texts are
# projected onto that same vocabulary.
vec = feature_extraction.text.CountVectorizer()
# NOTE(review): .toarray() materialises the full dense document-term matrix.
# If memory becomes a problem, consider keeping the sparse result instead --
# confirm first that every downstream consumer accepts sparse input.
X_train = vec.fit_transform(train_texts).toarray()
X_test = vec.transform(test_texts).toarray()

y_train = np.array([targets[ix] for ix in train_ix])
y_test = np.array([targets[ix] for ix in test_ix])

In [130]:
# Build a Bernoulli naive Bayes classifier with its default settings and fit
# it on the training matrix. The fit() call is left as the cell's last
# expression so the fitted estimator is displayed.
nb_clf = naive_bayes.BernoulliNB()
nb_clf.fit(X=X_train, y=y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [133]:
# Predict labels for the held-out set and report the fraction predicted
# correctly, as a percentage rounded to two decimals.
y_pred = nb_clf.predict(X=X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy is {round(accuracy*100, 2)}%")

Accuracy is 87.59%
