In [1]:
import os
import os.path

# Root directory for every file this notebook reads or writes; it is the
# "data/" folder inside the current working directory.
DATA_DIR = os.path.join(os.getcwd(), "data/")


In [2]:
import os

# Simple email class. All it does is allow you to read an email's text.
class Email:
    """A labelled email whose text lives in a file under ``DATA_DIR``.

    The text is not cached: every call to ``text_content`` (and therefore
    ``word_counts``) re-reads the file from disk.
    """

    def __init__(self, path, label):
        # path: file location relative to DATA_DIR.
        # label: class label supplied by the caller.
        self.path = path
        self.label = label

    def text_content(self):
        """Return the full text of the email file as a string."""
        return type(self).read_text_content(self.path)

    def word_counts(self):
        """Return a dict mapping each whitespace-separated token to its count.

        Keys appear in order of first occurrence in the text.
        """
        from collections import Counter

        # Convert back to a plain dict so callers get the same type as before.
        return dict(Counter(self.text_content().split()))

    @classmethod
    def read(cls, path, label):
        """Build an ``Email`` for ``path`` with the given ``label``.

        Always returns a plain ``Email`` (not ``cls``), so subclasses with a
        different constructor signature can still call it.
        """
        return Email(
            path = path,
            label = label
        )

    @classmethod
    def read_text_content(cls, path):
        """Read and return the text of ``DATA_DIR/path``.

        Prints ``Error with: <path>`` and re-raises if the file cannot be
        opened or read.
        """
        full_path = os.path.join(DATA_DIR, path)
        # Grr! Emails are encoded in Latin-1, not UTF-8. Python
        # (rightly) freaks out.
        try:
            # open() is inside the try so that a missing or unreadable file
            # is reported the same way as a failed read.
            with open(full_path, "r", encoding = "iso-8859-1") as f:
                return f.read()
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit pass
            # through without a misleading message.
            print(f"Error with: {path}")
            raise


In [3]:
import numpy as np
from sortedcontainers import SortedSet

def build_word_reaches(ham_emails, spam_emails):
    """Count, for every word, the number of emails that contain it.

    Both collections are scanned (ham first, then spam). A word contributes
    at most one to its total per email, however often it repeats inside
    that email. Each email must provide a ``word_counts()`` method that
    returns a mapping keyed by word.

    Returns a dict of word -> number of emails containing the word.
    """
    reaches = {}
    for group in (ham_emails, spam_emails):
        for message in group:
            # Only the distinct words matter here; per-email counts are ignored.
            for token in message.word_counts():
                reaches[token] = reaches.get(token, 0) + 1

    return reaches

def filter_words(word_reaches, limit = 100):
    """Keep only the words that occur in at least ``limit`` emails.

    Throwing out low-reach words means:
    (1) Less chance for overfitting
    (2) Smaller feature vectors, faster, less memory use.

    ``word_reaches`` maps word -> number of emails containing it. The
    surviving words are returned as a ``SortedSet``.
    """
    return SortedSet(
        word
        for (word, reach) in word_reaches.items()
        if reach >= limit
    )

# Bidirectional map. Limits to just filtered words, though.
class FilteredWordEncodingDictionary:
    """Two-way mapping between words and consecutive integer codes.

    Codes are assigned 0, 1, 2, ... in the order the words are iterated at
    construction time. Words that were never inserted have no code.
    """

    def __init__(self, filtered_words):
        """Assign a code to every word in the iterable ``filtered_words``."""
        self.word_to_code_dict = {}
        self.code_to_word_dict = {}

        for word in filtered_words:
            self.insert_word(word)

    # Only meant to be called when constructing the dictionary.
    def insert_word(self, word):
        """Give ``word`` the next free code; a repeated word is ignored."""
        if word not in self.word_to_code_dict:
            # The next free code equals the number of words stored so far.
            code = len(self.word_to_code_dict)
            self.word_to_code_dict[word] = code
            self.code_to_word_dict[code] = word

    def word_to_code(self, word):
        """Return the code for ``word``, or ``None`` if it is not recorded."""
        return self.word_to_code_dict.get(word)

    def code_to_word(self, code):
        """Return the word stored under ``code``.

        Raises:
            KeyError: if ``code`` was never assigned.
        """
        if code not in self.code_to_word_dict:
            # Raising a bare string is itself a TypeError in Python 3, so
            # raise a real exception that carries the intended message.
            raise KeyError(f"Code {code} not recorded!")

        return self.code_to_word_dict[code]

    # This returns a vector of ones and zeros.
    def encode_text(self, text):
        """Encode ``text`` as a presence vector over the recorded words.

        Returns a 1-D float array with one entry per recorded word: 1.0 where
        the word appears among the whitespace-separated tokens of ``text``,
        0.0 elsewhere. Unrecorded tokens are skipped.
        """
        codes = np.zeros(len(self.code_to_word_dict))

        for word in text.split():
            code = self.word_to_code(word)
            if code is not None:
                codes[code] = 1.0

        return codes

    def __len__(self):
        """Number of recorded words (also the length of an encoded vector)."""
        return len(self.code_to_word_dict)

In [4]:
class EncodedEmail(Email):
    """An ``Email`` that also stores its encoded form in ``self.codes``.

    The encoding is computed once, at construction, by passing the email's
    text to ``word_encoding_dictionary.encode_text``.
    """

    def __init__(self, path, label, word_encoding_dictionary):
        super().__init__(path, label)

        encode = word_encoding_dictionary.encode_text
        self.codes = encode(self.text_content())

In [5]:
import os.path
import pickle

class Dataset:
    """Encoded ham and spam emails plus the dictionary that encoded them.

    ``len(dataset)`` is the total number of emails (ham + spam).
    """

    # Where the pickled dataset is stored / loaded from.
    DATA_FILE_PATH = os.path.join(DATA_DIR, 'lr_data.p')
    # Minimum number of emails a word must appear in to be kept as a feature.
    WORD_REACH_LIMIT = 100

    def __init__(
            self, word_encoding_dictionary, encoded_ham_emails, encoded_spam_emails
    ):
        self.word_encoding_dictionary = word_encoding_dictionary
        self.encoded_ham_emails = encoded_ham_emails
        self.encoded_spam_emails = encoded_spam_emails

    def __len__(self):
        return len(self.encoded_ham_emails) + len(self.encoded_spam_emails)

    @classmethod
    def encode(cls, ham_emails, spam_emails):
        """Build a dataset from raw emails.

        Counts word reach over all emails, keeps the words that reach
        ``WORD_REACH_LIMIT``, and encodes every email against that vocabulary.
        """
        # Count words, select which we will keep.
        word_reaches = build_word_reaches(ham_emails, spam_emails)
        filtered_words = filter_words(word_reaches, limit = cls.WORD_REACH_LIMIT)

        # Assign codes to all words.
        word_encoding_dictionary = FilteredWordEncodingDictionary(filtered_words)

        # Encode each email as a vector of ones and zeros.
        encoded_ham_emails = [
            EncodedEmail(e.path, e.label, word_encoding_dictionary)
            for e in ham_emails
        ]
        encoded_spam_emails = [
            EncodedEmail(e.path, e.label, word_encoding_dictionary)
            for e in spam_emails
        ]

        # Construct the object!
        return cls(
            word_encoding_dictionary,
            encoded_ham_emails,
            encoded_spam_emails
        )

    # Cache for the dataset loaded by get(); None until the first load.
    INSTANCE = None
    @classmethod
    def get(cls):
        """Return the dataset stored at ``DATA_FILE_PATH``, loading it once.

        The loaded object is cached in ``INSTANCE`` and reused afterwards.
        """
        # Compare against None explicitly: Dataset defines __len__, so an
        # empty dataset is falsy and would otherwise be reloaded every call.
        if cls.INSTANCE is None:
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # load a file this notebook wrote itself.
            with open(cls.DATA_FILE_PATH, 'rb') as f:
                cls.INSTANCE = pickle.load(f)
        return cls.INSTANCE


In [6]:
import os
import os.path
import pickle
from urllib.request import urlretrieve

# Name of the downloaded archive, and of the directory it unpacks into.
TAR_FILE_NAME = "enron1.tar.tar"
ENRON_DATA_DIR_NAME = "enron1"

# Download location of the preprocessed Enron-Spam "enron1" archive.
ENRON_SPAM_URL = (
    "http://csmining.org/index.php/enron-spam-datasets.html"
    "?file=tl_files/Project_Datasets/Enron-Spam%20datasets/Preprocessed/"
    + TAR_FILE_NAME
)

def download_tarfile():
    """Download the Enron archive into ``DATA_DIR`` unless it is already there.

    The archive is fetched to a temporary ``.part`` file and renamed only
    after the download finishes, so an interrupted download is never
    mistaken for a complete one on the next run.
    """
    tarfile_path = os.path.join(DATA_DIR, TAR_FILE_NAME)
    if os.path.isfile(tarfile_path):
        print("Tarfile already downloaded!")
        return

    # On a fresh checkout the data directory may not exist yet.
    os.makedirs(DATA_DIR, exist_ok = True)

    partial_path = tarfile_path + ".part"
    print("Downloading enron1.tar.tar")
    try:
        urlretrieve(ENRON_SPAM_URL, partial_path)
        os.replace(partial_path, tarfile_path)
    finally:
        # After a successful rename there is nothing left to remove; after
        # a failure this deletes the incomplete file.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download complete!")

def extract_tarfile():
    """Unpack the downloaded archive into ``DATA_DIR`` unless already done.

    Extraction is skipped when the ``enron1`` directory already exists.

    Raises:
        subprocess.CalledProcessError: if ``tar`` exits with a non-zero status.
    """
    import subprocess

    tarfile_path = os.path.join(DATA_DIR, TAR_FILE_NAME)
    enron_data_dir = os.path.join(DATA_DIR, ENRON_DATA_DIR_NAME)
    if os.path.isdir(enron_data_dir):
        print("Tarfile already extracted!")
        return

    print("Extracting enron1.tar.tar")
    # Pass an argument list with no shell, so paths containing spaces or
    # shell metacharacters are handled correctly. check=True makes a failed
    # extraction raise instead of being reported as complete.
    subprocess.run(
        ["tar", "-xf", tarfile_path, "-C", DATA_DIR],
        check = True
    )
    print("Extraction complete!")

def read_emails_dir(path, label):
    """Create one ``Email`` per entry of the directory ``DATA_DIR/path``.

    Every email gets the same ``label``; its ``path`` is kept relative to
    ``DATA_DIR`` (``path`` joined with the file name). Emails are returned
    in whatever order ``os.listdir`` yields the entries.
    """
    directory = os.path.join(DATA_DIR, path)
    return [
        Email.read(path = os.path.join(path, file_name), label = label)
        for file_name in os.listdir(directory)
    ]

def build_dataset():
    """Read the ham and spam directories and encode them into a ``Dataset``.

    Ham emails are labelled 0 and spam emails are labelled 1.
    """
    ham_dir = os.path.join(ENRON_DATA_DIR_NAME, "ham")
    spam_dir = os.path.join(ENRON_DATA_DIR_NAME, "spam")

    ham = read_emails_dir(path = ham_dir, label = 0)
    spam = read_emails_dir(path = spam_dir, label = 1)

    return Dataset.encode(ham_emails = ham, spam_emails = spam)

def save_dataset(dataset):
    """Pickle ``dataset`` to ``Dataset.DATA_FILE_PATH``, overwriting any old file."""
    destination = Dataset.DATA_FILE_PATH
    with open(destination, "wb") as out_file:
        pickle.dump(dataset, out_file)

def build_and_save_dataset():
    """Build and pickle the dataset, unless the pickle file already exists."""
    already_built = os.path.isfile(Dataset.DATA_FILE_PATH)
    if not already_built:
        print("Reading and processing emails!")
        save_dataset(build_dataset())
        print("Dataset created!")
    else:
        print("Dataset already processed!")

# Run the data pipeline in order: download, extract, then encode and pickle.
# Each step prints a message and returns early when its output is already
# on disk, so re-running this cell is cheap.
download_tarfile()
extract_tarfile()
build_and_save_dataset()

Downloading enron1.tar.tar
Download complete!
Extracting enron1.tar.tar
Extraction complete!
Reading and processing emails!
Dataset created!


In [7]:
import zlib

class DatasetSplitter:
    """Splits a ``Dataset`` into two parts in a reproducible way.

    Membership is decided by a CRC32 hash of each email's path, so the same
    emails land in the same part on every program run.
    """

    @classmethod
    def split(cls, dataset, ratio):
        """Return ``(datasetA, datasetB)``.

        ``datasetA`` receives the emails whose hash fraction is below
        ``ratio``; ``datasetB`` receives the rest.
        """
        below, at_or_above = (
            cls._split(dataset, ratio, mode) for mode in (0, 1)
        )
        return (below, at_or_above)

    @classmethod
    def _split(cls, dataset, ratio, mode):
        """Build the part selected by ``mode`` (0: below ``ratio``, 1: the rest).

        The new ``Dataset`` shares the original's word encoding dictionary.
        """
        def is_selected(email):
            # Map the path's CRC32 onto [0, 1]; this gives a pseudorandom
            # but deterministic position for each email.
            fraction = zlib.crc32(email.path.encode()) / (2**32 - 1)
            if mode == 0:
                return fraction < ratio
            return mode == 1 and fraction >= ratio

        chosen_ham = [e for e in dataset.encoded_ham_emails if is_selected(e)]
        chosen_spam = [e for e in dataset.encoded_spam_emails if is_selected(e)]

        return Dataset(
            dataset.word_encoding_dictionary,
            encoded_ham_emails = chosen_ham,
            encoded_spam_emails = chosen_spam
        )


In [8]:
# Load the pickled dataset (cached on the Dataset class after the first call).
d = Dataset.get()
# Deterministic hash-based split: emails whose hash fraction is below 0.80 go
# to the training set, the rest to the test set.
training_dataset, test_dataset = DatasetSplitter.split(d, 0.80)

# Constants
BATCH_SIZE = 128 # Explained later
# Width of the hidden layer (length of B1, second dimension of THETA2).
NUM_HIDDEN_UNITS = 128
# Number of words in the encoding dictionary, which is also the length of
# every email's encoded vector.
VOCAB_SIZE = len(d.word_encoding_dictionary)

In [9]:
# Weight matrices and biases
# Weights are drawn from a zero-mean normal with standard deviation
# sqrt(2 / (fan_in + fan_out)); biases start at zero.
# NOTE(review): no random seed is set here, so the initial weights differ
# between kernel runs.

# Input -> hidden weights. forward() computes THETA1.dot(codes) and adds B1,
# so THETA1 must have shape (NUM_HIDDEN_UNITS, VOCAB_SIZE). Without `size`
# np.random.normal returns a single scalar.
THETA1 = np.random.normal(
    scale = np.sqrt(2 / (NUM_HIDDEN_UNITS + VOCAB_SIZE)),
    size = (NUM_HIDDEN_UNITS, VOCAB_SIZE)
)
B1 = np.zeros(NUM_HIDDEN_UNITS)
# Hidden -> output weights: a single output unit.
THETA2 = np.random.normal(
    scale = np.sqrt(2 / (1 + NUM_HIDDEN_UNITS)),
    size = (1, NUM_HIDDEN_UNITS)
)
B2 = np.zeros(1)

In [11]:
def logistic(z):
    """Logistic (sigmoid) function 1 / (1 + e^(-z)), applied elementwise.

    Accepts a scalar or a NumPy array.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def xe_error(prob, label):
    """Cross-entropy error of the predicted probability ``prob``.

    Returns ``-log(prob)`` when ``label`` equals 1 and ``-log(1 - prob)``
    for any other label.
    """
    likelihood = prob if label == 1 else 1 - prob
    return -np.log(likelihood)

In [13]:
# Code to perform a forward pass
from collections import namedtuple

# Every intermediate value of one forward pass: the input vector x, the
# pre-activations z2 / z3, and the logistic activations a2 / a3.
ForwardResult = namedtuple('ForwardResult', 'x z2 a2 z3 a3')

def forward(email):
    """Run one encoded email through the two-layer network.

    Uses the module-level parameters THETA1, B1, THETA2 and B2, and returns
    a ``ForwardResult`` holding the input and each layer's pre-activation
    and activation.
    """
    inputs = email.codes

    # Hidden layer.
    hidden_pre = THETA1.dot(inputs) + B1
    hidden_act = logistic(hidden_pre)

    # Output layer.
    output_pre = THETA2.dot(hidden_act) + B2
    output_act = logistic(output_pre)

    return ForwardResult(
        x = inputs,
        z2 = hidden_pre,
        a2 = hidden_act,
        z3 = output_pre,
        a3 = output_act
    )