## Implementing a Neural Network

**(Run this cell to define useful Latex macros)**
\\[
\newcommand{\bigoh}[1]{\mathcal{O}\left(#1\right)}
\newcommand{\card}[1]{\left\lvert#1\right\rvert}
\newcommand{\condbar}[0]{\,\big|\,}
\newcommand{\eprob}[1]{\widehat{\text{Pr}}\left[#1\right]}
\newcommand{\norm}[1]{\left\lvert\left\lvert#1\right\rvert\right\rvert}
\newcommand{\prob}[1]{\text{Pr}\left[#1\right]}
\newcommand{\pprob}[2]{\text{Pr}_{#1}\left[#2\right]}
\newcommand{\set}[1]{\left\{#1\right\}}
\newcommand{\trans}[0]{^\intercal}
\newcommand{\fpartial}[2]{\frac{\partial #1}{\partial #2}}
\\]

Below is just our standard code to load the Enron email dataset.

In [1]:
import os
import os.path

DATA_DIR = os.path.join(
    os.getcwd(),
    "data/"
)


In [2]:
import os

# Simple email class. All it does is allow you to read an email's text.
class Email:
    def __init__(self, path, label):
        self.path = path
        self.label = label

    def text_content(self):
        return type(self).read_text_content(self.path)
    
    def word_counts(self):
        counts = {}
        for word in self.text_content().split():
            if word not in counts:
                counts[word] = 0
            counts[word] += 1
            
        return counts

    @classmethod
    def read(cls, path, label):
        return Email(
            path = path,
            label = label
        )

    @classmethod
    def read_text_content(cls, path):
        full_path = os.path.join(DATA_DIR, path)
        # Grr! Emails are encoded in Latin-1, not UTF-8. Python
        # (rightly) freaks out.
        with open(full_path, "r", encoding = "iso-8859-1") as f:
            try:
                return f.read()
            except:
                print(f"Error with: {path}")
                raise


In [3]:
import numpy as np
from sortedcontainers import SortedSet

# This counts how many emails each word occurs in.
def build_word_reaches(ham_emails, spam_emails):
    word_reaches = {}
    for emails in (ham_emails, spam_emails):
        for email in emails:
            for (word, _) in email.word_counts().items():
                if word not in word_reaches:
                    word_reaches[word] = 0
                # No matter how frequent in the email, only counts once per email.
                word_reaches[word] += 1
                
    return word_reaches

# Throw away those emails that don't occur in at least 100 emails.
# Throwing out low reach features means:
# (1) Less chance for overfitting
# (2) Smaller feature vectors, faster, less memory use.
def filter_words(word_reaches, limit = 100):
    filtered_words = SortedSet()
    for (word, word_reach) in word_reaches.items():
        if word_reach >= limit:
            filtered_words.add(word)

    return filtered_words

# Bidirectional map. Limits to just filtered words, though.
class FilteredWordEncodingDictionary:
    def __init__(self, filtered_words):
        self.word_to_code_dict = {}
        self.code_to_word_dict = {}

        for word in filtered_words:
            self.insert_word(word)

    # Only meant to be called when constructing the dictionary.
    def insert_word(self, word):
        if word not in self.word_to_code_dict:
            code = len(self.word_to_code_dict)
            self.word_to_code_dict[word] = code
            self.code_to_word_dict[code] = word

    def word_to_code(self, word):
        if word not in self.word_to_code_dict:
            return None

        return self.word_to_code_dict[word]

    def code_to_word(self, code):
        if code not in self.code_to_word_dict:
            raise f"Code {code} not recorded!"

        return self.code_to_word_dict[code]

    # This returns a vector of ones and zeros.
    def encode_text(self, text):
        codes = np.zeros(len(self.code_to_word_dict))

        for word in text.split():
            code = self.word_to_code(word)
            if code is not None:
                codes[code] = 1.0

        return codes
    
    def __len__(self):
        return len(self.code_to_word_dict)

In [4]:
# This is a simple subclass of Email that just encodes the words in an email.
class EncodedEmail(Email):
    def __init__(self, path, label, word_encoding_dictionary):
        super().__init__(path, label)

        self.codes = (
            word_encoding_dictionary.encode_text(
                self.text_content()
            )
        )


In [5]:
import os.path
import pickle

class Dataset:
    DATA_FILE_PATH = os.path.join(DATA_DIR, 'lr_data.p')
    WORD_REACH_LIMIT = 100

    def __init__(
            self, word_encoding_dictionary, encoded_ham_emails, encoded_spam_emails
    ):
        self.word_encoding_dictionary = word_encoding_dictionary
        self.encoded_ham_emails = encoded_ham_emails
        self.encoded_spam_emails = encoded_spam_emails
        
    def __len__(self):
        return len(self.encoded_ham_emails) + len(self.encoded_spam_emails)
    
    @classmethod
    def encode(cls, ham_emails, spam_emails):
        # Count words, select which we will keep.
        word_reaches = build_word_reaches(ham_emails, spam_emails)
        filtered_words = filter_words(word_reaches, limit = cls.WORD_REACH_LIMIT)
        
        # Assign codes to all words.
        word_encoding_dictionary = FilteredWordEncodingDictionary(filtered_words)
        
        # Encode each email as a vector of ones and zeros.
        encoded_ham_emails = [
            EncodedEmail(e.path, e.label, word_encoding_dictionary)
            for
            e in ham_emails
        ]
        encoded_spam_emails = [
            EncodedEmail(e.path, e.label, word_encoding_dictionary)
            for
            e in spam_emails
        ]
        
        # Construct the object!
        return cls(
            word_encoding_dictionary,
            encoded_ham_emails,
            encoded_spam_emails
        )

    INSTANCE = None
    @classmethod
    def get(cls):
        if not cls.INSTANCE:
            with open(cls.DATA_FILE_PATH, 'rb') as f:
                cls.INSTANCE = pickle.load(f)
        return cls.INSTANCE


In [6]:
import os
import os.path
import pickle
from urllib.request import urlretrieve

ENRON_SPAM_URL = (
    "http://csmining.org/index.php/"
    "enron-spam-datasets.html"
    "?file=tl_files/Project_Datasets/Enron-Spam%20datasets/Preprocessed"
    "/enron1.tar.tar"
)

TAR_FILE_NAME = "enron1.tar.tar"
ENRON_DATA_DIR_NAME = "enron1"

def download_tarfile():
    tarfile_path = os.path.join(DATA_DIR, TAR_FILE_NAME)
    if os.path.isfile(tarfile_path):
        print("Tarfile already downloaded!")
        return

    print("Downloading enron1.tar.tar")
    urlretrieve(ENRON_SPAM_URL, tarfile_path)
    print("Download complete!")

def extract_tarfile():
    tarfile_path = os.path.join(DATA_DIR, TAR_FILE_NAME)
    enron_data_dir = os.path.join(DATA_DIR, ENRON_DATA_DIR_NAME)
    if os.path.isdir(enron_data_dir):
        print("Tarfile already extracted!")
        return

    print("Extracting enron1.tar.tar")
    os.system(f"tar -xf {tarfile_path} -C {DATA_DIR}")
    print("Extraction complete!")

def read_emails_dir(path, label):
    emails = []
    for email_fname in os.listdir(os.path.join(DATA_DIR, path)):
        email_path = os.path.join(path, email_fname)
        email = Email.read(
            path = email_path,
            label = label
        )
        emails.append(email)

    return emails

def build_dataset():
    ham_emails = read_emails_dir(
        path = os.path.join(ENRON_DATA_DIR_NAME, "ham"),
        label = 0
    )
    spam_emails = read_emails_dir(
        path = os.path.join(ENRON_DATA_DIR_NAME, "spam"),
        label = 1
    )

    return Dataset.encode(
        ham_emails = ham_emails,
        spam_emails = spam_emails
    )

def save_dataset(dataset):
    with open(Dataset.DATA_FILE_PATH, "wb") as f:
        pickle.dump(dataset, f)

def build_and_save_dataset():
    if os.path.isfile(Dataset.DATA_FILE_PATH):
        print("Dataset already processed!")
        return

    print("Reading and processing emails!")
    dataset = build_dataset()
    save_dataset(dataset)
    print("Dataset created!")

download_tarfile()
extract_tarfile()
build_and_save_dataset()


Tarfile already downloaded!
Tarfile already extracted!
Dataset already processed!


In [7]:
import zlib

class DatasetSplitter:
    @classmethod
    def split(cls, dataset, ratio):
        datasetA = cls._split(dataset, ratio, 0)
        datasetB = cls._split(dataset, ratio, 1)
        return (datasetA, datasetB)

    @classmethod
    def _split(cls, dataset, ratio, mode):
        split_encoded_ham_emails, split_encoded_spam_emails = [], []
        emails_pairs = [
            (dataset.encoded_ham_emails, split_encoded_ham_emails),
            (dataset.encoded_spam_emails, split_encoded_spam_emails)
        ]

        for (emails, split_emails) in emails_pairs:
            for email in emails:
                # This is a fancy way to pseudorandomly but
                # deterministically select emails. That way we always
                # pick the same set of emails for reproducability
                # across program runs.
                h = zlib.crc32(email.path.encode())
                p = h / (2**32 - 1)
                if (mode == 0 and p < ratio) or (mode == 1 and p >= ratio):
                    split_emails.append(email)

        return Dataset(
            dataset.word_encoding_dictionary,
            encoded_ham_emails = split_encoded_ham_emails,
            encoded_spam_emails = split_encoded_spam_emails
        )


In [8]:
import random

class DatasetBatcher:
    @classmethod
    def batch(cls, d, batch_size):
        emails = d.encoded_ham_emails + d.encoded_spam_emails
        random.shuffle(emails)
        idxs = range(0, len(emails), batch_size)
        return [emails[idx:(idx + batch_size)] for idx in idxs]

In [9]:
from collections import namedtuple
import itertools

d = Dataset.get()
training_dataset, test_dataset = DatasetSplitter.split(d, 0.80)

VOCAB_SIZE = len(d.word_encoding_dictionary)
NUM_HIDDEN_UNITS = 128

THETA1 = np.random.normal(
    scale = 2 / np.sqrt(NUM_HIDDEN_UNITS + VOCAB_SIZE),
    size = (NUM_HIDDEN_UNITS, VOCAB_SIZE)
)
B1 = np.zeros(NUM_HIDDEN_UNITS)
THETA2 = np.random.normal(
    scale = 2 / np.sqrt(1 + NUM_HIDDEN_UNITS),
    size = (1, NUM_HIDDEN_UNITS)
)
B2 = np.zeros(1)

BATCH_SIZE = 128

ForwardResult = namedtuple('ForwardResult', 'x z2 a2 z3 a3')

def logistic(z):
    return 1 / (1 + np.exp(-z))

def forward(email):
    z2 = THETA1.dot(email.codes)
    z2 += B1
    a2 = logistic(z2)
    
    z3 = THETA2.dot(a2)
    z3 += B2
    a3 = logistic(z3)
    
    return ForwardResult(
        x = email.codes,
        z2 = z2,
        a2 = a2,
        z3 = z3,
        a3 = a3
    )

BackwardResult = namedtuple('BackwardResult', 'da3, dz3, d_theta2, db2, da2, dz2, d_theta1, db1')

def backward(forward_result, label):
    da3 = dxe_error(forward_result.a3, label)
    dz3 = dlogistic(forward_result.z3) * da3
    d_theta2 = np.outer(dz3, forward_result.a2)
    db2 = dz3
    da2 = np.dot(THETA2.T, dz3)
    dz2 = dlogistic(forward_result.z2) * da2
    d_theta1 = np.outer(dz2, forward_result.x)
    db1 = dz2
    
    return BackwardResult(
        da3 = da3,
        dz3 = dz3,
        d_theta2 = d_theta2,
        db2 = db2,
        da2 = da2,
        dz2 = dz2,
        d_theta1 = d_theta1,
        db1 = db1
    )

def xe_error(prob, label):
    if label == 1:
        return -np.log(prob)
    else:
        return -np.log(1 - prob)

def dxe_error(prob, label):
    if label == 1:
        return -(1 / prob)
    else:
        return (1 / (1 - prob))

def dlogistic(z):
    return logistic(z) * logistic(1 - z)

PartialsResult = namedtuple('PartialsResult', 'd_theta1, db1, d_theta2, db2')

def partials(emails):
    d_theta1 = np.zeros_like(THETA1)
    db1 = np.zeros_like(B1)
    d_theta2 = np.zeros_like(THETA2)
    db2 = np.zeros_like(B2)

    for email in emails:
        forward_result = forward(email)
        backward_result = backward(forward_result, email.label)
        d_theta1 += backward_result.d_theta1
        db1 += backward_result.db1
        d_theta2 += backward_result.d_theta2
        db2 += backward_result.db2

    return PartialsResult(
        d_theta1 = d_theta1,
        db1 = db1,
        d_theta2 = d_theta2,
        db2 = db2
    )

def train(training_dataset, test_dataset, epochs, learning_rate):
    global THETA1, B1, THETA2, B2
    
    batches = DatasetBatcher.batch(training_dataset, BATCH_SIZE)

    for epoch_idx in range(1, 1 + epochs):
        for batch in batches:
            partials_result = partials(batch)
            THETA1 -= (learning_rate / len(training_dataset)) * partials_result.d_theta1
            B1 -= (learning_rate / len(training_dataset)) * partials_result.db1
            THETA2 -= (learning_rate / len(training_dataset)) * partials_result.d_theta2
            B2 -= (learning_rate / len(training_dataset)) * partials_result.db2

        error = dataset_avg_error(test_dataset)
        recall = dataset_recall(test_dataset)

        print(
            f"{epoch_idx}: {error:0.2f} {recall:0.2f}@fpr={FPR_RATE}"
        )

def dataset_avg_error(d):
    total_error = 0.0
    for email in itertools.chain(d.encoded_ham_emails, d.encoded_spam_emails):
        forward_result = forward(email)
        total_error += xe_error(forward_result.a3[0], email.label)

    return total_error / len(d)

FPR_RATE = 0.0001
def dataset_recall(d):
    ham_scores = [forward(email).a3 for email in d.encoded_ham_emails]
    ham_scores.sort()
    spam_scores = [forward(email).a3 for email in d.encoded_spam_emails]
    
    cutoff = ham_scores[int(FPR_RATE * len(ham_scores))]
    num_spam_identified = np.sum([
        1 if score > cutoff else 0 for score in spam_scores
    ])
    
    return num_spam_identified / len(d.encoded_spam_emails)

train(training_dataset, test_dataset, 100, 10.00)

1: 0.30 1.00@fpr=0.0001
2: 0.19 1.00@fpr=0.0001
3: 0.15 1.00@fpr=0.0001
4: 0.12 1.00@fpr=0.0001
5: 0.11 1.00@fpr=0.0001


KeyboardInterrupt: 

In [11]:
import keras.backend as K
import keras.callbacks
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
import random

training_dataset, validation_dataset = DatasetSplitter.split(d, 0.80)
training_emails = training_dataset.encoded_ham_emails + training_dataset.encoded_spam_emails
random.shuffle(training_emails)

training_X = np.zeros(shape = (len(training_emails), VOCAB_SIZE))
training_y = np.zeros(shape = len(training_emails))
for idx, email in enumerate(training_emails):
    training_X[idx, :] = email.codes
    training_y[idx] = email.label

validation_emails = validation_dataset.encoded_ham_emails + validation_dataset.encoded_spam_emails
random.shuffle(validation_emails)
validation_X = np.zeros(shape = (len(validation_emails), VOCAB_SIZE))
validation_y = np.zeros(shape = len(validation_emails))
for idx, email in enumerate(validation_emails):
    validation_X[idx, :] = email.codes
    validation_y[idx] = email.label

model = Sequential()
model.add(Dense(
    128,
    activation = 'sigmoid',
    input_shape = (VOCAB_SIZE,)
))
model.add(Dense(
    1,
    activation = 'sigmoid'
))

FPR_RATE = 0.0001
class RecallRate(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        scores = self.model.predict(self.validation_data[0])
        predictions = np.round(scores)
        targets = self.validation_data[1]

        ham_scores = scores[np.logical_not(targets)]
        cutoff = ham_scores[int(FPR_RATE * len(ham_scores))]
        
        num_true_positives = np.sum(
            scores[targets.astype(np.bool)] > cutoff
        )
        recall = num_true_positives / np.sum(targets)
        
        print(f">> FPR: {FPR_RATE}")
        print(f">> Recall: {recall:0.2f}")

def enhanced_crossentropy(true_y, pred_y):
    true_bits = K.sum(
        -K.log(
            pred_y * true_y
            + (1 - true_y)
        )
    )
    false_bits = K.sum(
        -K.log(
            (1 - pred_y) * (1 - true_y)
            + true_y
        )
    )

    # Weight true misaccuracy much more heavily
    return true_bits + 4 * false_bits

model.compile(
    loss = 'binary_crossentropy',
    optimizer = SGD(),
    metrics = ['accuracy'],
)

model.fit(
    training_X,
    training_y,
    epochs = 100,
    batch_size = 128,
    validation_data = (validation_X, validation_y),
    callbacks = [RecallRate()],
    verbose = 2
)

Train on 4181 samples, validate on 991 samples
Epoch 1/100
>> FPR: 0.0001
>> Recall: 0.14
 - 1s - loss: 0.6618 - acc: 0.6171 - val_loss: 0.5774 - val_acc: 0.7296
Epoch 2/100
>> FPR: 0.0001
>> Recall: 0.26
 - 0s - loss: 0.5936 - acc: 0.7053 - val_loss: 0.5683 - val_acc: 0.7296
Epoch 3/100
>> FPR: 0.0001
>> Recall: 0.37
 - 0s - loss: 0.5866 - acc: 0.7053 - val_loss: 0.5613 - val_acc: 0.7296
Epoch 4/100
>> FPR: 0.0001
>> Recall: 0.46
 - 0s - loss: 0.5797 - acc: 0.7053 - val_loss: 0.5555 - val_acc: 0.7296
Epoch 5/100
>> FPR: 0.0001
>> Recall: 0.59
 - 0s - loss: 0.5732 - acc: 0.7053 - val_loss: 0.5488 - val_acc: 0.7296
Epoch 6/100
>> FPR: 0.0001
>> Recall: 0.67
 - 0s - loss: 0.5667 - acc: 0.7053 - val_loss: 0.5427 - val_acc: 0.7296
Epoch 7/100
>> FPR: 0.0001
>> Recall: 0.73
 - 0s - loss: 0.5600 - acc: 0.7053 - val_loss: 0.5370 - val_acc: 0.7296
Epoch 8/100
>> FPR: 0.0001
>> Recall: 0.79
 - 0s - loss: 0.5538 - acc: 0.7053 - val_loss: 0.5317 - val_acc: 0.7296
Epoch 9/100
>> FPR: 0.0001
>> Rec

KeyboardInterrupt: 