## Implementing a Neural Network

**(Run this cell to define useful LaTeX macros)**
\\[
\newcommand{\bigoh}[1]{\mathcal{O}\left(#1\right)}
\newcommand{\card}[1]{\left\lvert#1\right\rvert}
\newcommand{\condbar}[0]{\,\big|\,}
\newcommand{\eprob}[1]{\widehat{\text{Pr}}\left[#1\right]}
\newcommand{\norm}[1]{\left\lvert\left\lvert#1\right\rvert\right\rvert}
\newcommand{\prob}[1]{\text{Pr}\left[#1\right]}
\newcommand{\pprob}[2]{\text{Pr}_{#1}\left[#2\right]}
\newcommand{\set}[1]{\left\{#1\right\}}
\newcommand{\trans}[0]{^\intercal}
\newcommand{\fpartial}[2]{\frac{\partial #1}{\partial #2}}
\\]

Below is just our standard code to load the Enron email dataset.

In [1]:
import os
import os.path

# Every dataset file lives under "data/" inside the current working directory.
DATA_DIR = os.path.join(os.getcwd(), "data/")


In [2]:
import os

# Simple email class. All it does is allow you to read an email's text.
class Email:
    """A labelled email stored on disk, addressed relative to DATA_DIR.

    Attributes:
        path: Location of the email file, relative to DATA_DIR.
        label: Class label supplied by the caller.
    """

    def __init__(self, path, label):
        self.path = path
        self.label = label

    def text_content(self):
        """Read the email from disk and return its full text."""
        return type(self).read_text_content(self.path)

    def word_counts(self):
        """Return a dict mapping each whitespace-separated token to its count."""
        counts = {}
        for word in self.text_content().split():
            counts[word] = counts.get(word, 0) + 1

        return counts

    @classmethod
    def read(cls, path, label):
        """Build an Email for the given path and label; no file is read yet."""
        # Always constructs a plain Email (not cls): a subclass constructor
        # may require extra arguments, as EncodedEmail's does.
        return Email(path=path, label=label)

    @classmethod
    def read_text_content(cls, path):
        """Return the text of the file at DATA_DIR/path, decoded as Latin-1.

        Any failure to open or read the file is reported on stdout and then
        re-raised unchanged.
        """
        full_path = os.path.join(DATA_DIR, path)
        # Grr! Emails are encoded in Latin-1, not UTF-8. Python
        # (rightly) freaks out.
        try:
            # The open() call sits inside the try so that a missing or
            # unreadable file is reported too, not only a failing read().
            with open(full_path, "r", encoding="iso-8859-1") as f:
                return f.read()
        except Exception:
            print(f"Error with: {path}")
            raise


In [3]:
import numpy as np
from sortedcontainers import SortedSet

# Tallies, for every word, the number of emails that contain it.
def build_word_reaches(ham_emails, spam_emails):
    """Return a dict mapping each word to the number of emails it occurs in."""
    reaches = {}
    for group in (ham_emails, spam_emails):
        for message in group:
            # Each distinct word of an email shows up once in its counts, so
            # repeats inside a single email do not inflate the reach.
            for token in message.word_counts():
                reaches[token] = reaches.get(token, 0) + 1

    return reaches

# Keep only the words whose reach is at least `limit` emails.
# Dropping low-reach features means:
# (1) Less chance of overfitting.
# (2) Smaller feature vectors: faster and lighter on memory.
def filter_words(word_reaches, limit = 100):
    """Return a SortedSet of the words whose reach is >= limit."""
    return SortedSet(
        word for word, reach in word_reaches.items() if reach >= limit
    )

# Bidirectional map. Limits to just filtered words, though.
class FilteredWordEncodingDictionary:
    """Two-way mapping between words and consecutive integer codes.

    Codes are handed out in the order the words are inserted, starting at 0.

    Attributes:
        word_to_code_dict: dict from word to its integer code.
        code_to_word_dict: dict from integer code back to its word.
    """

    def __init__(self, filtered_words):
        """Assign a code to every word of the iterable `filtered_words`."""
        self.word_to_code_dict = {}
        self.code_to_word_dict = {}

        for word in filtered_words:
            self.insert_word(word)

    # Only meant to be called when constructing the dictionary.
    def insert_word(self, word):
        """Give `word` the next free code; a word already present is skipped."""
        if word not in self.word_to_code_dict:
            code = len(self.word_to_code_dict)
            self.word_to_code_dict[word] = code
            self.code_to_word_dict[code] = word

    def word_to_code(self, word):
        """Return the code of `word`, or None when the word is not recorded."""
        return self.word_to_code_dict.get(word)

    def code_to_word(self, code):
        """Return the word stored under `code`.

        Raises:
            KeyError: if `code` was never assigned.
        """
        if code not in self.code_to_word_dict:
            # A bare string cannot be raised in Python 3 (that is itself a
            # TypeError), so wrap the message in a real exception.
            raise KeyError(f"Code {code} not recorded!")

        return self.code_to_word_dict[code]

    # This returns a vector of ones and zeros.
    def encode_text(self, text):
        """Return a float vector with 1.0 at the code of every recorded word
        that occurs in `text` (split on whitespace) and 0.0 elsewhere."""
        codes = np.zeros(len(self.code_to_word_dict))

        for word in text.split():
            code = self.word_to_code(word)
            if code is not None:
                codes[code] = 1.0

        return codes

    def __len__(self):
        """Number of recorded words."""
        return len(self.code_to_word_dict)

In [4]:
# An Email that additionally carries the encoded form of its own text.
class EncodedEmail(Email):
    """Email whose `codes` attribute holds its text as encoded by the given
    word encoding dictionary."""

    def __init__(self, path, label, word_encoding_dictionary):
        super().__init__(path, label)

        # The file is read once, here, and only its encoding is kept.
        text = self.text_content()
        self.codes = word_encoding_dictionary.encode_text(text)


In [5]:
import os.path
import pickle

class Dataset:
    """Encoded ham and spam emails together with the dictionary that encoded them.

    Attributes:
        word_encoding_dictionary: dictionary used to encode the emails.
        encoded_ham_emails: list of encoded ham emails.
        encoded_spam_emails: list of encoded spam emails.
    """

    DATA_FILE_PATH = os.path.join(DATA_DIR, 'lr_data.p')
    WORD_REACH_LIMIT = 100
    # Cache filled by get(); stays None until the first successful load.
    INSTANCE = None

    def __init__(
            self, word_encoding_dictionary, encoded_ham_emails, encoded_spam_emails
    ):
        self.word_encoding_dictionary = word_encoding_dictionary
        self.encoded_ham_emails = encoded_ham_emails
        self.encoded_spam_emails = encoded_spam_emails

    def __len__(self):
        """Total number of emails, ham plus spam."""
        return len(self.encoded_ham_emails) + len(self.encoded_spam_emails)

    @classmethod
    def encode(cls, ham_emails, spam_emails):
        """Build a dataset from unencoded ham and spam emails.

        Words reaching fewer than WORD_REACH_LIMIT emails are dropped before
        codes are assigned.
        """
        # Count words, select which we will keep.
        word_reaches = build_word_reaches(ham_emails, spam_emails)
        filtered_words = filter_words(word_reaches, limit = cls.WORD_REACH_LIMIT)

        # Assign codes to all words.
        word_encoding_dictionary = FilteredWordEncodingDictionary(filtered_words)

        # Encode each email as a vector of ones and zeros.
        encoded_ham_emails = [
            EncodedEmail(e.path, e.label, word_encoding_dictionary)
            for e in ham_emails
        ]
        encoded_spam_emails = [
            EncodedEmail(e.path, e.label, word_encoding_dictionary)
            for e in spam_emails
        ]

        # Construct the object!
        return cls(
            word_encoding_dictionary,
            encoded_ham_emails,
            encoded_spam_emails
        )

    @classmethod
    def get(cls):
        """Return the dataset unpickled from DATA_FILE_PATH, loading it once."""
        # Compare against None explicitly: this class defines __len__, so an
        # empty dataset is falsy and would otherwise be re-read on every call.
        if cls.INSTANCE is None:
            # NOTE: unpickling can execute arbitrary code. Only load a file
            # that this notebook wrote itself.
            with open(cls.DATA_FILE_PATH, 'rb') as f:
                cls.INSTANCE = pickle.load(f)
        return cls.INSTANCE


In [6]:
import os
import os.path
import pickle
from urllib.request import urlretrieve

# URL the preprocessed "enron1" archive is fetched from.
ENRON_SPAM_URL = "".join([
    "http://csmining.org/index.php/enron-spam-datasets.html",
    "?file=tl_files/Project_Datasets/Enron-Spam%20datasets/Preprocessed",
    "/enron1.tar.tar",
])

# File name the archive is stored under inside DATA_DIR.
TAR_FILE_NAME = "enron1.tar.tar"
# Directory under DATA_DIR whose presence means "already extracted".
ENRON_DATA_DIR_NAME = "enron1"

def download_tarfile():
    """Download the Enron archive into DATA_DIR unless it is already there."""
    tarfile_path = os.path.join(DATA_DIR, TAR_FILE_NAME)
    if os.path.isfile(tarfile_path):
        print("Tarfile already downloaded!")
        return

    # The download cannot create a missing parent directory by itself.
    os.makedirs(DATA_DIR, exist_ok=True)

    print("Downloading enron1.tar.tar")
    # Fetch under a temporary name and rename only on success, so that an
    # interrupted download is not mistaken for a complete archive by the
    # isfile() check above on the next run.
    partial_path = tarfile_path + ".part"
    urlretrieve(ENRON_SPAM_URL, partial_path)
    os.replace(partial_path, tarfile_path)
    print("Download complete!")

def extract_tarfile():
    """Unpack the downloaded archive into DATA_DIR unless already unpacked.

    Raises:
        subprocess.CalledProcessError: if `tar` exits with a non-zero status.
    """
    import subprocess

    tarfile_path = os.path.join(DATA_DIR, TAR_FILE_NAME)
    enron_data_dir = os.path.join(DATA_DIR, ENRON_DATA_DIR_NAME)
    if os.path.isdir(enron_data_dir):
        print("Tarfile already extracted!")
        return

    print("Extracting enron1.tar.tar")
    # Pass the arguments as a list (no shell) so paths containing spaces or
    # shell metacharacters work, and let a failing `tar` raise instead of
    # silently reporting success.
    subprocess.run(
        ["tar", "-xf", tarfile_path, "-C", DATA_DIR],
        check=True,
    )
    print("Extraction complete!")

def read_emails_dir(path, label):
    """Return one Email, tagged with `label`, per entry of DATA_DIR/path.

    Each email's path is kept relative to DATA_DIR.
    """
    directory = os.path.join(DATA_DIR, path)
    return [
        Email.read(path=os.path.join(path, fname), label=label)
        for fname in os.listdir(directory)
    ]

def build_dataset():
    """Read the ham (label 0) and spam (label 1) directories and encode them
    into a Dataset."""
    ham_dir = os.path.join(ENRON_DATA_DIR_NAME, "ham")
    spam_dir = os.path.join(ENRON_DATA_DIR_NAME, "spam")

    ham_emails = read_emails_dir(path=ham_dir, label=0)
    spam_emails = read_emails_dir(path=spam_dir, label=1)

    return Dataset.encode(ham_emails=ham_emails, spam_emails=spam_emails)

def save_dataset(dataset):
    """Pickle `dataset` to Dataset.DATA_FILE_PATH, overwriting any old file."""
    out_path = Dataset.DATA_FILE_PATH
    with open(out_path, "wb") as pickle_file:
        pickle.dump(dataset, pickle_file)

def build_and_save_dataset():
    """Build and pickle the dataset unless the pickle file already exists."""
    already_built = os.path.isfile(Dataset.DATA_FILE_PATH)
    if not already_built:
        print("Reading and processing emails!")
        save_dataset(build_dataset())
        print("Dataset created!")
    else:
        print("Dataset already processed!")

# Run the three preparation steps in order. Each one prints a message and
# returns early when its output already exists.
download_tarfile()
extract_tarfile()
build_and_save_dataset()


Tarfile already downloaded!
Tarfile already extracted!
Dataset already processed!


In [7]:
import zlib

class DatasetSplitter:
    """Deterministically partitions a Dataset into two Datasets."""

    @classmethod
    def split(cls, dataset, ratio):
        """Return (A, B): A holds the emails whose path hash falls below
        `ratio`, B holds all the others."""
        return (cls._split(dataset, ratio, 0), cls._split(dataset, ratio, 1))

    @classmethod
    def _split(cls, dataset, ratio, mode):
        """Return the half of `dataset` selected by `mode` (0: below `ratio`,
        1: at or above `ratio`)."""

        def selected(email):
            # Hash the path into [0, 1]. This picks emails pseudorandomly
            # but deterministically, so the same emails are chosen on every
            # program run (reproducibility).
            fraction = zlib.crc32(email.path.encode()) / (2**32 - 1)
            if mode == 0:
                return fraction < ratio
            if mode == 1:
                return fraction >= ratio
            return False

        ham = [e for e in dataset.encoded_ham_emails if selected(e)]
        spam = [e for e in dataset.encoded_spam_emails if selected(e)]

        return Dataset(
            dataset.word_encoding_dictionary,
            encoded_ham_emails=ham,
            encoded_spam_emails=spam,
        )


In [8]:
import random

class DatasetBatcher:
    """Cuts a dataset into randomly ordered batches."""

    @classmethod
    def batch(cls, d, batch_size):
        """Shuffle all emails of `d` and return them as a list of lists of at
        most `batch_size` emails each; only the last one may be shorter."""
        # Build a fresh list so the dataset's own lists are not reordered.
        pool = [*d.encoded_ham_emails, *d.encoded_spam_emails]
        random.shuffle(pool)

        batches = []
        for start in range(0, len(pool), batch_size):
            batches.append(pool[start:start + batch_size])
        return batches

### Begin The Neural Network!

At this point we can load the dataset. Every email is represented as a word vector and a label of whether it is spam. A position in the word vector is set to 1.0 if the corresponding word appears in the email; 0.0 if not.

Only words that appear in at least 100 emails (their "reach") have been kept.

In [9]:
# Load the pickled dataset (cached after the first call) and split it
# deterministically: emails whose path hash falls below 0.80 form the
# training set, the rest form the test set.
d = Dataset.get()
training_dataset, test_dataset = DatasetSplitter.split(d, 0.80)

# Constants
BATCH_SIZE = 128 # Number of emails per gradient step. Explained later
NUM_HIDDEN_UNITS = 128  # Width of the single hidden layer.
VOCAB_SIZE = len(d.word_encoding_dictionary)  # Length of each email's code vector.


**Parameters**

Here I have set the weight matrices to have random values. I sampled the values from the normal distribution, using a standard deviation that has been found to work well. This is called the "Glorot initialization" after the researcher who proposed it.

In [10]:
# Weight matrices and biases
# Each weight matrix is drawn from a zero-mean normal distribution with
# standard deviation sqrt(2 / (rows + columns)); the biases start at zero.
# THETA1 maps the input vector to the hidden layer: (NUM_HIDDEN_UNITS, VOCAB_SIZE).
THETA1 = np.random.normal(
    scale = np.sqrt(2/ (NUM_HIDDEN_UNITS + VOCAB_SIZE)),
    size = (NUM_HIDDEN_UNITS, VOCAB_SIZE)
)
B1 = np.zeros(NUM_HIDDEN_UNITS)
# THETA2 maps the hidden layer to the single output unit: (1, NUM_HIDDEN_UNITS).
THETA2 = np.random.normal(
    scale = np.sqrt(2 / (1 + NUM_HIDDEN_UNITS)),
    size = (1, NUM_HIDDEN_UNITS)
)
B2 = np.zeros(1)


**Helper Functions**

Here are a number of helpful functions that I will use. I will also need their derivatives.

In [11]:
# Helper functions and their derivatives.

def xe_error(prob, label):
    """Cross-entropy error of predicting `prob` for the positive class when
    the true class is `label` (1 means positive, anything else negative)."""
    # Probability the model assigned to the class that actually occurred.
    likelihood = prob if label == 1 else 1 - prob
    return -np.log(likelihood)

def dxe_error(prob, label):
    """Derivative of the cross-entropy error with respect to `prob`."""
    if label != 1:
        # d/dp of -log(1 - p)
        return (1 / (1 - prob))
    # d/dp of -log(p)
    return -(1 / prob)

def logistic(z):
    """Logistic (sigmoid) function 1 / (1 + e^(-z)), applied elementwise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def dlogistic(z):
    """Derivative of the logistic function at `z`: s * (1 - s), s = logistic(z)."""
    s = logistic(z)
    return s * (1 - s)


**Metrics**

These functions report how well we are doing. I keep track of the average cross entropy, as well as the recall rate for a 1% false positive rate.

We will train the model by minimizing the cross entropy, but ultimately we want to block emails, so it makes sense to report both.

In [12]:
# Metrics to report on performance
import itertools

def dataset_avg_xe_error(d):
    """Mean cross-entropy error of the current network over every email in `d`."""
    running_total = 0.0
    for group in (d.encoded_ham_emails, d.encoded_spam_emails):
        for message in group:
            # a3 holds a single value: the predicted probability.
            predicted = forward(message).a3[0]
            running_total += xe_error(predicted, message.label)

    return running_total / len(d)

# Target false positive rate at which recall is reported.
FPR_RATE = 0.01
def dataset_recall(d):
    """Return (false positive rate, recall) of the current network on `d`.

    The score cutoff is the ham score found at position FPR_RATE * (number of
    ham emails) when ham scores are ordered from highest to lowest; an email
    counts as flagged when its score is strictly above that cutoff.
    """
    ham_scores = sorted(
        (forward(email).a3 for email in d.encoded_ham_emails),
        reverse=True,
    )
    spam_scores = [forward(email).a3 for email in d.encoded_spam_emails]

    cutoff_position = int(FPR_RATE * len(ham_scores))
    cutoff = ham_scores[cutoff_position]

    def count_above(scores):
        # Number of scores strictly above the cutoff.
        return np.sum([1 if score > cutoff else 0 for score in scores])

    false_positive_rate = count_above(ham_scores) / len(d.encoded_ham_emails)
    recall = count_above(spam_scores) / len(d.encoded_spam_emails)

    return (false_positive_rate, recall)


**Forward Pass Calculation**

This performs the forward pass by:

1. Multiplying the input by the first weight matrix. The bias is added. This is $z^2$.
2. Calculating $a^2 = \sigma\left(z^2\right)$.
3. Next, calculating $z^3$ by multiplying $a^2$ by the next weight matrix and adding the bias.
4. Calculating $a^3$ by applying the logistic function to $z^3$.

In [13]:
# Code to perform a forward pass
from collections import namedtuple

# Every intermediate quantity of a forward pass; the backward pass needs them.
ForwardResult = namedtuple('ForwardResult', ['x', 'z2', 'a2', 'z3', 'a3'])

def forward(email):
    """Run one email through the network and return all intermediate values.

    x is the email's code vector, z2/a2 are the hidden layer before and after
    the logistic function, and z3/a3 are the same for the output layer.
    """
    inputs = email.codes

    hidden_pre = THETA1.dot(inputs) + B1
    hidden_act = logistic(hidden_pre)

    output_pre = THETA2.dot(hidden_act) + B2
    output_act = logistic(output_pre)

    return ForwardResult(inputs, hidden_pre, hidden_act, output_pre, output_act)


**Backward Pass Calculation**

This pass calculates all the partial derivatives we will need. I must first perform a forward pass to calculate needed quantities.

1. First I calculate $\fpartial{E}{a^3}$, which just involves the derivative of the cross-entropy function, since $a^3$ is a direct input into the error function.
2. To calculate $\fpartial{E}{z^3}$, I use $\fpartial{E}{z^3} = \fpartial{E}{a^3}\fpartial{a^3}{z^3}$. Since I know $a^3 = \sigma\left(z^3\right)$, I know $\fpartial{a^3}{z^3} = \sigma\left(z^3\right)\left(1 - \sigma\left(z^3\right)\right)$.
3. To calculate $\fpartial{E}{\Theta^2_{1, j}}$, I know that this equals $\fpartial{E}{z^3}\fpartial{z^3}{\Theta^2_{1, j}}$. I know $z_1^3 = b^2 + \sum_j \Theta^2_{1, j} a_j^2$. Therefore $\fpartial{z^3}{\Theta^2_{1, j}} = a_j^2$.
    * Since for every $j$ we have $\fpartial{E}{\Theta^2_{1, j}} = \fpartial{E}{z^3}a_j^2$, we may vectorize this operation.
    * Thus, I have written $\fpartial{E}{\Theta^2} = \fpartial{E}{z^3} a^2$.
    * Actually, I wrote this in terms of the outer product, but since `dz3.shape = (1,)`, this is the same.
    * The outer product will be useful in the next level of backpropagation.
4. To calculate $\fpartial{E}{b^2}$, this is $\fpartial{E}{z^3}\fpartial{z^3}{b^2}$. Since $z_1^3 = b^2 + \sum_j \Theta^2_{1, j} a_j^2$, we know $\fpartial{z^3}{b^2} = 1$.
5. To calculate $\fpartial{E}{a_j^2}$, we break it into $\fpartial{E}{z^3}\fpartial{z^3}{a_j^2}$. We note that $z^3 = b^2 + \sum_j \Theta^2_{1, j}a_j^2$. Thus $\fpartial{z^3}{a_j^2} = \Theta^2_{1, j}$.
    * Since $\fpartial{E}{a_j^2} = \fpartial{E}{z^3} \Theta^2_{1, j}$, we vectorize this operation.
    * I wrote this as $\Theta^{2\intercal} \fpartial{E}{z^3}$. This would have been useful if there were more than one unit in $z^3$.
6. To calculate $\fpartial{E}{z^2}$, I just multiply $\fpartial{E}{a^2}$ by the derivative of the logistic function at $z^2$. This is the same as above.
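7. To calculate $\fpartial{E}{\Theta^1_{i, j}}$, I use $\fpartial{E}{\Theta^1_{i, j}} = \fpartial{E}{z_i^2}\fpartial{z_i^2}{\Theta^1_{i, j}}$. Since $z_i^2 = b_i^1 + \sum_j \Theta^1_{i, j} x_j$, I know $\fpartial{z_i^2}{\Theta^1_{i, j}} = x_j$. Vectorized over all $i$ and $j$, $\fpartial{E}{\Theta^1}$ is the outer product of $\fpartial{E}{z^2}$ and $x$.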
8. To calculate $\fpartial{E}{b^1}$, I use $\fpartial{E}{b^1} = \fpartial{E}{z^2} \circ \fpartial{z^2}{b^1}$. The first factor is a vector. The second factor is also a vector: of all ones. Why is $\fpartial{z_i^2}{b_i^1}$ always equal to 1.0? For the same reason I explained for $b^2$ above.

In [14]:
# Code to perform a backward pass
from collections import namedtuple

# Partial derivatives of the error with respect to every quantity of a pass.
BackwardResult = namedtuple(
    'BackwardResult',
    ['da3', 'dz3', 'd_theta2', 'db2', 'da2', 'dz2', 'd_theta1', 'db1'],
)

def backward(forward_result, label):
    """Backpropagate the error of one example.

    Takes the ForwardResult of the example and its true label, and returns the
    partial derivatives of the cross-entropy error, output layer first.
    """
    fr = forward_result

    # Output layer.
    grad_a3 = dxe_error(fr.a3, label)
    grad_z3 = dlogistic(fr.z3) * grad_a3

    # Hidden layer: push the output gradient back through THETA2.
    grad_a2 = THETA2.T.dot(grad_z3)
    grad_z2 = dlogistic(fr.z2) * grad_a2

    return BackwardResult(
        da3=grad_a3,
        dz3=grad_z3,
        d_theta2=np.outer(grad_z3, fr.a2),
        # The bias gradients equal the gradients of their pre-activations.
        db2=grad_z3,
        da2=grad_a2,
        dz2=grad_z2,
        d_theta1=np.outer(grad_z2, fr.x),
        db1=grad_z2,
    )


**Accumulate Partial Derivatives**

The `backward` function just works on a single example. To do gradient descent, we need to calculate the total partial derivative over a batch of examples. Luckily, that just involves summing up the partial derivatives calculated for each example.

In [15]:
# Code to collect the sum of partial values across a batch of examples
from collections import namedtuple

# Parameter gradients summed over a batch of examples.
PartialsResult = namedtuple(
    'PartialsResult', ['d_theta1', 'db1', 'd_theta2', 'db2']
)

def partials(emails):
    """Return the per-parameter gradients summed over all `emails`."""
    totals = PartialsResult(
        d_theta1=np.zeros_like(THETA1),
        db1=np.zeros_like(B1),
        d_theta2=np.zeros_like(THETA2),
        db2=np.zeros_like(B2),
    )

    for email in emails:
        grads = backward(forward(email), email.label)
        for field in PartialsResult._fields:
            # In-place add into the accumulator array held by `totals`.
            accumulator = getattr(totals, field)
            accumulator += getattr(grads, field)

    return totals


**Training!**

Finally we can train the model! We'll proceed in batches of 128 emails. We'll run for 10 *epochs*. An *epoch* is a pass through the entire dataset.

Each pass is made up of many batches. We make an update after each batch. This lets us make more small updates per epoch, which leads to quicker convergence.

The downside is that the partial derivative calculated over a batch isn't exactly the same as the partial derivative calculated over the entire dataset. The batch partial derivative is a *noisy estimate* of the true partial derivative over the entire dataset.

Still, making many slightly inaccurate steps is in practice better than making very few accurate updates. Since it takes a long time to run forward and backward passes on every email, if you only update once per epoch, it will take a lot more time to make as many updates as you would if you ran many batches per epoch.

In this case, perfection is the enemy of the good.

In [16]:
def train(training_dataset, test_dataset, epochs, learning_rate):
    """Train the global parameters with mini-batch gradient descent.

    After every epoch, prints the epoch number, the average cross-entropy on
    `test_dataset`, and the recall with its false positive rate.
    """
    global THETA1, B1, THETA2, B2

    # NOTE(review): the batches are drawn once, so every epoch visits the
    # same batches in the same order.
    batches = DatasetBatcher.batch(training_dataset, BATCH_SIZE)

    for epoch_idx in range(1, epochs + 1):
        for batch in batches:
            grads = partials(batch)
            # NOTE(review): the summed batch gradient is scaled by the size
            # of the whole training set, not by the batch size.
            step = learning_rate / len(training_dataset)
            THETA1 -= step * grads.d_theta1
            B1 -= step * grads.db1
            THETA2 -= step * grads.d_theta2
            B2 -= step * grads.db2

        avg_xe = dataset_avg_xe_error(test_dataset)
        fpr, recall = dataset_recall(test_dataset)

        print(
            f"{epoch_idx}: {avg_xe:0.2f} {recall:0.2f}@fpr={fpr:0.4f}"
        )

# NB: 10.0 is a weirdly high learning rate...
# NOTE(review): probably needed because train() divides every step by the
# size of the whole training set rather than by the batch size, which makes
# each update much smaller than the nominal rate suggests.
train(training_dataset, test_dataset, epochs = 10, learning_rate = 10.00)

1: 0.41 0.40@fpr=0.0097
2: 0.27 0.59@fpr=0.0097
3: 0.21 0.69@fpr=0.0097
4: 0.17 0.74@fpr=0.0097
5: 0.15 0.75@fpr=0.0097
6: 0.14 0.81@fpr=0.0097
7: 0.12 0.82@fpr=0.0097
8: 0.12 0.82@fpr=0.0097
9: 0.11 0.82@fpr=0.0097
10: 0.10 0.85@fpr=0.0097


In [17]:
import keras.backend as K
import keras.callbacks
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
import random

def _emails_to_arrays(emails):
    """Return (X, y): one row of codes per email in X, its label in y."""
    X = np.zeros(shape = (len(emails), VOCAB_SIZE))
    y = np.zeros(shape = len(emails))
    for row, encoded_email in enumerate(emails):
        X[row, :] = encoded_email.codes
        y[row] = encoded_email.label
    return X, y

# Same deterministic 80/20 split as before; the second part is used for
# validation here.
training_dataset, validation_dataset = DatasetSplitter.split(d, 0.80)

training_emails = training_dataset.encoded_ham_emails + training_dataset.encoded_spam_emails
random.shuffle(training_emails)
training_X, training_y = _emails_to_arrays(training_emails)

validation_emails = validation_dataset.encoded_ham_emails + validation_dataset.encoded_spam_emails
random.shuffle(validation_emails)
validation_X, validation_y = _emails_to_arrays(validation_emails)

# One sigmoid hidden layer of 128 units on the VOCAB_SIZE-long input,
# followed by a single sigmoid output unit.
hidden_layer = Dense(128, activation = 'sigmoid', input_shape = (VOCAB_SIZE,))
output_layer = Dense(1, activation = 'sigmoid')
model = Sequential([hidden_layer, output_layer])

# Target false positive rate at which recall is reported.
FPR_RATE = 0.01
class RecallRate(keras.callbacks.Callback):
    """Keras callback that prints the validation FPR and recall after each epoch.

    The score cutoff is the ham score found at position FPR_RATE * (number of
    ham examples) when ham scores are ordered from highest to lowest.
    """

    def on_epoch_end(self, epoch, logs=None):
        # NOTE(review): relies on the callback exposing `validation_data` as
        # (inputs, targets, ...); confirm for the installed Keras version.
        # Flatten both arrays so that masking and sorting behave the same
        # whether they arrive with shape (N,) or (N, 1).
        scores = np.ravel(self.model.predict(self.validation_data[0]))
        # The builtin `bool` replaces the `np.bool` alias, which newer NumPy
        # releases no longer provide.
        is_spam = np.ravel(self.validation_data[1]).astype(bool)
        is_ham = np.logical_not(is_spam)

        ham_scores = np.sort(scores[is_ham])[::-1]
        cutoff = ham_scores[int(FPR_RATE * len(ham_scores))]

        num_false_positives = np.sum(scores[is_ham] > cutoff)
        num_true_positives = np.sum(scores[is_spam] > cutoff)
        fpr = num_false_positives / np.sum(is_ham)
        recall = num_true_positives / np.sum(is_spam)

        print(f">> FPR: {fpr:0.4f}")
        print(f">> Recall: {recall:0.2f}")

# Minimise binary cross-entropy with plain SGD at a learning rate of 1.0,
# and additionally report accuracy.
model.compile(
    loss = 'binary_crossentropy',
    optimizer = SGD(lr = 1.0),
    metrics = ['accuracy'],
)

# Train for 10 epochs in batches of 128. The RecallRate callback prints the
# validation FPR and recall at the end of every epoch.
model.fit(
    training_X,
    training_y,
    epochs = 10,
    batch_size = 128,
    validation_data = (validation_X, validation_y),
    callbacks = [RecallRate()],
    verbose = 2
)

Using TensorFlow backend.


Train on 4181 samples, validate on 991 samples
Epoch 1/10
>> FPR: 0.0097
>> Recall: 0.37
 - 0s - loss: 1.1924 - acc: 0.6924 - val_loss: 0.6294 - val_acc: 0.7366
Epoch 2/10
>> FPR: 0.0097
>> Recall: 0.74
 - 0s - loss: 0.1897 - acc: 0.9294 - val_loss: 0.1347 - val_acc: 0.9586
Epoch 3/10
>> FPR: 0.0097
>> Recall: 0.85
 - 0s - loss: 0.1270 - acc: 0.9586 - val_loss: 0.1069 - val_acc: 0.9687
Epoch 4/10
>> FPR: 0.0097
>> Recall: 0.87
 - 0s - loss: 0.0976 - acc: 0.9687 - val_loss: 0.0926 - val_acc: 0.9657
Epoch 5/10
>> FPR: 0.0097
>> Recall: 0.90
 - 0s - loss: 0.0837 - acc: 0.9725 - val_loss: 0.0808 - val_acc: 0.9697
Epoch 6/10
>> FPR: 0.0097
>> Recall: 0.93
 - 0s - loss: 0.0721 - acc: 0.9766 - val_loss: 0.0736 - val_acc: 0.9707
Epoch 7/10
>> FPR: 0.0097
>> Recall: 0.91
 - 0s - loss: 0.0661 - acc: 0.9792 - val_loss: 0.0729 - val_acc: 0.9748
Epoch 8/10
>> FPR: 0.0097
>> Recall: 0.94
 - 0s - loss: 0.0593 - acc: 0.9809 - val_loss: 0.0637 - val_acc: 0.9788
Epoch 9/10
>> FPR: 0.0097
>> Recall: 0.94

<keras.callbacks.History at 0x11d718198>