In [1]:
import os
import os.path

# Directory, resolved against the current working directory, under which the
# rest of the notebook reads and writes its data files.
DATA_DIR = os.path.join(os.getcwd(), "data/")


In [2]:
import os

class Email:
    """A labelled email stored on disk; essentially a lazy reader of its text.

    Attributes:
        path: Location of the email file, relative to ``DATA_DIR``.
        label: The class label attached to this email by the caller.
    """

    def __init__(self, path, label):
        self.path, self.label = path, label

    def text_content(self):
        """Read and return the email's full text from disk."""
        reader = type(self).read_text_content
        return reader(self.path)

    def word_counts(self):
        """Return a dict mapping each whitespace-separated token to its count."""
        tally = {}
        for token in self.text_content().split():
            tally[token] = tally.get(token, 0) + 1
        return tally

    @classmethod
    def read(cls, path, label):
        """Build an ``Email`` for ``path``; always a plain ``Email``, never ``cls``."""
        return Email(path=path, label=label)

    @classmethod
    def read_text_content(cls, path):
        """Return the text of the file at ``DATA_DIR/path``, decoded as Latin-1."""
        location = os.path.join(DATA_DIR, path)
        # The emails are Latin-1 encoded, not UTF-8, so the encoding has to
        # be spelled out when opening the file.
        with open(location, mode="r", encoding="iso-8859-1") as handle:
            try:
                contents = handle.read()
            except BaseException:
                # Name the offending file, then let the error propagate.
                print(f"Error with: {path}")
                raise
            return contents


In [3]:
import numpy as np
from sortedcontainers import SortedSet

def build_word_reaches(ham_emails, spam_emails):
    """Count, for every word, the number of emails it appears in (its "reach").

    Args:
        ham_emails: Iterable of objects exposing ``word_counts()``.
        spam_emails: Iterable of objects exposing ``word_counts()``.

    Returns:
        A dict mapping word -> number of emails (ham and spam combined)
        containing that word at least once.
    """
    reaches = {}
    for group in (ham_emails, spam_emails):
        for email in group:
            # Walking the keys of the per-email counts means a word adds
            # exactly one to its reach per email, however often it repeats.
            for word in email.word_counts().keys():
                reaches[word] = reaches.get(word, 0) + 1

    return reaches

def filter_words(word_reaches, limit = 100):
    """Keep only the words that occur in at least ``limit`` emails.

    Dropping low-reach words means:
      (1) less chance of overfitting, and
      (2) smaller feature vectors: faster, less memory use.

    Args:
        word_reaches: Dict mapping word -> number of emails containing it.
        limit: Minimum reach (inclusive) a word needs to be kept.

    Returns:
        A ``SortedSet`` of the surviving words.
    """
    return SortedSet(
        word
        for word, reach in word_reaches.items()
        if reach >= limit
    )

class FilteredWordEncodingDictionary:
    """Bidirectional word <-> integer-code map, limited to the filtered words.

    Codes are assigned consecutively from 0 in the iteration order of the
    ``filtered_words`` passed to the constructor.
    """

    def __init__(self, filtered_words):
        self.word_to_code_dict = {}
        self.code_to_word_dict = {}

        for word in filtered_words:
            self.insert_word(word)

    def insert_word(self, word):
        """Assign the next free code to ``word``; a no-op if already present.

        Only meant to be called while constructing the dictionary.
        """
        if word not in self.word_to_code_dict:
            code = len(self.word_to_code_dict)
            self.word_to_code_dict[word] = code
            self.code_to_word_dict[code] = word

    def word_to_code(self, word):
        """Return the code for ``word``, or ``None`` if the word was filtered out."""
        return self.word_to_code_dict.get(word)

    def code_to_word(self, code):
        """Return the word for ``code``.

        Raises:
            KeyError: If ``code`` was never assigned.
        """
        if code not in self.code_to_word_dict:
            # Raising a bare string is itself a TypeError in Python 3, so
            # raise a real exception that carries the message.
            raise KeyError(f"Code {code} not recorded!")

        return self.code_to_word_dict[code]

    def encode_text(self, text):
        """Encode ``text`` as a vector of ones and zeros.

        Returns:
            A float numpy array with one slot per known word; slot ``i`` is
            1.0 if the word with code ``i`` occurs among the
            whitespace-separated tokens of ``text``, else 0.0.
        """
        codes = np.zeros(len(self.code_to_word_dict))

        for word in text.split():
            code = self.word_to_code(word)
            if code is not None:
                codes[code] = 1.0

        return codes

    def __len__(self):
        """Number of words (equivalently, codes) in the dictionary."""
        return len(self.code_to_word_dict)

In [4]:
class EncodedEmail(Email):
    """An ``Email`` that also stores its encoded feature vector.

    Attributes:
        codes: Result of ``word_encoding_dictionary.encode_text`` applied to
            this email's text, computed once at construction time.
    """

    def __init__(self, path, label, word_encoding_dictionary):
        super().__init__(path, label)

        encode = word_encoding_dictionary.encode_text
        self.codes = encode(self.text_content())


In [5]:
import os.path
import pickle

class Dataset:
    """Encoded ham and spam emails plus the word dictionary that encoded them.

    Attributes:
        word_encoding_dictionary: The dictionary used to encode the emails.
        encoded_ham_emails: List of encoded ham emails.
        encoded_spam_emails: List of encoded spam emails.
    """

    # Where the pickled dataset lives on disk.
    DATA_FILE_PATH = os.path.join(DATA_DIR, 'lr_data.p')
    # Minimum number of emails a word must occur in to become a feature.
    WORD_REACH_LIMIT = 100
    # Process-wide cache filled lazily by ``get()``.
    INSTANCE = None

    def __init__(
            self, word_encoding_dictionary, encoded_ham_emails, encoded_spam_emails
    ):
        self.word_encoding_dictionary = word_encoding_dictionary
        self.encoded_ham_emails = encoded_ham_emails
        self.encoded_spam_emails = encoded_spam_emails

    def __len__(self):
        """Total number of emails (ham plus spam)."""
        return len(self.encoded_ham_emails) + len(self.encoded_spam_emails)

    @classmethod
    def encode(cls, ham_emails, spam_emails):
        """Build a dataset from raw emails.

        Counts word reaches over all emails, keeps the words that reach at
        least ``WORD_REACH_LIMIT`` emails, assigns each a code, and encodes
        every email as a vector of ones and zeros.
        """
        # Count words, select which we will keep.
        word_reaches = build_word_reaches(ham_emails, spam_emails)
        filtered_words = filter_words(word_reaches, limit = cls.WORD_REACH_LIMIT)

        # Assign codes to all words.
        word_encoding_dictionary = FilteredWordEncodingDictionary(filtered_words)

        def encode_all(emails):
            # Encode each email as a vector of ones and zeros.
            return [
                EncodedEmail(e.path, e.label, word_encoding_dictionary)
                for e in emails
            ]

        # Construct the object!
        return cls(
            word_encoding_dictionary,
            encode_all(ham_emails),
            encode_all(spam_emails)
        )

    @classmethod
    def get(cls):
        """Return the cached dataset, unpickling ``DATA_FILE_PATH`` on first use."""
        # Compare against None explicitly: this class defines __len__, so an
        # empty dataset is falsy and a truthiness check would re-read the
        # file on every call.
        if cls.INSTANCE is None:
            # NOTE: pickle.load can execute arbitrary code; only load a file
            # that this notebook wrote itself.
            with open(cls.DATA_FILE_PATH, 'rb') as f:
                cls.INSTANCE = pickle.load(f)
        return cls.INSTANCE


In [6]:
import os
import os.path
import pickle
from urllib.request import urlretrieve

# Download URL for the preprocessed "enron1" archive.
ENRON_SPAM_URL = "".join([
    "http://csmining.org/index.php/enron-spam-datasets.html",
    "?file=tl_files/Project_Datasets/Enron-Spam%20datasets/Preprocessed",
    "/enron1.tar.tar",
])

# File name the archive is saved under inside DATA_DIR.
TAR_FILE_NAME = "enron1.tar.tar"
# Name of the directory inside DATA_DIR that holds the extracted emails.
ENRON_DATA_DIR_NAME = "enron1"

def download_tarfile():
    """Download the Enron archive into ``DATA_DIR`` unless it is already there.

    The archive is fetched to a temporary ``.part`` file and renamed into
    place only after the download finishes, so an interrupted download is
    never mistaken for a complete one on the next run.

    Raises:
        Whatever ``urlretrieve`` raises if the download fails.
    """
    tarfile_path = os.path.join(DATA_DIR, TAR_FILE_NAME)
    if os.path.isfile(tarfile_path):
        print("Tarfile already downloaded!")
        return

    # On a fresh checkout the data directory may not exist yet.
    os.makedirs(DATA_DIR, exist_ok=True)

    partial_path = tarfile_path + ".part"
    print("Downloading enron1.tar.tar")
    try:
        urlretrieve(ENRON_SPAM_URL, partial_path)
        os.replace(partial_path, tarfile_path)
    finally:
        # After a successful rename the partial file is gone; after a
        # failure, remove whatever was written so far.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download complete!")

def extract_tarfile():
    """Extract the downloaded archive into ``DATA_DIR`` unless already done.

    Extraction is skipped when the ``ENRON_DATA_DIR_NAME`` directory already
    exists under ``DATA_DIR``.

    Raises:
        subprocess.CalledProcessError: If ``tar`` exits with a non-zero status.
        FileNotFoundError: If the ``tar`` executable cannot be found.
    """
    # Imported here so the function carries its own dependency.
    import subprocess

    tarfile_path = os.path.join(DATA_DIR, TAR_FILE_NAME)
    enron_data_dir = os.path.join(DATA_DIR, ENRON_DATA_DIR_NAME)
    if os.path.isdir(enron_data_dir):
        print("Tarfile already extracted!")
        return

    print("Extracting enron1.tar.tar")
    # Pass an argument list rather than a shell string so paths with spaces
    # or shell metacharacters are handled correctly. check=True makes a
    # failed extraction raise instead of being reported as complete.
    subprocess.run(
        ["tar", "-xf", tarfile_path, "-C", DATA_DIR],
        check=True,
    )
    print("Extraction complete!")

def read_emails_dir(path, label):
    """Create one ``Email`` per entry of the directory ``DATA_DIR/path``.

    Args:
        path: Directory to list, relative to ``DATA_DIR``.
        label: Label attached to every email created.

    Returns:
        A list of ``Email`` objects, in the order ``os.listdir`` returns the
        entries, whose paths are relative to ``DATA_DIR``.
    """
    directory = os.path.join(DATA_DIR, path)
    return [
        Email.read(path = os.path.join(path, file_name), label = label)
        for file_name in os.listdir(directory)
    ]

def build_dataset():
    """Read the extracted ham and spam emails and encode them as a ``Dataset``.

    Ham emails get label 0 and spam emails get label 1.
    """
    ham_dir = os.path.join(ENRON_DATA_DIR_NAME, "ham")
    spam_dir = os.path.join(ENRON_DATA_DIR_NAME, "spam")

    ham_emails = read_emails_dir(path = ham_dir, label = 0)
    spam_emails = read_emails_dir(path = spam_dir, label = 1)

    return Dataset.encode(ham_emails = ham_emails, spam_emails = spam_emails)

def save_dataset(dataset):
    """Pickle ``dataset`` to ``Dataset.DATA_FILE_PATH``, replacing any existing file."""
    target_path = Dataset.DATA_FILE_PATH
    with open(target_path, mode = "wb") as pickle_file:
        pickle.dump(dataset, pickle_file)

def build_and_save_dataset():
    """Build and pickle the dataset, unless the pickle file already exists."""
    if not os.path.isfile(Dataset.DATA_FILE_PATH):
        print("Reading and processing emails!")
        save_dataset(build_dataset())
        print("Dataset created!")
    else:
        print("Dataset already processed!")

# Run the data-preparation stages in order: download, extract, then encode.
for pipeline_step in (download_tarfile, extract_tarfile, build_and_save_dataset):
    pipeline_step()


Tarfile already downloaded!
Tarfile already extracted!
Dataset already processed!


In [7]:
def logistic(z):
    """Numerically stable logistic (sigmoid) function, ``1 / (1 + exp(-z))``.

    Args:
        z: A scalar or array-like of real numbers.

    Returns:
        The element-wise sigmoid: a numpy scalar for scalar input, otherwise
        an array shaped like ``z``.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in [0, 1], so it cannot overflow. The naive
    # exp(-z) overflows once z is very negative.
    decay = np.exp(-np.abs(z))
    # For z >= 0: 1 / (1 + e^-z). For z < 0 the same value written as
    # e^z / (1 + e^z). Both branches are safe to evaluate for every z.
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and leaves real
    # arrays as they are.
    return result[()]

class LogisticRegressionModel:
    """Logistic-regression classifier over encoded word vectors.

    Attributes:
        theta: Weight vector with one entry per word in the encoding
            dictionary, initialised to zeros.
    """

    def __init__(self, word_encoding_dictionary):
        self.theta = np.zeros(
            len(word_encoding_dictionary)
        )

    def prob(self, codes):
        """Return the logistic of ``theta . codes``."""
        return logistic(self.theta.dot(codes))

    def error(self, email):
        """Return the log-loss of the model on ``email``.

        Equals ``-log(1 - prob)`` when ``email.label == 0`` and ``-log(prob)``
        otherwise. It is computed from the raw score with ``logaddexp`` so it
        stays finite when the sigmoid saturates to exactly 0 or 1.
        """
        score = self.theta.dot(email.codes)
        if email.label == 0:
            # -log(1 - sigmoid(s)) == log(1 + e^s)
            return np.logaddexp(0.0, score)
        # -log(sigmoid(s)) == log(1 + e^-s)
        return np.logaddexp(0.0, -score)

    def partial_derivatives(self, email):
        """Return the gradient of ``error(email)`` with respect to ``theta``.

        This is the vectorized form ``codes * (prob - target)``, where the
        target is 0 for label 0 and 1 for any other label.
        """
        target = 0.0 if email.label == 0 else 1.0
        return email.codes * (self.prob(email.codes) - target)


In [8]:
import zlib

class DatasetSplitter:
    """Deterministically partitions a ``Dataset`` into two datasets."""

    @classmethod
    def split(cls, dataset, ratio):
        """Return ``(datasetA, datasetB)``.

        A holds the emails whose hash fraction is below ``ratio``; B holds
        the emails whose hash fraction is at or above it.
        """
        return tuple(cls._split(dataset, ratio, mode) for mode in (0, 1))

    @classmethod
    def _split(cls, dataset, ratio, mode):
        """Build one side of the split: ``mode`` 0 for below ``ratio``, 1 for the rest."""

        def is_selected(email):
            # A pseudorandom but deterministic way to pick emails: the CRC32
            # of the path always yields the same fraction, so the same
            # emails are chosen on every run, for reproducibility.
            fraction = zlib.crc32(email.path.encode()) / (2**32 - 1)
            if mode == 0:
                return fraction < ratio
            if mode == 1:
                return fraction >= ratio
            return False

        chosen_ham = [e for e in dataset.encoded_ham_emails if is_selected(e)]
        chosen_spam = [e for e in dataset.encoded_spam_emails if is_selected(e)]

        return Dataset(
            dataset.word_encoding_dictionary,
            encoded_ham_emails = chosen_ham,
            encoded_spam_emails = chosen_spam
        )


In [9]:
class Trainer:
    """Runs full-batch gradient descent for a ``LogisticRegressionModel``.

    Attributes:
        training_dataset: Dataset the gradient is computed on.
        test_dataset: Dataset used only to report error.
        learning_rate: Step size applied to the summed gradient.
        model: The model being trained.
    """

    def __init__(self, training_dataset, test_dataset, learning_rate):
        self.training_dataset = training_dataset
        self.test_dataset = test_dataset
        self.learning_rate = learning_rate
        self.model = LogisticRegressionModel(training_dataset.word_encoding_dictionary)

    @staticmethod
    def _all_emails(dataset):
        """Yield every email of ``dataset``: ham first, then spam."""
        yield from dataset.encoded_ham_emails
        yield from dataset.encoded_spam_emails

    def error(self, dataset):
        """Return the model's mean per-email error over ``dataset``."""
        total = 0.0
        for email in self._all_emails(dataset):
            total += self.model.error(email)

        # Divide by the size so larger datasets don't look worse merely
        # because they contain more emails.
        return total / len(dataset)

    def partial_derivatives(self):
        """Return the per-email gradients summed over the training dataset."""
        gradient = np.zeros(len(self.training_dataset.word_encoding_dictionary))
        for email in self._all_emails(self.training_dataset):
            gradient += self.model.partial_derivatives(email)

        return gradient

    def train_step(self):
        """Take one gradient-descent step, then print train and test error."""
        step = self.learning_rate * self.partial_derivatives()
        self.model.theta -= step

        train_error = self.error(self.training_dataset)
        print(f"Train Error: {train_error:0.2f}")
        test_error = self.error(self.test_dataset)
        print(f"Test Error: {test_error:0.2f}")
    

In [10]:
# Load the cached dataset and split it: emails whose hash fraction is below
# 0.80 go to the training set, the rest to the test set.
DATASET = Dataset.get()
training_set, test_set = DatasetSplitter.split(DATASET, 0.80)

trainer = Trainer(training_set, test_set, learning_rate = 0.001)

# Run 100 gradient-descent steps; each step prints the train and test error.
for _ in range(100):
    trainer.train_step()


Train Error: 3.02
Test Error: 2.63
Train Error: 0.66
Test Error: 0.67
Train Error: 2.37
Test Error: 2.06
Train Error: 0.71
Test Error: 0.70
Train Error: 1.40
Test Error: 1.23
Train Error: 0.95
Test Error: 0.96
Train Error: 1.15
Test Error: 1.00
Train Error: 0.64
Test Error: 0.63
Train Error: 0.30
Test Error: 0.27
Train Error: 0.21
Test Error: 0.21
Train Error: 0.15
Test Error: 0.14
Train Error: 0.13
Test Error: 0.13
Train Error: 0.12
Test Error: 0.12
Train Error: 0.12
Test Error: 0.12
Train Error: 0.11
Test Error: 0.11
Train Error: 0.11
Test Error: 0.11
Train Error: 0.10
Test Error: 0.11
Train Error: 0.10
Test Error: 0.11
Train Error: 0.10
Test Error: 0.10
Train Error: 0.10
Test Error: 0.10
Train Error: 0.09
Test Error: 0.10
Train Error: 0.09
Test Error: 0.10
Train Error: 0.09
Test Error: 0.10
Train Error: 0.09
Test Error: 0.10
Train Error: 0.09
Test Error: 0.09
Train Error: 0.08
Test Error: 0.09
Train Error: 0.08
Test Error: 0.09
Train Error: 0.08
Test Error: 0.09
Train Error: 0.08
Te

In [13]:
import numpy as np

class RecallResult:
    """Plain record of one recall measurement.

    Attributes:
        score_cutoff: Score a spam email had to exceed to count as identified.
        num_spams_identified: Number of spam emails scoring above the cutoff.
        recall: ``num_spams_identified`` as a fraction of all spam emails.
    """

    def __init__(self, score_cutoff, num_spams_identified, recall):
        self.score_cutoff = score_cutoff
        self.num_spams_identified = num_spams_identified
        self.recall = recall

def recall_for_false_positive_rates(model, dataset, limits):
    """Measure spam recall at each tolerated false-positive rate.

    For every limit, the score cutoff is chosen so that the fraction of ham
    emails scoring strictly above it does not exceed the limit. Recall is
    then the fraction of spam emails scoring strictly above that cutoff.

    Args:
        model: Object exposing ``prob(codes)``.
        dataset: Object exposing ``encoded_ham_emails`` and
            ``encoded_spam_emails``, whose items have a ``codes`` attribute.
        limits: Iterable of false-positive-rate limits, each within [0, 1].

    Returns:
        A list of ``(limit, RecallResult)`` pairs in the order of ``limits``.
        When the dataset has no spam emails, recall is reported as 0.0.

    Raises:
        ValueError: If a limit lies outside [0, 1].
    """
    # Highest ham score first, so index k is the cutoff that lets at most k
    # ham emails score above it.
    ham_scores = sorted(
        (model.prob(email.codes) for email in dataset.encoded_ham_emails),
        reverse = True
    )
    spam_scores = [
        model.prob(email.codes) for email in dataset.encoded_spam_emails
    ]

    def calculate_result(limit):
        if not 0 <= limit <= 1:
            # A negative limit would otherwise wrap round to a negative
            # list index and silently pick the wrong cutoff.
            raise ValueError(
                f"False positive rate limit must be within [0, 1], got {limit}"
            )

        cutoff_index = int(len(ham_scores) * limit)
        if cutoff_index < len(ham_scores):
            score_cutoff = ham_scores[cutoff_index]
        else:
            # Every ham email may be flagged (limit of 1.0, or there is no
            # ham at all), so nothing constrains the cutoff.
            score_cutoff = float("-inf")

        num_spams_identified = sum(
            1 for s in spam_scores if s > score_cutoff
        )
        # Guard against an empty spam set instead of dividing by zero.
        recall = (
            num_spams_identified / len(spam_scores) if spam_scores else 0.0
        )

        return RecallResult(
            score_cutoff = score_cutoff,
            num_spams_identified = num_spams_identified,
            recall = recall,
        )

    return [
        (limit, calculate_result(limit)) for limit in limits
    ]

In [16]:
# False-positive-rate limits at which recall is reported.
FALSE_POSITIVE_RATES = [0.001, 0.01, 0.02, 0.04, 0.08, 0.16]

# Evaluate the trained model on the held-out test set.
results = recall_for_false_positive_rates(
    model = trainer.model,
    dataset = test_set,
    limits = FALSE_POSITIVE_RATES
)

for false_positive_rate, result in results:
    print(f"False Positive Rate {false_positive_rate:0.3f} | Recall {result.recall:0.2f}")

False Positive Rate 0.001 | Recall 0.65
False Positive Rate 0.010 | Recall 0.90
False Positive Rate 0.020 | Recall 0.98
False Positive Rate 0.040 | Recall 0.99
False Positive Rate 0.080 | Recall 1.00
False Positive Rate 0.160 | Recall 1.00
