In [1]:
import pandas as pd
import numpy as np
import math

# Preprocessing for the Email Classifier

Before constructing the Naive Bayes Classifier, it is convenient to make some transformations to the training and test sets so that they are easier to work with. We want to have each set use a bag of words representation for each message body based on the most important words. To find the most important words, we use information gain.

## Information Gain

Information gain tells us how important a given feature is to a dataset. To understand information gain, we must first understand entropy. Entropy is defined as

$$
E = \sum_i{-p_i \cdot \text{log}_2(p_i)},
$$

where $p_i$ is the probability of an observation having class $i.$ Entropy basically tells us how mixed the data is for a certain feature. It varies from $0$ to $1$, where $0$ represents a feature where all the data belongs to one class and $1$ represents a feature where the data is evenly split between classes.

The information gain of the feature is a measure of how much entropy is explained by that feature if we were to split the data based on that feature. It is defined as 

$$
IG = E(f) - \sum_{i=1}^d{p(f=i) \cdot E(f=i)},
$$

where $f$ is the feature in question and $\{0, 1, \dots, d\}$ are the different classes that the feature can take. $E(f=i)$ gives the entropy of a particular split of the feature. The information gain will be high when the entropy of the entire feature is high but the entropy of its splits are low. This means that splitting a feature with a high information gain will explain a lot of the entropy in the data.

## Bag of Words

Before we can do anything, we must transform the body of each message into a bag of words representation where `body[word]` gives the count of how many times that word appeared in the message. Initially, we consider every word found within the dataset. Punctuation in each message is replaced with whitespace and then the message is tokenized and whitespace is stripped. 

In [2]:
train_data = pd.read_hdf('data/enron/spam_filter_train.h5')
test_data = pd.read_hdf('data/enron/spam_filter_test.h5')

In [3]:
together = pd.concat([train_data, test_data])

together['BODY'] = together['BODY'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()
together['BODY'] = together['BODY'].str.lower().str.ljust(1).str.rjust(1)
vocabulary = list(set(together['BODY'].str.split().sum()))

In [4]:
word_counts_per_message = pd.DataFrame(0, index=together.index, columns=vocabulary)
word_counts_per_message = word_counts_per_message.apply(lambda series: together['BODY'].str.count(r' %s ' % series.name))

together = pd.concat([together, word_counts_per_message], axis=1)

train_data = together.iloc[train_data.index]
test_data = together.iloc[test_data.index]

train_data

Unnamed: 0,BODY,SPAM,soiicitation,911,guaranties,gpb,cecropia,9793,hackney,antre,...,tremble,endless,estoppel,exports,diphthong,iwdgqreo,marconi,beverly,tagg,431705
0,subject escalation procedures gas logistics de...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,subject,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,subject king ranch i just spoke briefly with b...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,subject fw intrastate gas stacy could we add t...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,subject soaring microcap moving quickly stock ...,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4650,subject ena 202 and hpl 216 transport contract...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4651,subject enron hpl actuals for sept 13 2000 tec...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4652,subject cornhusker for sept i was informed by ...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4653,subject day 26 txu lonestar called on 20000 at...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Most Important Words

We find the information gain for every word in the set. We then sort the words by their information gain and take the first 500. In addition to this, we do not even consider words that appear in less than 5 emails.

In [5]:
def information_gain(data):
    np.seterr('ignore')
    
    words_present = train_data.drop(['BODY', 'SPAM'], axis=1) > 0
    words_present = words_present.drop(words_present.columns[words_present.apply(lambda col: np.sum(col) < 5)], axis=1)
    
    proportion_spam = words_present.iloc[data['SPAM'].array].sum() / len(data.index)
    proportion_ham = 1. - proportion_spam
    
    whole_entropy = -1 * (proportion_spam * np.log2(proportion_spam) + proportion_ham * np.log2(proportion_ham))
    whole_entropy = whole_entropy.replace([-np.inf, np.nan], 0.)
    
    entropy_word, entropy_not_word = entropy(words_present, data['SPAM'].array)

    proportion_word = words_present.sum() / len(data.index)
    proportion_not_word = 1. - proportion_word
    
    np.seterr('warn')
    
    return whole_entropy - (proportion_word * entropy_word + proportion_not_word * entropy_not_word)

def entropy(words_present, spam_mask):
    words_count = words_present.sum()
    words_count_complement = 1 - words_count
    
    proportion_spam_word = np.logical_and(words_present, np.transpose([spam_mask])).sum() / words_count
    proportion_ham_word = np.logical_and(words_present, np.logical_not(np.transpose([spam_mask]))).sum() / words_count
    
    entropy_word = -1 * (proportion_spam_word * np.log2(proportion_spam_word) + proportion_ham_word * np.log2(proportion_ham_word))
    entropy_word = entropy_word.replace([-np.inf, np.nan], 0.)
    
    proportion_spam_not_word = np.logical_and(np.logical_not(words_present), np.transpose([spam_mask])).sum() / words_count_complement
    proportion_ham_not_word = np.logical_and(np.logical_not(words_present), np.logical_not(np.transpose([spam_mask]))).sum() / words_count_complement
    
    entropy_not_word = -1 * (proportion_spam_not_word * np.log2(proportion_spam_not_word) + proportion_ham_not_word * np.log2(proportion_ham_not_word))
    entropy_not_word = entropy_not_word.replace([-np.inf, np.nan], 0.)
    
    return entropy_word, entropy_not_word

In [13]:
information_gain_table = information_gain(train_data)

most_important_words = information_gain_table.sort_values(ascending=False)[:1000].index

train_data_important = train_data[np.append(['SPAM'], most_important_words)]
test_data_important = test_data[np.append(['SPAM'], most_important_words)]

train_data_important

Unnamed: 0,SPAM,http,your,more,here,no,our,all,www,com,...,reform,pr,illustrator,otcbb,assurance,opinions,macromedia,revenues,tax,value
0,False,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,True,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,False,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,False,0,2,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,True,0,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4650,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4651,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4652,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4653,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We save these dataframes for later use.

In [14]:
train_data_important.to_hdf('data/enron/spam_filter_train_preprocessed.h5', key='spam_filter_train_preprocessed', mode='w')
test_data_important.to_hdf('data/enron/spam_filter_test_preprocessed.h5', key='spam_filter_test_preprocessed', mode='w')