In [2]:
import requests
import tarfile
import numpy as np
from io import BytesIO
import re
import pandas as pd
from os import path
import os
from pathlib import Path

We will use the first 3 ENRON datasets: the first one for the spam filter, and the other two reserved for training the hamifiers/spamifiers. Each of these two groups is then split so that 10% goes into testing and the rest is used for training.

First, we need to be able to download the datasets from http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/.

In [3]:
url = 'http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed'


def extract_to_numpy(set_path, timeout=60):
    """Download one preprocessed Enron-Spam archive and return its messages.

    Parameters
    ----------
    set_path : str
        Location of the archive relative to ``url``, e.g. ``'/enron1.tar.gz'``.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot hang the
        notebook forever.

    Returns
    -------
    message_bodies : numpy.ndarray of str
        Message texts with every non-ASCII byte removed. All spam messages
        come first, followed by all ham messages.
    message_labels : numpy.ndarray of bool
        ``True`` where the message at the same position is spam.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url + set_path, allow_redirects=True, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to tarfile.
    response.raise_for_status()

    message_bodies = []
    message_labels = []

    # The context manager closes the archive even if reading a member fails.
    with tarfile.open(fileobj=BytesIO(response.content)) as archive:
        members = archive.getmembers()

        # Two passes keep the original ordering: every spam message first,
        # then every ham message.
        for subdirectory, is_spam in (('spam/', True), ('ham/', False)):
            # Matches '<top-level dir>/<subdirectory><file name>'.
            member_pattern = re.compile(r'\w+/' + subdirectory + r'\w+')

            for member in members:
                # extractfile() returns None for anything that is not a
                # regular file, so skip such members up front.
                if not member.isfile() or not member_pattern.match(member.name):
                    continue

                raw_body = archive.extractfile(member).read()

                # Body may contain non ascii characters, which we want to remove.
                message_bodies.append(raw_body.decode('ascii', errors='ignore'))
                message_labels.append(is_spam)

    # NOTE: dtype=str produces a fixed-width unicode array, so its memory
    # footprint is driven by the longest message in the set.
    message_bodies = np.array(message_bodies, dtype=str)
    message_labels = np.array(message_labels, dtype=bool)

    return message_bodies, message_labels

We use the first dataset for the spam filter:

In [3]:
def save_dataframe(dataframe, name, key):
    """Store ``dataframe`` as an HDF5 file under ``data/enron/``.

    Parameters
    ----------
    dataframe : object with a ``to_hdf`` method (e.g. ``pandas.DataFrame``)
        The data to persist.
    name : str
        File name inside ``data/enron/``, e.g. ``'spam_filter_train.h5'``.
    key : str
        Identifier of the group inside the HDF5 file.

    The ``data/enron`` directory is resolved relative to the current working
    directory and is created on demand.
    """
    # One call creates both 'data' and 'data/enron' when missing; exist_ok
    # removes the check-then-create race of isdir() followed by mkdir().
    os.makedirs('data/enron', exist_ok=True)

    # If file already exists, we are overwriting it here.
    dataframe.to_hdf('data/enron/' + name, key=key, mode='w')

In [20]:
# Fetch the first Enron set and wrap it in a DataFrame: message text in BODY,
# boolean spam label in SPAM.
message_bodies, message_labels = extract_to_numpy('/enron1.tar.gz')
spam_filter_data = pd.DataFrame({'BODY': message_bodies, 'SPAM': message_labels})

# 90/10 train/test split.
# random_state is a seed value, don't change unless you want to refresh the splits
spam_filter_train = spam_filter_data.sample(frac=0.9, random_state=100)
# Rows that were not sampled form the test set. This has to run before the
# training index is reset, because it relies on the sampled row labels.
spam_filter_test = spam_filter_data.drop(spam_filter_train.index).reset_index(drop=True)
spam_filter_train = spam_filter_train.reset_index(drop=True)

# Label the test rows so that they continue right after the training rows.
spam_filter_offset = len(spam_filter_train)
spam_filter_test.index = list(
    range(spam_filter_offset, spam_filter_offset + len(spam_filter_test))
)

save_dataframe(spam_filter_train, 'spam_filter_train.h5', 'spam_filter_train')
save_dataframe(spam_filter_test, 'spam_filter_test.h5', 'spam_filter_test')

Statistics for splits:

In [27]:
# Share of spam in each split, in percent. SPAM is a boolean column, so its
# mean is the fraction of True values. Unlike value_counts(...)[True], this
# yields 0.0 instead of raising KeyError when a split contains no spam.
train_spam_proportion = spam_filter_train['SPAM'].mean() * 100
test_spam_proportion = spam_filter_test['SPAM'].mean() * 100

print(f'Percentage of spam in training set: {train_spam_proportion:.1f}%')
print(f'Percentage of spam in testing set: {test_spam_proportion:.1f}%')

Percentage of spam in training set: 29.1%
Percentage of spam in testing set: 27.9%


We use the other two datasets for the hamifiers/spamifiers:

In [4]:
# Fetch the third Enron set and wrap it in a DataFrame: message text in BODY,
# boolean spam label in SPAM.
# NOTE(review): the surrounding text speaks of two datasets for the
# hamifiers/spamifiers, but only enron3 is loaded here - confirm whether
# enron2 should be included as well.
message_bodies, message_labels = extract_to_numpy('/enron3.tar.gz')
hamifier_data = pd.DataFrame({'BODY': message_bodies, 'SPAM': message_labels})

# Drop the notebook-level references to the raw arrays once the DataFrame
# has been built.
del message_bodies, message_labels

# 90/10 train/test split with a fixed seed.
hamifier_train = hamifier_data.sample(frac=0.9, random_state=100)
# Rows that were not sampled form the test set. This has to run before the
# training index is reset, because it relies on the sampled row labels.
hamifier_test = hamifier_data.drop(hamifier_train.index).reset_index(drop=True)
hamifier_train = hamifier_train.reset_index(drop=True)

# Label the test rows so that they continue right after the training rows.
hamifier_offset = len(hamifier_train)
hamifier_test.index = list(
    range(hamifier_offset, hamifier_offset + len(hamifier_test))
)

save_dataframe(hamifier_train, 'hamifier_train.h5', 'hamifier_train')
save_dataframe(hamifier_test, 'hamifier_test.h5', 'hamifier_test')

In [5]:
# Share of spam in each split, in percent. SPAM is a boolean column, so its
# mean is the fraction of True values. Unlike value_counts(...)[True], this
# yields 0.0 instead of raising KeyError when a split contains no spam.
train_spam_proportion = hamifier_train['SPAM'].mean() * 100
test_spam_proportion = hamifier_test['SPAM'].mean() * 100

print(f'Percentage of spam in training set: {train_spam_proportion:.1f}%')
print(f'Percentage of spam in testing set: {test_spam_proportion:.1f}%')

Percentage of spam in training set: 27.4%
Percentage of spam in testing set: 26.0%
