In [14]:
import os, tarfile
import email
import numpy as np  
from sklearn.datasets import load_files
from sklearn.base import *
from sklearn.model_selection import StratifiedShuffleSplit
from email.parser import BytesParser, Parser
from email.policy import default
dataset_path = "dataset/"
only_easy_ham = True

def extract_bz2(filename, path="."):
    with tarfile.open(filename, "r:bz2") as tar:
        tar.extractall(path)

def removeHardHam(data, targets, hard_ham_index):
    _to_delete = []
    for _index in range(len(data)):
        if targets[_index] == int(hard_ham_index):
            _to_delete.append(_index)
        
    return np.delete(data, _to_delete),  np.delete(targets, _to_delete)


#The dataset has multiple labels such as easy_ham, easy_ham2,
# we want to simply convert to ham or spam
def binarizeLabels(targets):
    for _index in range(targets.size):
        if "ham" in targets[_index]:
            targets[_index] = "ham"
        else:
            targets[_index] = "spam"
            
    return targets
 
   
def getEmailPayload(email_message):
    body = []
    if email_message.is_multipart():
        for part in email_message.get_payload():
            body.append( part.get_payload() )
    else:
        body = email_message.get_payload()
    return body

def convertIndexToString(targets, index_to_string_dict):
    new_targets = []
    for _target in targets:
        new_targets.append( index_to_string_dict.get(_target) )
    
    return np.asarray(new_targets )

class PreprocessStrToEmail(BaseEstimator, TransformerMixin):
    def __init__(self, only_body = True): # no *args or **kargs
        self.only_body = only_body
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        _parser = Parser(policy=default)
        #parsestr(_data[0].decode('UTF-8'))
        _X = np.array(_parser.parsestr(x.decode('UTF-8')) for x in X)
        if self.only_body:
            X = np.array(getEmailPayload(x) for x in _X)
        else:
            pass
        return X

class PreprocessHardHam(BaseEstimator, TransformerMixin):
    def __init__(self, remove_hard_ham = True): # no *args or **kargs
        self.remove_hard_ham = remove_hard_ham
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
        

In [2]:
os.chdir("SpamAssassin")
compressed_files = [x for x in os.listdir()  if x.endswith(".bz2")]
for _file in compressed_files:    
    extract_bz2(_file, path=dataset_path)

In [3]:
_dataset = load_files(dataset_path)
_data = np.array(_dataset.data)
_target = _dataset.target
_target_names =  _dataset.target_names


As we can see below, every data has a categorical label contained in the following set: 

In [4]:
print(_target_names)

['easy_ham', 'easy_ham_2', 'hard_ham', 'spam', 'spam_2']


In [5]:

if only_easy_ham:
    _hard_ham_index = _target_names.index('hard_ham')
    _data, _target = removeHardHam(_data, _target, _hard_ham_index)

It's desirable to simply set the labels as whether they are ham/spam:

In [6]:
index_to_str_label = dict()
zipped = zip(list(range(_target_names.__len__())), _target_names )
for x,y in zipped:
    index_to_str_label.update({x:y})

_target = convertIndexToString(_target,index_to_str_label )
_target = binarizeLabels(_target)

print(_target[:16])


['spam' 'ham' 'spam' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham']


In [7]:
print(_data[0])


b'From zzzz-latestdodgydotcomstock@jmason.org  Mon Sep  2 16:27:31 2002\nReturn-Path: <zzzz-latestdodgydotcomstock@spamassassin.taint.org>\nDelivered-To: zzzz@localhost.spamassassin.taint.org\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 75B1143F99\n\tfor <zzzz@localhost>; Mon,  2 Sep 2002 11:26:17 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor zzzz@localhost (single-drop); Mon, 02 Sep 2002 16:26:19 +0100 (IST)\nReceived: from c448022-c (12-243-62-67.client.attbi.com [12.243.62.67]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with SMTP id g7V2kdZ20239 for\n    <zzzz-latestdodgydotcomstock@jmason.org>; Sat, 31 Aug 2002 03:46:40 +0100\nMessage-Id: <200208310246.g7V2kdZ20239@dogma.slashnull.org>\nFrom: "zzzz-latestdodgydotcomstock" <yyyy-latestdodgydotcomstock@spamassassin.taint.org>\nTo: "zzzz-latestdodgydotcomstock" <yyyy-latestdodgydotcomstock@spamassassin.taint.org>\

In [8]:
headers = Parser(policy=default).parsestr(_data[0].decode('UTF-8'))

In [9]:
print('To: {}'.format(headers['to']))


To: zzzz-latestdodgydotcomstock <yyyy-latestdodgydotcomstock@spamassassin.taint.org>


In [10]:
print('From: {}'.format(headers['from']))


From: zzzz-latestdodgydotcomstock <yyyy-latestdodgydotcomstock@spamassassin.taint.org>


In [11]:
elem = headers.get_payload()
type(elem)
print(elem[-1])

Content-Type: text/plain
Content-Transfer-Encoding: base64

UGxlYXNlIGZvcmdpdmUgdGhlIGludHJ1c2lvbiwgdGhpcyBpcyBhIG9uZSB0aW1lIG9ubHkg
dGVzdCwgcGxlYXNlIGRlbGV0ZS4gDQpZb3Ugc2hvdWxkIG5vdCByZWNlaXZlIGFueSBhZGRp
dGlvbmFsIGVtYWlscyBmcm9tIHRoaXMgYWRkcmVzcywgaWYgeW91IGRvIHBsZWFzZSBzZW5k
IGFuIGVtYWlsIA0Kd2l0aCByZW1vdmUgYXMgc3ViamVjdCB0bzoNCg0KdGVzdDc5OTFAeWFo
b28uY29tDQoNClRoYW5rIHlvdSBmb3IgeW91ciB1bmRlcnN0YW5kaW5nLg0KDQogICAg


In [12]:
#Let's see the ratio of spam to ham:
spam_occurrences = np.count_nonzero(_target == 'spam')
ham_occurrences = np.count_nonzero(_target == 'ham')
print(spam_occurrences/ham_occurrences)

0.4866735007688365


In [15]:
pre_process = PreprocessStrToEmail()
pre_process.fit_transform(_data)

TypeError: iteration over a 0-d array

In [8]:
#Using stratifies shuffle to obtain a roughly similar ratio:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(_data, _target):
    X_strat_train_set = _data[train_index]
    X_strat_test_set =  _data[test_index]
    Y_strat_train_set = _target[train_index]
    Y_strat_test_set = _target[test_index]

In [10]:
#OK, the train set does accurately reflects the data
spam_occurrences = np.count_nonzero(Y_strat_train_set == 'spam')
ham_occurrences = np.count_nonzero(Y_strat_train_set == 'ham')
print(spam_occurrences/ham_occurrences)

0.4867029798141621
