In [22]:
import os
import re
import string
import tarfile
from email.parser import Parser
from email.policy import default

import html2text
from nltk.stem.porter import *
from sklearn.base import *
import numpy as np
from sklearn.datasets import load_files
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score

# --- Notebook-wide configuration ---------------------------------------------
# Directory (relative to the working directory) used for the extracted corpus.
dataset_path = "dataset/"

# Regular expressions applied to the message text during preprocessing:
# http/https URLs, and integer / decimal / scientific-notation numbers.
url_pattern = r'https?://\S+'
number_pattern = r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?'

# Switch read by the notebook to decide whether the 'hard_ham' class is removed.
only_easy_ham = False

# Lazily created helpers shared by the functions below (built on first use).
email_parser = htmlconverter = stemmer = None

def extract_bz2(filename, path="."):
    """Extract a bzip2-compressed tar archive into ``path``.

    Args:
        filename: Path of the ``.tar.bz2`` archive to open.
        path: Destination directory (defaults to the current directory).

    Raises:
        tarfile.TarError: If the archive cannot be read, or if any member
            would be written outside ``path`` (``../`` or absolute names).
    """
    destination = os.path.realpath(path)
    with tarfile.open(filename, "r:bz2") as tar:
        # The archive is external input: refuse members that would escape the
        # destination instead of letting extractall() write them.
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(destination, member.name))
            try:
                inside = os.path.commonpath([destination, target]) == destination
            except ValueError:
                # e.g. paths on different drives
                inside = False
            if not inside:
                raise tarfile.TarError(
                    "unsafe member path in {}: {}".format(filename, member.name)
                )

        if hasattr(tarfile, "data_filter"):
            # Interpreters that ship extraction filters: "data" additionally
            # rejects links that point outside the destination.
            tar.extractall(path, filter="data")
        else:
            tar.extractall(path)

def removeHardHam(data, targets, hard_ham_index):
    """Return copies of ``data`` and ``targets`` without the hard-ham samples.

    Args:
        data: Indexable collection of samples; only its length is used here.
        targets: Labels aligned with ``data``.
        hard_ham_index: Label value to remove (converted with ``int``).

    Returns:
        A ``(data, targets)`` tuple produced by ``np.delete`` with every
        position whose label equals ``int(hard_ham_index)`` removed.
    """
    drop_positions = [
        position
        for position in range(len(data))
        if targets[position] == int(hard_ham_index)
    ]
    filtered_data = np.delete(data, drop_positions)
    filtered_targets = np.delete(targets, drop_positions)
    return filtered_data, filtered_targets


# The dataset has several label variants (easy_ham, easy_ham_2, hard_ham,
# spam, spam_2); collapse them into the two classes "ham" and "spam".
def binarizeLabels(targets):
    """Map every label to ``"ham"`` or ``"spam"``.

    A label becomes ``"ham"`` when it contains the substring ``"ham"`` and
    ``"spam"`` otherwise.

    A new array is returned and the input is left untouched. Writing into the
    input array in place would truncate ``"spam"`` whenever the array's
    fixed-width string dtype is narrower than four characters.

    Args:
        targets: 1-D sequence or array of string labels.

    Returns:
        ``numpy.ndarray`` of dtype ``<U4`` with the same length as ``targets``.
    """
    labels = np.asarray(targets)
    return np.array(
        ["ham" if "ham" in label else "spam" for label in labels],
        dtype="<U4",
    )

"""
processMessageObj 
Helper function only process text/plain or text/html. Images/gifs 
or other multimedia are not useful. Multiparts are also not
useful because they will be visited later with walk()

"""
def processMessageObj(message_object):
    """Return the text carried by a single message part, or ``None``.

    Only ``text/plain`` and ``text/html`` parts are handled; HTML is converted
    to plain text with ``transformHTMLtoPlain``. Any other content type
    (images, attachments, multipart containers, ...) yields ``None``.

    The payload bytes are decoded with the charset declared by the part.
    When no charset is declared, the charset is unknown, or the bytes are not
    valid in it, decoding falls back to latin-1, which accepts every byte.

    Args:
        message_object: A parsed message (or message part) exposing
            ``get_content_type``, ``get_content_charset`` and ``get_payload``.

    Returns:
        The decoded text as ``str``, or ``None`` when the part is not textual
        or has no payload.
    """
    content_type = message_object.get_content_type()
    if content_type not in ("text/plain", "text/html"):
        return None

    raw = message_object.get_payload(decode=True)
    if raw is None:
        # Nothing to decode (e.g. a part without a body).
        return None

    text = None
    charset = message_object.get_content_charset()
    if charset:
        try:
            text = raw.decode(charset)
        except (LookupError, ValueError):
            # Unknown codec name or bytes that are invalid in the declared
            # charset: use the permissive fallback below.
            text = None
    if text is None:
        text = raw.decode('latin-1')

    if content_type == "text/html":
        text = transformHTMLtoPlain(text)

    return text

def transformHTMLtoPlain(_str, ignore_images=True):
    """Convert an HTML string to plain text with ``html2text``.

    A single ``html2text.HTML2Text`` instance is created on first use and
    stored in the module-level ``htmlconverter``.

    Args:
        _str: HTML markup to convert.
        ignore_images: Value assigned to the converter's ``ignore_images``
            option for this call.

    Returns:
        Whatever ``HTML2Text.handle`` returns for ``_str``.
    """
    global htmlconverter

    if htmlconverter is None:
        htmlconverter = html2text.HTML2Text()

    # Assign on every call: the converter is shared, so only ever setting the
    # flag to True would make ignore_images=False a no-op after the first
    # call that enabled it.
    # NOTE(review): other parser state may also persist on the shared
    # instance between documents - confirm against the html2text version used.
    htmlconverter.ignore_images = bool(ignore_images)

    return htmlconverter.handle(_str)


def stemmfy(message):
    """Stem every whitespace-separated token of ``message``.

    The ``PorterStemmer`` is created on first use and kept in the
    module-level ``stemmer``.

    Args:
        message: Text to stem.

    Returns:
        The stemmed tokens joined by single spaces, or ``None`` when
        ``message`` contains no tokens.
    """
    global stemmer
    if stemmer is None:
        stemmer = PorterStemmer()

    # Collect first and join once instead of growing a string token by token.
    stems = [stemmer.stem(word) for word in message.split()]

    # Keep the existing contract: no tokens -> None (not an empty string).
    return " ".join(stems) if stems else None


def getEmailSubject(email_message):
    """Return the ``subject`` header of a raw e-mail.

    Args:
        email_message: Raw message as bytes; it is decoded as ISO-8859-1
            before parsing.

    Returns:
        The value looked up under ``"subject"`` on the parsed message.
    """
    global email_parser

    # Build the shared parser on first use.
    email_parser = Parser(policy=default) if email_parser is None else email_parser

    # ISO-8859-1 is used because UTF-8 does not work here (some e-mails
    # contain latin characters).
    parsed = email_parser.parsestr(email_message.decode('iso-8859-1'))
    return parsed["subject"]

"""
Processes the email "payload", that is, the actual message. This processing includes parsing the byte stream
into an email object, then decoding it and finally walking through the "sections"

input: email_message = a numpy array element, representing a byte stream

"""
def processEmailPayload(email_message, to_lower_case, stemm=True, remove_punct = True, substitute_number=True):
    """Turn a raw e-mail into a single cleaned-up text string.

    Steps, in order: parse the bytes into a message object, collect the text
    of every part accepted by ``processMessageObj``, optionally lower-case,
    replace URLs with `` _URL_ ``, optionally replace numbers with ``NUMBER``,
    optionally strip ``string.punctuation`` and optionally stem.

    Args:
        email_message: Raw message as bytes (decoded here as ISO-8859-1).
        to_lower_case: Lower-case the text before any substitution.
        stemm: Stem the tokens with ``stemmfy``.
        remove_punct: Remove every character in ``string.punctuation``.
        substitute_number: Replace matches of ``number_pattern`` by ``NUMBER``.

    Returns:
        The processed text; an empty string when nothing is left.
    """
    global email_parser

    if email_parser is None:
        email_parser = Parser(policy=default)

    # UTF-8 does not work (some e-mails contain latin characters), so the raw
    # bytes are decoded as ISO-8859-1.
    parsed = email_parser.parsestr(email_message.decode('iso-8859-1'))

    # walk() yields the message itself and, for multipart messages, every
    # nested part, so one loop covers both the single-part and multipart case.
    part_texts = (processMessageObj(part) for part in parsed.walk())

    # Join with a newline so the last token of one part cannot fuse with the
    # first token of the next; a single-part message is left unchanged.
    body = "\n".join(text for text in part_texts if text is not None)

    if to_lower_case:
        body = body.lower()

    # URLs are always replaced (hardcoded): keeping them only adds noise.
    body = re.sub(pattern=url_pattern, repl=' _URL_ ', string=body)

    if substitute_number:
        body = re.sub(pattern=number_pattern, repl='NUMBER', string=body)

    if remove_punct:
        body = body.translate(str.maketrans('', '', string.punctuation))

    if stemm:
        body = stemmfy(body)

    # stemmfy returns None when the text has no tokens left; normalise that to
    # an empty string so later steps always receive a str.
    return body if body is not None else ""



def convertIndexToString(targets, index_to_string_dict):
    """Translate each entry of ``targets`` through ``index_to_string_dict``.

    Args:
        targets: Iterable of keys (e.g. integer class indices).
        index_to_string_dict: Mapping from key to label; missing keys map to
            ``None`` because ``dict.get`` is used.

    Returns:
        ``numpy.ndarray`` holding the looked-up values in the same order.
    """
    lookup = index_to_string_dict.get
    return np.asarray([lookup(key) for key in targets])

class PreprocessStrToEmail(BaseEstimator, TransformerMixin):
    """Transformer that runs ``processEmailPayload`` on every raw e-mail.

    Args:
        to_lower_case: Lower-case the message text.
        stemm: Stem the tokens.
        remove_punct: Strip punctuation.
        substitute_number: Replace numbers with ``NUMBER``.
    """

    def __init__(self, to_lower_case=False, stemm=True, remove_punct=True,
                 substitute_number=True):  # no *args or **kargs
        self.to_lower_case = to_lower_case
        self.stemm = stemm
        self.remove_punct = remove_punct
        self.substitute_number = substitute_number

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learnt from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return an array with one processed text per element of ``X``.

        Args:
            X: Iterable of raw e-mails as accepted by ``processEmailPayload``.
            y: Unused.
        """
        # processEmailPayload manages its own parser, so none is built here.
        return np.array([
            processEmailPayload(xi, self.to_lower_case, self.stemm,
                                self.remove_punct, self.substitute_number)
            for xi in X
        ])
 




        


In [2]:
# Enter the corpus folder only once, so that re-running this cell does not try
# to chdir into a nested "SpamAssassin/SpamAssassin".
if os.path.basename(os.getcwd()) != "SpamAssassin":
    os.chdir("SpamAssassin")

# Sorted so the archives are always extracted in the same order.
compressed_files = sorted(x for x in os.listdir() if x.endswith(".bz2"))
for _file in compressed_files:
    extract_bz2(_file, path=dataset_path)

In [3]:
# Load the extracted corpus from dataset_path.
_dataset = load_files(dataset_path)
# Raw samples as a numpy array (the printed sample further down is a bytes object).
_data = np.array(_dataset.data)
# Class index of every sample, and the class names those indices refer to.
_target = _dataset.target
_target_names =  _dataset.target_names



As we can see below, every sample has a categorical label drawn from the following set:

In [4]:
# Show the class names found in the dataset.
print(_target_names)


['easy_ham', 'easy_ham_2', 'hard_ham', 'spam', 'spam_2']


In [5]:
# Optionally drop every 'hard_ham' sample (controlled by only_easy_ham).
if only_easy_ham:
    # Position of 'hard_ham' in the class-name list, used as the label to remove.
    _hard_ham_index = _target_names.index('hard_ham')
    _data, _target = removeHardHam(_data, _target, _hard_ham_index)


It is simpler to collapse the labels into just two classes, ham and spam:

In [6]:
# Map each integer class index to its name (position in _target_names).
index_to_str_label = dict(enumerate(_target_names))

# Integer labels -> class names -> "ham" / "spam".
# NOTE: this cell rebinds _target, so it is meant to run once per kernel.
_target = convertIndexToString(_target, index_to_str_label)
_target = binarizeLabels(_target)

print(_target[:16])


['spam' 'ham' 'spam' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham']


As we can see below, a raw message contains a lot of noise, such as the protocol used, which IP address sent the message, etc.

In [7]:
# Show one raw, unprocessed sample (headers included).
print(_data[164])

b"From razor-users-admin@lists.sourceforge.net  Wed Aug 14 10:48:36 2002\nReturn-Path: <razor-users-admin@example.sourceforge.net>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id 7FECA43C56\n\tfor <jm@localhost>; Wed, 14 Aug 2002 05:46:04 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor jm@localhost (single-drop); Wed, 14 Aug 2002 10:46:04 +0100 (IST)\nReceived: from usw-sf-list2.sourceforge.net (usw-sf-fw2.sourceforge.net\n    [216.136.171.252]) by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id\n    g7DJEY414900 for <jm-razor@jmason.org>; Tue, 13 Aug 2002 20:14:34 +0100\nReceived: from usw-sf-list1-b.sourceforge.net ([10.3.1.13]\n    helo=usw-sf-list1.sourceforge.net) by usw-sf-list2.sourceforge.net with\n    esmtp (Exim 3.31-VA-mm2 #1 (Debian)) id 17egxh-0005RG-00; Tue,\n    13 Aug 2002 12:04:13 -0700\nReceived: from dhcp024-208-1

The preprocessing keeps only the message body (a `getEmailSubject` helper is also defined for extracting the subject, but the transformer does not use it). The other preprocessing hyperparameters are whether to convert to lower case, replace numbers with "NUMBER", stem the words, and remove punctuation. Below is a fully processed email message.

In [8]:
# Transformer with lower-casing, stemming and punctuation removal enabled
# (substitute_number keeps its default, True).
pre_process = PreprocessStrToEmail(to_lower_case=True, stemm=True, remove_punct = True )
# Process a single sample; it is wrapped in a list because transform iterates over X.
_data_processed_example = pre_process.fit_transform([_data[164]])

In [9]:
# Show the fully processed sample.
print(_data_processed_example)

['on tue number aug number david raistrick wrote to actual answer justin question one can assum that he ha rewritesubject and reporthead turn on becaus he want themand that he would like to be abl to strip the ad bit off befor he send them to razor someth as simpl as the follow would probabl work just fine just pipe your messag through thi then on into razorreport i wouldnt make that assumpt id assum that rewritesubject wa on and reporthead wa off becaus that the default configur and not everyon know to go look in the userpref file to make those chang thi sfnet email is sponsor by dice the lead onlin job board for hightech profession search and appli for tech job today url razorus mail list razoruserslistssourceforgenet url']


Below is a less preprocessed message. As we can see there is significantly more noise.

In [10]:
# Same sample with every optional step switched off (URL replacement still applies).
pre_process_weaker = PreprocessStrToEmail(to_lower_case=False, stemm=False, remove_punct = False, substitute_number=False)
# Rebinds _data_processed_example with the lightly processed version.
_data_processed_example = pre_process_weaker.fit_transform([_data[164]])

In [11]:
# Show the lightly processed sample.
print(_data_processed_example)

["On Tue, 13 Aug 2002, David Raistrick wrote:\n\n> To actually answer Justin's question, (one can assume that he has\n> rewrite_subject and report_header turned on because he wants them..and\n> that he would like to be able to strip the added bits off before he sends\n> them to razor) something as simple as the following would probably work\n> just fine.  Just pipe your message through this, then on into\n> razor-report:\n\nI wouldn't make that assumption.  I'd assume that rewrite_subject was on, \nand report_header was off, because that's the default configuration, and \nnot everyone knows to go look in the user_prefs file to make those \nchanges.\n\n\n\n-------------------------------------------------------\nThis sf.net email is sponsored by: Dice - The leading online job board\nfor high-tech professionals. Search and apply for tech jobs today!\n _URL_ \n_______________________________________________\nRazor-users mailing list\nRazor-users@lists.sourceforge.net\n _URL_ \n\n\n"]


Let's split the data into train and test sets using a stratified strategy. For comparison, let's first compute the original ratio between spam and ham:

In [12]:
# Count the samples of each class in the full dataset and report spam/ham.
num_spam = np.count_nonzero(_target == 'spam') 
num_ham = np.count_nonzero(_target == 'ham')
print("Ratio between spam and ham is: " + str(num_spam/num_ham))

Ratio between spam and ham is: 0.4572598121839634


In [13]:
# One stratified 80/20 shuffle split with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(sss.split(_data, _target))

X_strat_train_set, X_strat_test_set = _data[train_index], _data[test_index]
Y_strat_train_set, Y_strat_test_set = _target[train_index], _target[test_index]

As we can see, after the stratified shuffle the ratio in the training set is almost identical to the original one:

In [14]:
# Same spam/ham ratio, computed on the training labels only.
# Note: this rebinds num_spam / num_ham from the earlier full-dataset cell.
num_spam = np.count_nonzero(Y_strat_train_set == 'spam') 
num_ham = np.count_nonzero(Y_strat_train_set == 'ham')
print("Ratio between spam and ham in train set after \
stratified shuffle is: " + str(num_spam/num_ham))

Ratio between spam and ham in train set after stratified shuffle is: 0.45725466586393737


Let's create a preprocess_pipeline. The first transformer is the one shown above. The second is CountVectorizer, which builds a vocabulary from the words present in the array and counts how often each word occurs in every message.

In [15]:
# Two-step pipeline: raw e-mail bytes -> cleaned text -> word-count matrix.
preprocess_pipeline = Pipeline([
    # Text cleaning with lower-casing, stemming and punctuation removal.
    ("email_to_wordcount", PreprocessStrToEmail(to_lower_case=True, stemm=True, remove_punct = True)),
    # lowercase=False: case handling is left to the previous step.
    ("count_vectorizer", CountVectorizer(lowercase=False)),
])

In [16]:
# Re-derive the raw training e-mails from the split indices so this cell can
# be re-run: X_strat_train_set is rebound to the vectorised matrix below, and
# feeding that matrix back into the pipeline would not work.
X_strat_train_raw = _data[train_index]
X_strat_train_set = preprocess_pipeline.fit_transform(X_strat_train_raw)

In [17]:
# 3-fold cross-validation of a logistic-regression classifier on the
# vectorised training set; `score` holds the value returned by cross_val_score.
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42, n_jobs=2)
score = cross_val_score(log_clf, X_strat_train_set, Y_strat_train_set, cv=3, verbose=3)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.979) total time=   3.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s remaining:    0.0s


[CV] END ................................ score: (test=0.977) total time=   1.8s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.3s remaining:    0.0s


[CV] END ................................ score: (test=0.971) total time=   1.8s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.1s finished


Testing with the test dataset:

In [18]:
# transform (not fit_transform): the test e-mails go through the pipeline
# that was already fitted on the training set.
X_test_transformed = preprocess_pipeline.transform(X_strat_test_set)

In [20]:
# Train a fresh classifier on the whole (vectorised) training set.
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42, n_jobs=2)
# The result of fit() is not stored in `score`: that would replace the
# cross-validation scores computed earlier with the fitted estimator.
log_clf.fit(X_strat_train_set, Y_strat_train_set)

In [21]:
# Predict labels for the transformed test set.
y_pred = log_clf.predict(X_test_transformed)

In [28]:
# Precision and recall on the test set, with 'spam' as the positive class.
print("Precision: {:.2f}%".format(100 * precision_score(Y_strat_test_set, y_pred, pos_label='spam')))
print("Recall: {:.2f}%".format(100 * recall_score(Y_strat_test_set, y_pred, pos_label='spam')))

Precision: 97.61%
Recall: 96.58%
