This is my solution to an exercise to make a spam classifier using Apache SpamAssassin’s public datasets, which I am writing to learn about machine learning. The exercise is from 
_Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow, 2nd Edition_, by
Aurélien Géron.

Copyright (C) 2022 Chris March <https://github.com/chrismarch>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

## Contents
- [Load and Preprocess](#Load-and-Preprocess)
- [Validate](#Validate)
- [Train](#Train)
- [Test](#Test)

## Load and Preprocess

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator, TransformerMixin
import re
import email
from email import policy
from io import StringIO
import joblib

In [2]:
from enum import IntFlag

class PreprocessFlags(IntFlag):
    TO_LOWER = 1
    STRIP_PUNCTUATION = 2
    REPLACE_URLS = 4
    REPLACE_NUMBERS = 8
    
    ALL_IMPLEMENTED_FLAGS = TO_LOWER | STRIP_PUNCTUATION | REPLACE_URLS | REPLACE_NUMBERS
    
    STRIP_HEADERS = 16     # TODO
    STEM = 32              # TODO

In [3]:
import re
import string

CATEGORY_LABEL = "is_spam"

def preprocess_token(token : str, flags : PreprocessFlags) -> str:
    
    if (token == CATEGORY_LABEL):
        return token

    if (PreprocessFlags.TO_LOWER in flags):
        token = token.lower()

    if (PreprocessFlags.REPLACE_URLS in flags):
        if re.match(r'http\S+', token):
            token = 'URLREPLACED'

    if (PreprocessFlags.STRIP_PUNCTUATION in flags):
        token = token.translate(str.maketrans('', '', string.punctuation))    

    if (PreprocessFlags.REPLACE_NUMBERS in flags):
        token = re.sub(r'[0-9]+', '_N_', token)

    return token

In [4]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    ''' MLStripper by "Olivier Le Floch" https://stackoverflow.com/a/925630 '''
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self): 
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [5]:
def email_get_body(b):
    ''' 
    email_get_body by Todor Minikov https://stackoverflow.com/a/32840516
    (this seems to be more robust, or at least easier to use without errors than email.Parser.get_body)
    '''
    body = ""
    if b.is_multipart():
        for part in b.walk():
            ctype = part.get_content_type()
            cdispo = str(part.get('Content-Disposition'))

            # skip any text/plain (txt) attachments
            if ctype == 'text/plain' and 'attachment' not in cdispo:
                body = part.get_payload(decode=True)  # decode
                break
    # not multipart - i.e. plain text, no attachments, keeping fingers crossed
    else:
        body = b.get_payload(decode=True)

    return body

In [6]:
spam_key = 'is_spam'

def load_email_files_to_dataframe(flags : PreprocessFlags):
    vocabulary = {}
    mail_dicts = []
    file_index = 0

    for parent_dir, subdirs, files in os.walk('.'):
        #print(parent_dir)
        #print(subdirs)
        #print('---')
        html_regex = re.compile(r"(<html>|<HTML>)(.*)(<\/html>|<\/HTML>)", re.DOTALL)
        hamdir = '_ham' in parent_dir
        spamdir = 'spam' in parent_dir
        if hamdir or spamdir:
            for file in files:
                file_index += 1
                rel_file_path = os.path.join(parent_dir, file) 
                #print(rel_file_path)
                with open(rel_file_path, 'r', encoding='iso-8859-1') as f:
                #with open(rel_file_path, 'rb') as f:
                    f_str = f.read()   
                    msg = email.message_from_string(f_str, policy=policy.default)
                    #msg = email.parser.BytesParser(policy=policy.default).parse(f)
                    if file_index % 1000 == 0:
                        print(str(file_index))
                    subject = msg['subject']
                    #print(subject)
                    #print(msg['header'])
                    #body = msg.get_body(preferencelist=('html', 'plain'))
                    #if not body:
                    #    continue
                    #body = body.get_content()
                    body = str(email_get_body(msg))
                    #print(body)
                    html_match = html_regex.search(body)
                    #print(html_match)
                    if html_match:
                        body = html_match.group(2)
                        #print('------------')
                        #print(body)
                        #print('------------')
                    body_strip = strip_tags(body)
                    body_strip = re.sub(r"\\n", ' ', body_strip)
                    body_strip = re.sub(r"\\t", ' ', body_strip)
                    #print(body_strip)
                    mail_tokens = body_strip.split()
                    if subject:
                        mail_tokens += subject.split()
                    mail_dict = {}
                    mail_rejected_tokens = []
                    for token in mail_tokens:
                        token = preprocess_token(token, flags)
                        if token in mail_dict:
                            mail_dict[token] = mail_dict[token] + 1
                        else:
                            mail_dict[token] = 1
                            vocabulary[token] = 0
                    mail_dict[spam_key] = 1 if 'spam' in rel_file_path else 0
                    mail_dicts.append(mail_dict)
                #if file_index > 100:
                #break
    print('finished loading emails')
    
    columns = [CATEGORY_LABEL] + list(vocabulary.keys())
    del vocabulary
    
    # TODO find a one step way to zero (instead of nan) fill missing values in sparse DataFrames
    nan_df = pd.DataFrame(data=mail_dicts, columns=columns, dtype = pd.SparseDtype(pd.UInt32Dtype()))
    del mail_dicts
    
    return pd.DataFrame(data=nan_df, columns=columns, dtype = pd.SparseDtype(np.dtype('int32'), fill_value=0))

In [None]:
# warning: this can take a couple hours

# save out preprocessed binary dataframes, so that the time needed for preprocessing hyperparameters is done once
n_flag_combos = PreprocessFlags.ALL_IMPLEMENTED_FLAGS + 1
flags_range = range(n_flag_combos)
for flags in flags_range:
    if flags == 0:
        continue
    all_data = load_email_files_to_dataframe(PreprocessFlags(flags))
    print('---------------- Preprocess: saving ' + str(flags) + 'of [0, ' + str(n_flag_combos) + ']')
    print(all_data.sparse.density)
    print(all_data.info())
    joblib.dump(all_data, "spamham_flags" + str(flags) + ".pkl")
    del all_data

## Validate

In [50]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

def display_scores(estimator, scores):
    print("\n")
    print(type(estimator).__name__)
    #print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def print_cv_scores(estimators, X, y):
    scores_and_estimators = []
    for e in estimators:
        scores = cross_val_score(e, X, y, cv=5, verbose=2)
        scores_and_estimators.append((scores.mean(), scores, e))

    scores_and_estimators.sort(key = lambda x: x[0], reverse=True)
    for mean, scores, e in scores_and_estimators:
        display_scores(e, scores)

def load_train_test_data(flags : PreprocessFlags) -> (str, str, str, str):
    ''' Returns X_train, y_train, X_test, y_test '''
    
    fname = "spamham_flags" + str(flags) +".pkl"
    print("loading " + fname)

    all_data = joblib.load(fname)
    print(all_data.info())
    print(all_data.sparse.density)

    #all_data = pd.DataFrame(data=nan_data, dtype=pd.SparseDtype(np.dtype('float64'), fill_value=0))
    #nan_data = None
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    y_all = all_data[CATEGORY_LABEL]
    for train_indexes, test_indexes in split.split(all_data, y_all):
        strat_train_set = all_data.loc[train_indexes]
        strat_test_set = all_data.loc[test_indexes]

    del all_data
    #print(list(strat_train_set))
    X_train = strat_train_set.drop(CATEGORY_LABEL, axis=1)
    y_train = strat_train_set[CATEGORY_LABEL].copy()
    del strat_train_set

    # test data split from train.csv, since test.csv has no labels
    X_test = strat_test_set.drop(CATEGORY_LABEL, axis=1)
    y_test = strat_test_set[CATEGORY_LABEL].copy()
    del strat_test_set
    
    print(fname)
    return (X_train, y_train, X_test, y_test)
        
def validate(preprocessor_flags_range):
    for flags in preprocessor_flags_range:
        X_train, y_train, X_test, y_test = load_train_test_data(flags)
        linsvc_clf = LinearSVC(C=.01)
        mnb_clf = MultinomialNB(alpha=1e-4, fit_prior=False)                       

        estimators = [mnb_clf]#, linsvc_clf]
        #estimators = [linsvc_clf]
        print_cv_scores(estimators, X_train, y_train)

In [None]:
validate(range(0,13))


MultinomialNB (words only, no replacement)  
Mean: 0.9759302325581395
Standard deviation: 0.0023139242723409837

LinearSVC (pp 9, c=.1)  
Mean: 0.9882558139534885
Standard deviation: 0.0020604703658917765

MultinomialNB (unfiltered tokens (no headers or html tags, #body_strip = re.sub(r"[()\"\'-]", '', body_strip)))  
Mean: 0.9853488372093022
Standard deviation: 0.002156655464068763


MultinomialNB (unfiltered tokens (no headers or html tags,). 
Mean: 0.9856976744186046
Standard deviation: 0.0022845212446963464

MultinomialNB (pp 0)  
Mean: 0.9909302325581395
Standard deviation: 0.0018964542360814004

MultinomialNB (pp 1)  
Mean: 0.9906976744186047
Standard deviation: 0.0014708268186829627

MultinomialNB (pp 2). 
Mean: 0.9894186046511628
Standard deviation: 0.0017787277372998196

MultinomialNB (3)  
Mean: 0.9890697674418604
Standard deviation: 0.001000270379888683

MultinomialNB (4)  
Mean: 0.9906976744186047
Standard deviation: 0.0015160935826052818

MultinomialNB (5)  
Mean: 0.9897674418604652
Standard deviation: 0.0016689186156287625

MultinomialNB (6)  
Mean: 0.9891860465116279
Standard deviation: 0.001786312964620599

MultinomialNB (7)  
Mean: 0.9877906976744185
Standard deviation: 0.0008222171874262352

MultinomialNB (8)  
Mean: 0.991046511627907
Standard deviation: 0.002313924272340988

MultinomialNB (9)  
Mean: **0.991046511627907**
Standard deviation: **0.0010783277320343995**

MultinomialNB (10)  
Mean: 0.9901162790697674
Standard deviation: 0.0016850438076964278

MultinomialNB (11)  
Mean: 0.9895348837209303
Standard deviation: 0.000636886694773442

MultinomialNB (12)  
Mean: 0.991046511627907
Standard deviation: 0.0019317729913762727

MultinomialNB (pp 13)  
Mean: 0.9901162790697675
Standard deviation: 0.0012195451722908676

MultinomialNB (pp 14)  
Mean: 0.9893023255813954
Standard deviation: 0.001748057718415442

MultinomialNB (preprocess 15)  
Mean: 0.9883720930232558
Standard deviation: 0.0011627906976744095

In [None]:
import sys

def get_largest_user_globals():
    ''' Returns global objects with largest memory footprint, attempting to isolate those allocated by user
        Adapted from https://stackoverflow.com/a/40997868 by Abdou '''
    
    # These are the usual ipython objects, including this one you are creating
    ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

    # Get a sorted list of the objects and their sizes
    sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

## Train

hyperparameters to your preparation pipeline to control whether or not to 
- (strip off email headers)
- convert each email to lowercase, 
- remove punctuation, 
- replace all URLs with “URL,” 
- replace all numbers with “NUMBER,” 
- (or even perform stemming (i.e., trim off word endings; there are Python libraries available to do this).)

Finally, try out several classifiers and see if you can build a great spam classifier, with both high recall and high precision

In [72]:
X_train, y_train, X_test, y_test = load_train_test_data((PreprocessFlags.TO_LOWER | 
                                                        PreprocessFlags.REPLACE_NUMBERS).value)

loading spamham_flags9.pkl
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10751 entries, 0 to 10750
Columns: 128305 entries, is_spam to https://listman.redhat.com/
dtypes: Sparse[int32, 0](128305)
memory usage: 13.1 MB
None
0.00124088027083492
spamham_flags9.pkl


In [90]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB

mnb_param_grid = [
    {'alpha': [1e-3, 1e-1,1e-2] },# 'fit_prior': [True, False] }
  ]

linsvc_param_grid = [
    {'C' : [.19, .18, .2], 'max_iter' : [2700,2750,2650]},
  ]

linsvc_clf = LinearSVC(C=.19, max_iter=2700)
mnb_clf = MultinomialNB(alpha=1e-2, fit_prior=False) 

grid_search = GridSearchCV(linsvc_clf, linsvc_param_grid, cv=5,
                           #return_train_score=True, 
                           verbose=2)

main_pipeline = Pipeline([
    #("tfidf", TfidfTransformer()),
    #('mnb', mnb_clf),
    #('bnb', BernoulliNB(alpha=1e-2, fit_prior=False))
    ('cnb', ComplementNB(alpha=1e-2))#, fit_prior=False))
#    ('linsvc', linsvc_clf),
#    ('grid', grid_search)
])

In [91]:
pipe_out = main_pipeline.fit(X_train, y_train)
#print(grid_search.best_params_)

## Test

In [92]:
from sklearn.metrics import precision_score, recall_score

#X_test_tr = col_trans.fit_transform(X_test)
#print(list(X_train))
#print(list(X_test))

y_test_predict = pipe_out.predict(X_test)
print(precision_score(y_test, y_test_predict)) # first priority for spam classifier
print(recall_score(y_test, y_test_predict))    # second priority
score = f1_score(y_test, y_test_predict, average="macro")
score

0.9921363040629095
0.9960526315789474


0.9954260178049563

### Results (F1)

#### MNB/CNB
- 0.9842730411939666 url replace
- **0.9954260178049563** (alpha=1e-2, fit_prior=False, preprocessor 9: num. replace, to lower)
- 0.9934009451494716 (tfidf, alpha=1e-2, fit_prior=False, pp 9, )

#### BNB
- 0.9949193040934678 (alpha=1e-2, fit_prior=False, pp 9)

#### LinSVC
- 0.9913856161203258 (C=.19, max_iter=2700)
- 0.9923721619016614 (tfidf, C=.19, max_iter=2700)