This is my solution to an exercise to make a spam classifier using Apache SpamAssassin’s public datasets, which I am writing to learn about machine learning. The exercise is from 
_Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow, 2nd Edition_, by
Aurélien Géron.

Copyright (C) 2022 Chris March <https://github.com/chrismarch>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

## Contents
- [Load and Preprocess](#Load-and-Preprocess)
- [Validate](#Validate)
- [Train](#Train)
- [Test](#Test)

## Load and Preprocess

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator, TransformerMixin
import re
import email
from email import policy
from io import StringIO
import joblib

In [2]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    ''' MLStripper by "Olivier Le Floch" https://stackoverflow.com/a/925630 '''
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self): 
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [3]:
def email_get_body(b):
    ''' 
    email_get_body by Todor Minikov https://stackoverflow.com/a/32840516
    (this seems to be more robust, or at least easier to use without errors than email.Parser.get_body)
    '''
    body = ""
    if b.is_multipart():
        for part in b.walk():
            ctype = part.get_content_type()
            cdispo = str(part.get('Content-Disposition'))

            # skip any text/plain (txt) attachments
            if ctype == 'text/plain' and 'attachment' not in cdispo:
                body = part.get_payload(decode=True)  # decode
                break
    # not multipart - i.e. plain text, no attachments, keeping fingers crossed
    else:
        body = b.get_payload(decode=True)

    return body

In [133]:
spam_key = 'is_spam'

def load_email_files_to_dataframe():
    vocabulary = {}
    too_common_words = {}
    file_paths_to_mail_dicts = {}
    file_index = 0
    #result = pd.DataFrame(data=file_paths_to_mail_dicts.values(), columns=columns, dtype=pd.SparseDtype(pd.UInt8Dtype())) # has to be nan to start, fill_value=0))     
    #df_type = 'Sparse[int]';#pd.SparseDtype(pd.UInt8Dtype())
    #result = pd.DataFrame(dtype=df_type) # has to be nan to start, fill_value=0))     

    for parent_dir, subdirs, files in os.walk('.'):
        #print(parent_dir)
        #print(subdirs)
        #print('---')
        html_regex = re.compile(r"(<html>|<HTML>)(.*)(<\/html>|<\/HTML>)", re.DOTALL)
        hamdir = '_ham' in parent_dir
        spamdir = 'spam' in parent_dir
        if hamdir or spamdir:
            for file in files:
                file_index += 1
                rel_file_path = os.path.join(parent_dir, file) 
                #print(rel_file_path)
                with open(rel_file_path, 'r', encoding='iso-8859-1') as f:
                #with open(rel_file_path, 'rb') as f:
                    f_str = f.read()   
                    msg = email.message_from_string(f_str, policy=policy.default)
                    #msg = email.parser.BytesParser(policy=policy.default).parse(f)
                    if file_index % 1000 == 0:
                        print(str(file_index))
                    subject = msg['subject']
                    #print(subject)
                    #print(msg['header'])
                    #body = msg.get_body(preferencelist=('html', 'plain'))
                    #if not body:
                    #    continue
                    #body = body.get_content()
                    body = str(email_get_body(msg))
                    #print(body)
                    html_match = html_regex.search(body)
                    #print(html_match)
                    if html_match:
                        body = html_match.group(2)
                        #print('------------')
                        #print(body)
                        #print('------------')
                    body_strip = strip_tags(body)
                    #body_strip = re.sub(r"[()\"\'-]", '', body_strip)
                    body_strip = re.sub(r"\\n", ' ', body_strip)
                    #print(body_strip)
                    mail_tokens = body_strip.split()
                    if subject:
                        mail_tokens += subject.split()
                    mail_dict = {}
                    mail_rejected_tokens = []
                    for token in mail_tokens:
                        if token in mail_dict:
                            mail_dict[token] = mail_dict[token] + 1
                            too_common_words[token] = 0
                            mail_dict.pop(token)
                            vocabulary.pop(token)
                        elif token not in too_common_words:
                            mail_dict[token] = 1
                            vocabulary[token] = 0
                    mail_dict[spam_key] = 1 if 'spam' in rel_file_path else 0
                    file_paths_to_mail_dicts[rel_file_path] = mail_dict
                    #mail_df = pd.DataFrame.from_dict(mail_dict, dtype=df_type)
                    #mail_df = pd.DataFrame(data=[mail_dict], dtype=df_type)
                    #result = pd.concat([result, mail_df])
                    #print(mail_tokens)                            
                    #print(mail_dict)
                    #print(mail_rejected_tokens)
                #if file_index > 100:
                #break
    print('finished loading emails')
    
    columns = [target_label_col] + list(vocabulary.keys())
    
    '''
    # for each mail, encode
    for file_path in file_paths_to_mail_dicts:
        mail_dict = file_paths_to_mail_dicts[file_path]
        for word in mail_dict:
            n_word = mail_dict[word]
            if n_word > 255:
                #print('!!!!!!', word)
                mail_dict.remove(word)
        mail_dicts.append(mail_dict)
    '''
    #mail_dicts = []
    #for path in file_paths_to_mail_dicts:
    #    mail_dicts.append(file_paths_to_mail_dicts[path])
    #print('finished constructing rows')
    #df_type = 'Sparse[int]'#pd.SparseDtype(pd.UInt8Dtype())
    #return pd.DataFrame.from_dict(row_dict, orient='index',dtype=pd.SparseDtype(np.uint8))
    # TODO convert to zero fill sparse from nan sparse and serialize zero fill sparse
    #nan_df = pd.DataFrame(data=mail_dicts, columns=columns, dtype=pd.SparseDtype(np.uint8)) # has to be nan to start, fill_value=0))
    #return pd.DataFrame(data=nan_df, dtype=pd.SparseDtype(np.uint8, fill_value=0))
    #return pd.DataFrame(data=list(file_paths_to_mail_dicts.values()), columns=columns, dtype='Sparse[float]')
        #"    nan_df = pd.DataFrame(data=mail_dicts, columns=columns, dtype=pd.SparseDtype(np.dtype('float64'))) # has to be nan to start, fill_value=0))\n",

    #return pd.DataFrame(data=mail_dicts, columns=columns, dtype=pd.SparseDtype(np.dtype('float64')))
    return pd.DataFrame(data=file_paths_to_mail_dicts.values(), columns=columns, dtype='Sparse[int]')#pd.SparseDtype(pd.UInt8Dtype()))

In [134]:
# warning: this can take a couple/few minutes    
all_data = load_email_files_to_dataframe()
print(all_data.sparse.density)
print(all_data.info())

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
finished loading emails
0.0003544377510030051
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10751 entries, 0 to 10750
Columns: 131171 entries, is_spam to https://listman.redhat.com/
dtypes: Sparse[UInt8, <NA>](131171)
memory usage: 2.9 MB
None


In [135]:
joblib.dump(all_data, "spamham.pkl")

['spamham.pkl']

In [132]:
#all_data = None
all_data = joblib.load("spamham.pkl")
print(all_data.info())
#all_data = pd.DataFrame(data=nan_data, dtype=pd.SparseDtype(np.dtype('float64'), fill_value=0))
#nan_data = None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10751 entries, 0 to 10750
Columns: 131170 entries, b"Friend,Now to https://listman.redhat.com/
dtypes: Sparse[UInt8, <NA>](131170)
memory usage: 2.8 MB
None


In [138]:
print(target_label_col in all_data)
print(all_data.info())

True
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10751 entries, 0 to 10750
Columns: 131171 entries, is_spam to https://listman.redhat.com/
dtypes: Sparse[UInt8, <NA>](131171)
memory usage: 2.9 MB
None


In [146]:
all_data = pd.DataFrame(data=all_data, dtype='Sparse[int]')
joblib.dump(all_data, "spamham.pkl")
print(all_dense.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10751 entries, 0 to 10750
Columns: 131171 entries, is_spam to https://listman.redhat.com/
dtypes: Sparse[int64, 0](131171)
memory usage: 5.7 MB
None


In [136]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [209]:
print(all_data.sparse.density)
target_label_col = spam_key
y_all = all_data[target_label_col]
for train_indexes, test_indexes in split.split(all_data, y_all):
    strat_train_set = all_data.loc[train_indexes]
    strat_test_set = all_data.loc[test_indexes]

#print(list(strat_train_set))
X_train = strat_train_set.drop(target_label_col, axis=1)
y_train = strat_train_set[target_label_col].copy()
strat_train_set = None

# test data split from train.csv, since test.csv has no labels
X_test = strat_test_set.drop(target_label_col, axis=1)
y_test = strat_test_set[target_label_col].copy()
strat_test_set = None

0.0003544377510030051


In [210]:
print(X_train.info())
print(X_test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8600 entries, 20 to 10180
Columns: 131170 entries, b"Friend,Now to https://listman.redhat.com/
dtypes: Sparse[int64, 0](131170)
memory usage: 4.6 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2151 entries, 2881 to 4362
Columns: 131170 entries, b"Friend,Now to https://listman.redhat.com/
dtypes: Sparse[int64, 0](131170)
memory usage: 1.1 MB
None


In [None]:
import sys

def get_largest_user_globals():
    ''' Adapted from https://stackoverflow.com/a/40997868 by Abdou '''
    
    # These are the usual ipython objects, including this one you are creating
    ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

    # Get a sorted list of the objects and their sizes
    sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

## Validate

In [148]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

linsvc_clf = LinearSVC(C=.1)
mnb_clf = MultinomialNB()                       

In [74]:
def display_scores(estimator, scores):
    print("\n")
    print(type(estimator).__name__)
    #print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
        
def display_estimator_cv_scores(estimator, X, y): 
    scores = cross_val_score(estimator, X, y, cv=5)
    display_scores(estimator, scores)

def print_cv_scores(estimators, X, y):
    scores_and_estimators = []
    for e in estimators:
        scores = cross_val_score(e, X, y, cv=5)
        scores_and_estimators.append((scores.mean(), scores, e))
        
    scores_and_estimators.sort(key = lambda x: x[0], reverse=True)
    for mean, scores, e in scores_and_estimators:
        display_scores(e, scores)
 
'''
words only, no replacement
MultinomialNB
Mean: 0.9759302325581395
Standard deviation: 0.0023139242723409837

unfiltered tokens (no headers or html tags, #body_strip = re.sub(r"[()\"\'-]", '', body_strip))
MultinomialNB
Mean: 0.9853488372093022
Standard deviation: 0.002156655464068763

unfiltered tokens (no headers or html tags,
MultinomialNB
Mean: 0.9856976744186046
Standard deviation: 0.0022845212446963464
'''

estimators = [mnb_clf]#, linsvc_clf]
print_cv_scores(estimators, X_train, y_train)



MultinomialNB
Mean: 0.9856976744186046
Standard deviation: 0.0022845212446963464


## Train

In [218]:
import re
import string

class VocabSimplifier(BaseEstimator, TransformerMixin):
    
    def __init__(self, to_lower=False, strip_punctuation=False, replace_urls=True, replace_numbers=False): # no *args or **kargs
        self.to_lower = to_lower
        self.strip_punctuation = strip_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
                
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def apply_simplifications(self, word):
        if (word == spam_key):
            return word
        
        if (self.to_lower):
            word = word.lower()
            
        if (self.strip_punctuation):
            word = word.translate(str.maketrans('', '', string.punctuation))
            
        if (self.replace_urls):
            word = re.sub(r'http\S+', '_URL_', word)
            
        if (self.replace_numbers):
            word = re.sub(r'[0-9]+', '_N_', word)
        return word
            
    def transform(self, X : pd.DataFrame):
        #print("VocabSimplifier.transform in.shape:", X)
        X = X.copy()
        cols_to_drop = []
        simplified_vocab = {}
        vocab = list(X)
        n_vocab = len(vocab)
        progress_size = n_vocab/10
        for i, word in enumerate(vocab):
            if i % progress_size == 0:
                print(i, '/', n_vocab)
#            print(word)
            w = self.apply_simplifications(word)
#            print(i, w, word)
            if not w in simplified_vocab:
                simplified_vocab[w] = [word]
            else:
                #w_i = int(simplified_vocab[w])
#                print(("COLUMN DEL", w, i, word))
                simplified_vocab[w].append(word)
                cols_to_drop.append(word)

        print(n_vocab, '/', n_vocab)

        for similar_words in simplified_vocab.values():      
            if len(similar_words) > 1:
                main_col_for_w = similar_words[0]        
                #print(main_col_for_w)
                sparse_type = X.dtypes[main_col_for_w]
                '''
                print(X.dtypes[main_col_for_w])
                X[main_col_for_w] = X[main_col_for_w].sparse.to_dense()
                print(X.dtypes[main_col_for_w])
                for word in similar_words:
                    if word != main_col_for_w:
                        print(X.dtypes[main_col_for_w])
                        X[main_col_for_w] += X[word].sparse.to_dense()                        
                '''
                X[main_col_for_w] = X[similar_words].sum(axis=1).astype(sparse_type)

        #print("VocabSimplifier.transform cols_to_drop:", cols_to_drop)
        print('VocabSimplifier dropping columns')
        X.drop(columns=cols_to_drop, inplace=True)
        #print("VocabSimplifier.transform out.shape:", X.info())
        return X

hyperparameters to your preparation pipeline to control whether or not to 
- (strip off email headers)
- convert each email to lowercase, 
- remove punctuation, 
- replace all URLs with “URL,” 
- replace all numbers with “NUMBER,” 
- (or even perform stemming (i.e., trim off word endings; there are Python libraries available to do this).)

Finally, try out several classifiers and see if you can build a great spam classifier, with both high recall and high precision

In [219]:
#word_regex = re.compile(r"^[(]?((mp3|MP3)[sS]?|[a-zA-Z]+|[a-zA-Z]+[-\/]?[a-zA-Z]+|[a-zA-Z]+[-\/]?[a-zA-Z]+[']?[a-zA-Z]+)([)]?[!?]+|[!?]+[)]?|[.,;:)]?)$")

mnb_param_grid = [
    {'alpha': [0.0001, .5, 1], 'fit_prior': [True, False] }
  ]

grid_search = GridSearchCV(mnb_clf, mnb_param_grid, cv=5,
                           #return_train_score=True, 
                           verbose=2)

#col_trans.named_transformers_["cat"].handle_unknown = 'ignore' # for dropping attributes

main_pipeline = Pipeline([
    ('simple', VocabSimplifier()),
 #   ('dropper', AttributesDropper()),
    #('mnb', MultinomialNB()),
    ('grid', grid_search)
])

param_main = [
    {'simple__to_lower': [False, True]},
    {'simple__strip_punctuation': [False, True]},
    {'simple__replace_urls': [False, True]},
    {'simple__replace_numbers': [False, True]}
  ]

full_pipeline = GridSearchCV(main_pipeline, param_main, cv=5,
                           verbose=2)#, error_score='raise')

In [None]:
pipe_out = main_pipeline.fit(X_train, y_train)
print(pipe_out.best_estimator_.steps[1][1].best_params_)

0 / 131170
13117 / 131170
26234 / 131170
39351 / 131170
52468 / 131170
65585 / 131170
78702 / 131170
91819 / 131170
104936 / 131170
118053 / 131170
131170 / 131170
VocabSimplifier dropping columns
Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [None]:
pipe_out = full_pipeline.fit(X_train, y_train)
#pipe_out = main_pipeline.fit(X_train, y_train)
#print(grid_search.best_params_)
print(full_pipeline.best_params_)
print(full_pipeline.best_estimator_.steps[2][1].best_params_)

## Test

In [214]:
from sklearn.metrics import precision_score, recall_score

#X_test_tr = col_trans.fit_transform(X_test)
#print(list(X_train))
#print(list(X_test))

# forest
# 0.785314498933902
# 0.7706711343254163 drop age

# knn
# 0.7795566502463054

# sgd
# 0.777129750982962

y_test_predict = pipe_out.predict(X_test)
print(precision_score(y_test, y_test_predict)) # first priority for spam classifier
print(recall_score(y_test, y_test_predict))    # second priority
score = f1_score(y_test, y_test_predict, average="macro")
score

0 / 131170
13117 / 131170
26234 / 131170
39351 / 131170
52468 / 131170
65585 / 131170
78702 / 131170
91819 / 131170
104936 / 131170
118053 / 131170
http://inglesa.net/unsub.php?client=atomicDOT
(http://admanmail.com/subscription.asp?em=JM@NETNOTEINC.COM&l=SGO)
window.open("http://www.ouweilighting.com");
"http://www.radisson-chicago.com"
>http://www.thaiworkathome.com/unsubscribe.php
url(http://images.lockergnome.com/images/issue/top-right.gif);
\thttp://use.perl.org/my/messages/
b'http://www.rebackee.com/cursos2/contraloria.htm
Websie:http://www.wjjzzs.com\r
open("http://www.pointcom.com","_top");}
b"http://www.nme.com/news/102774.htm
\'http0:python\'
>>http://www.frogstone.net/Cafe/CafeForteana.html
[1]http://www.theperlreview.com
0.9739921976592978
0.9855263157894737


0.9842730411939666