# Chapter 3 &ndash; Classification; Exercise 4 &ndash; Spam classifier 

In [14]:
import numpy as np
import os
import re
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# to make this notebook's output stable across runs
np.random.seed(42)

In [15]:
import email
import email.policy
import tarfile
import requests

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = "datasets"

def fetch_data():
    if not os.path.isdir(SPAM_PATH):
        os.makedirs(SPAM_PATH)
    for filename, url in [('ham.tar.bz2', HAM_URL), ('spam.tar.bz2', SPAM_URL)]:
        r = requests.get(url)
        filename = os.path.join(SPAM_PATH, filename)
        with open(filename, 'wb') as f:
            f.write(r.content)
        with tarfile.open(filename) as tf:
            tf.extractall(path=SPAM_PATH)
            
def load_email(filename):
    with open(filename, "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)
            
def load_data():
    HAM_DIR, SPAM_DIR = [os.path.join(SPAM_PATH, dirname) for dirname in ('easy_ham', 'spam')]
    if not all([os.path.isdir(dirname) for dirname in [HAM_DIR, SPAM_DIR]]):
        print("Fetching data.")
        fetch_data()
    ham_filenames, spam_filenames = [
        [os.path.join(dirname, name) for name in sorted(os.listdir(dirname)) if len(name) > 20]
        for dirname in [HAM_DIR, SPAM_DIR]]
    return tuple([load_email(fn) for fn in fn_list] for fn_list in (ham_filenames, spam_filenames))

In [16]:
ham_emails, spam_emails = load_data()

In [17]:
from sklearn.model_selection import train_test_split

def my_train_test_split(mails):
    return train_test_split(mails, train_size=0.75, test_size=0.25, random_state=42)
ham_train, ham_test = my_train_test_split(ham_emails)
spam_train, spam_test = my_train_test_split(spam_emails)

X_train = np.array(ham_train + spam_train)
X_test = np.array(ham_test + spam_test)
y_train = np.array([0] * len(ham_train) + [1] * len(spam_train)) 
y_test = np.array([0] * len(ham_test) + [1] * len(spam_test))

np.random.seed(42)
train_shuffle_idxs = np.random.permutation(len(X_train))
test_shuffle_idxs = np.random.permutation(len(X_test))
X_train = X_train[train_shuffle_idxs]
y_train = y_train[train_shuffle_idxs]
X_test = X_test[test_shuffle_idxs]
y_test = y_test[test_shuffle_idxs]

## Explore the data

In [18]:
from collections import Counter
from pprint import pprint

def count_content_types(mails):
    types = Counter()
    for m in mails:
        ct = m.get_content_type()
        types[ct] += 1
    return types

print('Most common content_types in Ham:')
pprint(count_content_types(ham_train).most_common(5))
print('\nMost common content_types in Spam:')
pprint(count_content_types(spam_train).most_common(5))

Most common content_types in Ham:
[('text/plain', 1803),
 ('multipart/signed', 55),
 ('multipart/mixed', 7),
 ('multipart/alternative', 7),
 ('multipart/report', 2)]

Most common content_types in Spam:
[('text/plain', 158),
 ('text/html', 139),
 ('multipart/alternative', 38),
 ('multipart/mixed', 33),
 ('multipart/related', 7)]


### Show an example HTML email

In [19]:
from bs4 import BeautifulSoup
html_mails = [mail for mail in X_train if mail.get_content_subtype() == 'html']
hm = html_mails[5].get_content()
bs = BeautifulSoup(hm, 'html.parser')
print(' '.join(list(bs.stripped_strings)))

Lowest Rate Services Conferencing Made Easy Only 18 Cents Per Minute! (Including Long Distance!) No setup fees No contracts or monthly fees Call anytime, from anywhere, to anywhere Connects up to 100 Participants Simplicity in set up and administration Operator Help available 24/7 The Highest Quality Service For The Lowest Rate In The Industry! Fill out the form below to find out how you can lower your phone bill every month. Required Input Field * Name * Web 
            Address Company 
            Name * State * Business 
            Phone * Home 
            Phone Email 
            Address * Type of 
            Business To be removed from this list, send an e-mail to remove@b2b-mail.net Type the word "remove" in the subject line. .


## Prepare the data

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline

class ContentTypeFeaturesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.classes_ = ['PLAIN', 'HTML', 'SIGNED', 'OTHER']

    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        def to_class(mail):
            ct = mail.get_content_type()
            if ct == 'text/plain':
                return self.classes_[0]
            elif ct == 'text/html':
                return self.classes_[1]
            elif ct == 'multipart/signed':
                return self.classes_[2]
            else:
                return self.classes_[3]
        return np.vectorize(to_class)(x)

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.le_ = LabelEncoder()

    def fit(self, x, y=None):
        self.le_.fit(x)
        return self
    
    def transform(self, x):
        return self.le_.transform(x)
    
def twod_reshape(X):
    return X.reshape(-1, 1)

# We can not use lambda, because lambdas are not pickeable
one_to_two_reshaper = FunctionTransformer(twod_reshape, validate=False)

content_type_pipeline = make_pipeline(ContentTypeFeaturesTransformer(),
                                      CategoricalEncoder(),
                                      one_to_two_reshaper,
                                      OneHotEncoder()) 

In [21]:
def get_domain(addr):
    m = re.match('.*@(.*)', addr)
    if m:
        return m.group(1)
    return ''

class HeaderFeaturesTransformer(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        def get_features(mail):
            reply_to = mail.get('Reply-To')
            has_reply_to = 1.0 if reply_to else 0.0
            from_h = mail.get('From')
            reply_to_diff_domain = 0.0
            if has_reply_to and from_h:
                reply_to_diff_domain = 1.0 if get_domain(reply_to) != get_domain(from_h) else 0.0
            has_unsubscribe = 1.0 if mail.get('List-Unsubscribe') else 0.0
            return (has_reply_to, reply_to_diff_domain, has_unsubscribe)
        return np.vstack(np.vectorize(get_features)(x)).T

In [22]:
from scipy.sparse import dok_matrix
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def parse_html(body):
    return ' '.join(BeautifulSoup(hm, 'lxml').stripped_strings)

def get_content_with_type(mail, include_subject=True):
    subject = ''
    if include_subject:
        subject = ' ' + mail.get('subject')
        
    body = mail
    if mail.is_multipart():
        body = mail.get_body(('html', 'plain'))
        
    try:
        if body.get_content_subtype() == 'html':
            return ('html', parse_html(body.get_content()) + subject)
        elif body.get_content_subtype() == 'plain':
            return ('plain', body.get_content() + subject)
        else:
            return ('other', subject)
    except:
        return ('other', subject)
    
def remove_non_alpha(s):
    return re.sub('[^a-zA-Z]', '', s)

def is_url(s):
    return re.match('^http.?://', s)

def is_number(s):
    return re.match('^[0-9]+$', s)
    
class MailToWordCount(BaseEstimator, TransformerMixin):
    def __init__(self, top_word_count=1000, to_lower=True, parse_url=True, parse_number=True, stem=True,
                include_subject=True, per_mail_count=False, only_has_a=False, not_in_dict_col=False):
        self.top_word_count = top_word_count
        self.to_lower = to_lower
        self.parse_url = parse_url
        self.parse_number = parse_number
        self.stem = stem
        self.include_subject = include_subject
        self.per_mail_count = per_mail_count
        self.only_has_a = only_has_a
        self.not_in_dict_col = not_in_dict_col
        self.word_counter_ = Counter()
        self.mail_counter_ = Counter()

    def fit(self, X, y=None):
        for mail in X:
            ct, content = get_content_with_type(mail, self.include_subject)
            mail_words = set()
            for word in content.split():
                word = self._process_word(word)
                if len(word):
                    self.word_counter_[word] += 1
                    if word not in mail_words:
                        self.mail_counter_[word] += 1
                        mail_words.add(word)
        counter = self.mail_counter_ if self.per_mail_count else self.word_counter_
        self.top_words_dict_ = dict([(w[0], i) for i, w in enumerate(counter.most_common(self.top_word_count))])
        return self
    
    def transform(self, X):
        not_in_dict_shift = 1 if self.not_in_dict_col else 0
        feature_count = self.top_word_count + not_in_dict_shift
        features = dok_matrix((len(X), feature_count))
        for i, mail in enumerate(X):
            ct, content = get_content_with_type(mail, self.include_subject)
            for word in content.split():
                word = self._process_word(word)
                if word in self.top_words_dict_:
                    col = self.top_words_dict_[word] + not_in_dict_shift
                    if self.only_has_a:
                        features[i, col] = 1
                    else:
                        features[i, col] += 1
                else:
                    if self.only_has_a:
                        features[i, 0] = 1
                    else:
                        features[i, 0] += 1
        return features
        
    
    def _process_word(self, word):
        if self.parse_url and is_url(word):
            return '_URL'
        elif self.parse_number and is_number(word):
            return '_NUMBER'
        else:
            return self._alphaword(word)

    def _alphaword(self, word):
        word = remove_non_alpha(word)
        if self.to_lower:
            word = word.lower()
        if self.stem:
            word = stemmer.stem(word)
        return word

In [23]:
import sklearn.pipeline as pipeline

preprocess_pipeline = pipeline.FeatureUnion([
    ('content_type', content_type_pipeline),
    ('header', HeaderFeaturesTransformer()),
    ('word_count', MailToWordCount()),
])

## Train some initial models

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,  RandomizedSearchCV

grid_params = {
    'prep__word_count__stem': [False, True],
    'prep__word_count__top_word_count': [10, 100, 1000, 10000],
    'prep__word_count__parse_url': [False, True],
    'prep__word_count__parse_number': [False, True],
    'prep__word_count__per_mail_count': [False, True],
    'prep__word_count__to_lower': [False, True],
    'prep__word_count__include_subject': [False, True],
    'prep__word_count__not_in_dict_col': [False, True],
    'prep__word_count__only_has_a': [False, True],
}

lr_pipeline = pipeline.Pipeline([
    ('prep', preprocess_pipeline),
    ('lr', LogisticRegression()),
])
rs_clf = RandomizedSearchCV(lr_pipeline, n_iter=32, param_distributions=grid_params,
                            verbose=True, n_jobs=-1, random_state=42,
                            return_train_score=False)

In [25]:
rs_clf.fit(X_train[:1024], y_train[:1024])

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 10.3min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('prep', FeatureUnion(n_jobs=1,
       transformer_list=[('content_type', Pipeline(memory=None,
     steps=[('contenttypefeaturestransformer', ContentTypeFeaturesTransformer()), ('categoricalencoder', CategoricalEncoder()), ('functiontransformer', FunctionTransformer(accept_sparse=False,
    ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=32, n_jobs=-1,
          param_distributions={'prep__word_count__include_subject': [False, True], 'prep__word_count__only_has_a': [False, True], 'prep__word_count__parse_number': [False, True], 'prep__word_count__not_in_dict_col': [False, True], 'prep__word_count__parse_url': [False, True], 'prep__word_count__per_mail_count': [False, True], 'prep__word_count__top_word_count': [10, 100, 1000, 10000], 'prep__word_count__to_lower': [False, True], 'prep_

In [26]:
print('Display mean test score grouped by each parameter.')
cv_results = pd.DataFrame.from_dict(rs_clf.cv_results_)
for param in grid_params.keys():
    display(cv_results.groupby('param_' + param)['mean_test_score'].mean())

Display mean test score grouped by each parameter.


param_prep__word_count__include_subject
False    0.961487
True     0.975525
Name: mean_test_score, dtype: float64

param_prep__word_count__only_has_a
False    0.968904
True     0.967924
Name: mean_test_score, dtype: float64

param_prep__word_count__parse_number
False    0.964600
True     0.972412
Name: mean_test_score, dtype: float64

param_prep__word_count__not_in_dict_col
False    0.971625
True     0.964495
Name: mean_test_score, dtype: float64

param_prep__word_count__parse_url
False    0.969727
True     0.967671
Name: mean_test_score, dtype: float64

param_prep__word_count__per_mail_count
False    0.971875
True     0.962891
Name: mean_test_score, dtype: float64

param_prep__word_count__top_word_count
10       0.941546
100      0.972363
1000     0.980835
10000    0.975865
Name: mean_test_score, dtype: float64

param_prep__word_count__to_lower
False    0.966271
True     0.970035
Name: mean_test_score, dtype: float64

param_prep__word_count__stem
False    0.970508
True     0.966739
Name: mean_test_score, dtype: float64

Looks like `~parse_url`, `include_subject`, `~only_has_a`, `stem`, `to_lower`, `top_word_count=1000`, `not_in_dict_col` parameters are the best. Let's try grid search to analyze the other parameters.

In [28]:
lr_pipeline.set_params(
    prep__word_count__parse_url=False,
    prep__word_count__include_subject=True,
    prep__word_count__only_has_a=False,
    prep__word_count__stem=True,
    prep__word_count__to_lower=True,
    prep__word_count__top_word_count=1000,
    prep__word_count__not_in_dict_col=True,
)

grid_params = {
    'prep__word_count__parse_number': [False, True],
    'prep__word_count__per_mail_count': [False, True],
}

gs_clf = GridSearchCV(lr_pipeline, param_grid=grid_params,
                    verbose=True, n_jobs=-1, return_train_score=False)
gs_clf.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  3.7min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('prep', FeatureUnion(n_jobs=1,
       transformer_list=[('content_type', Pipeline(memory=None,
     steps=[('contenttypefeaturestransformer', ContentTypeFeaturesTransformer()), ('categoricalencoder', CategoricalEncoder()), ('functiontransformer', FunctionTransformer(accept_sparse=False,
    ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'prep__word_count__per_mail_count': [False, True], 'prep__word_count__parse_number': [False, True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=True)

In [29]:
print('Display mean test score grouped by each parameter.')
cv_results = pd.DataFrame.from_dict(gs_clf.cv_results_)
for param in grid_params.keys():
    display(cv_results.groupby('param_' + param)['mean_test_score'].mean())

Display mean test score grouped by each parameter.


param_prep__word_count__per_mail_count
False    0.986222
True     0.984667
Name: mean_test_score, dtype: float64

param_prep__word_count__parse_number
False    0.985556
True     0.985333
Name: mean_test_score, dtype: float64

So `~per_mail_count` and `parse_number`. And that gives us ~98.8% accuracy.

In [30]:
lr_pipeline.set_params(
    prep__word_count__parse_number=False,
    prep__word_count__per_mail_count=True,
)
lr_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('prep', FeatureUnion(n_jobs=1,
       transformer_list=[('content_type', Pipeline(memory=None,
     steps=[('contenttypefeaturestransformer', ContentTypeFeaturesTransformer()), ('categoricalencoder', CategoricalEncoder()), ('functiontransformer', FunctionTransformer(accept_sparse=False,
    ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

## Get the generalization accuracy

In [32]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, lr_pipeline.predict(X_test))

0.9826666666666667

98.3% accuracy on the test set. Not bad.