In [1]:
!rm -fr temp && mkdir temp 
!wget -nv --directory-prefix=temp https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2
!wget -nv --directory-prefix=temp https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2
!cd temp && tar xjf 20021010_spam.tar.bz2 && tar xjf 20021010_easy_ham.tar.bz2

2020-10-10 17:51:13 URL:https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2 [1192582/1192582] -> "temp/20021010_spam.tar.bz2" [1]
2020-10-10 17:51:14 URL:https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2 [1677144/1677144] -> "temp/20021010_easy_ham.tar.bz2" [1]


In [1]:
!head temp/spam/0001.bfc8d64d12b325ff385cca8d07b84288

From 12a1mailbot1@web.de  Thu Aug 22 13:17:22 2002
Return-Path: <12a1mailbot1@web.de>
Delivered-To: zzzz@localhost.example.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.example.com (Postfix) with ESMTP id 136B943C32
	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received: from mail.webnote.net [193.120.211.219]
	by localhost with POP3 (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received: from dd_it7 ([210.97.77.167])


In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import email, pandas as pd, numpy as np
from os import listdir
from os.path import isfile, exists
import codecs, re

def read(filename):
    try:
        f = codecs.open(filename, encoding='utf-8', errors='strict')
        return f.read()        
    except UnicodeDecodeError:
        return ''                
    
def read_input_data(pairs):
    contents, labels = [], []
    for folder, label in pairs:
        for f in listdir(folder):
            c = read(folder + '/' + f)
            if c != '':
                contents.append(c)
                labels.append(label)
    result, y = pd.DataFrame(), pd.DataFrame()
    result['content'] = contents
    y['label'] = labels
    return (result, y)

class FilterBodyOnly(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def _recurse_concat(a):
        b = a.get_payload()
        if isinstance(b, list):
            result = ''
            for part in b:
                result += FilterBodyOnly._recurse_concat(part)
        else:
            result = b
        return result
    
    def _get_body(X):
        b = email.message_from_string(X)
        return FilterBodyOnly._recurse_concat(b)
                
    def transform(self, X, y=None):
        X['content'] = X['content'].apply(FilterBodyOnly._get_body)
        return X
    
class EliminateUnusablewords(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def _remove_HTML_tags(text):
        return re.sub('<[^<]+>', " ", text)
    
    def _replace_special_chars(text):
        return text.replace('\t', ' ').\
                    replace('\n', ' ').\
                    replace('.', ' ').\
                    replace(',', ' ')
        
    def transform(self, X, y=None):
        X['content'] = X['content'].apply(EliminateUnusablewords._remove_HTML_tags)
        X['content'] = X['content'].apply(EliminateUnusablewords._replace_special_chars)
        return X
    
class SplitWords(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.words_dict={}

    def _build_dictionary(self, X):
        index=0
        for row in X['content']:
            for word in row.split(' '):
                if word != '' and word not in self.words_dict:
                        self.words_dict[word] = index
                        index += 1
    
    def fit(self, X, y=None):
        self._build_dictionary(X)
        return self
            
    def transform(self, X, y=None):
        self._build_dictionary(X)
        result = np.zeros((X['content'].shape[0], (len(self.words_dict))))        
        for i, row in enumerate(X['content']):
            for word in row.split(' '):
                if word != '':
                    result[i, self.words_dict[word]] += 1
        return result

In [3]:
from sklearn.pipeline import Pipeline

X, y = read_input_data(pairs=[('temp/spam', 1), 
                              ('temp/easy_ham', 0)
                             ])

pipeline = Pipeline([
    ('filter_body', FilterBodyOnly()),
    ('eliminate_unusable_words', EliminateUnusablewords()),
    ('split_words', SplitWords()),
])

X = pipeline.fit_transform(X, y)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [6]:
sgd_clf.score(X_train, y_train)

0.9994720168954594

In [9]:
# recall & precision
from sklearn.metrics import precision_score, recall_score

print('Precision = True_Positives / (True_Positives+False_Positives) = ', end='')
print(precision_score(y_train, sgd_clf.predict(X_train)))

print('Recall = True_Positives / (True_Positives+False_Negatives) = ', end='')
print(recall_score(y_train, sgd_clf.predict(X_train)))

Precision = True_Positives / (True_Positives+False_Positives) = 1.0
Recall = True_Positives / (True_Positives+False_Negatives) = 0.996415770609319


In [10]:
print('Precision = True_Positives / (True_Positives+False_Positives) = ', end='')
print(precision_score(y_test, sgd_clf.predict(X_test)))

print('Recall = True_Positives / (True_Positives+False_Negatives) = ', end='')
print(recall_score(y_test, sgd_clf.predict(X_test)))

Precision = True_Positives / (True_Positives+False_Positives) = 0.968
Recall = True_Positives / (True_Positives+False_Negatives) = 0.8705035971223022
