### Loading dataset

In [10]:
import os

# Folder names of the ham and spam corpora, resolved relative to the working directory.
HAM_DIR = "easy_ham"
SPAM_DIR = "spam_2"
# List each folder in sorted order, keeping only names longer than 20 characters.
# NOTE(review): presumably the length filter skips short non-message files that
# live alongside the emails -- confirm against the corpus contents.
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name)>20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name)>20]


In [11]:
print(len(ham_filenames), len( spam_filenames))

2551 1396


In [26]:
# python email module to parse the emails
import email
import email.policy

def load_email(directory, filename, spam_path=None):
    """Parse one raw email file into a message object.

    Parameters
    ----------
    directory : str
        Folder that contains the email file.
    filename : str
        Name of the file inside ``directory``.
    spam_path : optional
        Unused. Kept only so that existing calls passing it keep working.
        Its default used to be the undefined name ``SPAM_PATH``, which made
        this definition raise ``NameError`` on a fresh kernel.

    Returns
    -------
    The message parsed from the file's bytes with ``email.policy.default``.
    """
    # Import the submodules explicitly: a plain ``import email`` does not
    # guarantee that ``email.parser`` has been loaded.
    from email import policy
    from email.parser import BytesParser

    with open(os.path.join(directory, filename), "rb") as f:
        return BytesParser(policy=policy.default).parse(f)

In [27]:
# Parse every listed file. The directory constants are reused so the folders
# read here are always the same ones the filenames were listed from.
ham_emails = [load_email(HAM_DIR, name) for name in ham_filenames]
spam_emails = [load_email(SPAM_DIR, name) for name in spam_filenames]

In [29]:
# reading the content of email
ham_emails[1].get_content().strip()

"Martin A posted:\nTassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n Mount Athos monastic community, was ideal for the patriotic sculpture. \n \n As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n museum, a restored amphitheatre and car park for admiring crowds are\nplanned\n---------------------\nSo is this mountain limestone or granite?\nIf it's limestone, it'll weather pretty fast.\n\n------------------------ Yahoo! Groups Sponsor ---------------------~-->\n4 DVDs Free +s&p Join Now\nhttp://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n---------------------------------------------------------------------~->\n\nTo unsubscribe from this group, send an email to:\nforteana-unsubscribe@egroups.com\n\n \n\nYour use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/"

### Data Exploration

In [35]:
# looking at email structure

def get_email_structure(email):
    """Describe the MIME layout of a message as a compact string.

    A plain string is returned unchanged. A message whose payload is a list
    of sub-parts is rendered as ``multipart(<part>, <part>, ...)``, recursing
    into every sub-part. Any other message yields its content type.
    """
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    if not isinstance(parts, list):
        # Leaf part: report its content type.
        return email.get_content_type()

    described = (get_email_structure(part) for part in parts)
    return "multipart({})".format(", ".join(described))

In [36]:
from collections import Counter

def structures_counter(emails):
    """Tally how many of the given emails share each structure string.

    Returns a ``Counter`` keyed by the value of ``get_email_structure`` for
    each message.
    """
    return Counter(get_email_structure(message) for message in emails)

In [37]:
structures_counter(ham_emails).most_common()

[('text/plain', 2453),
 ('multipart(text/plain, application/pgp-signature)', 72),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [38]:
structures_counter(spam_emails).most_common()

# spam contains far more HTML parts than ham, which is almost entirely plain text

[('text/plain', 597),
 ('text/html', 589),
 ('multipart(text/plain, text/html)', 114),
 ('multipart(text/html)', 29),
 ('multipart(text/plain)', 25),
 ('multipart(multipart(text/html))', 18),
 ('multipart(multipart(text/plain, text/html))', 5),
 ('multipart(text/plain, application/octet-stream, text/plain)', 3),
 ('multipart(text/html, text/plain)', 2),
 ('multipart(text/html, image/jpeg)', 2),
 ('multipart(multipart(text/plain), application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/jpeg)',
  1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/gif)',
  1),
 ('text/plain charset=us-ascii', 1),
 ('multipart(multipart(text/html), image/gif)', 1),
 ('multipart(multipart(text/plain, text/html), application/octet-stream, application/octet-stream, applic

In [39]:
# looking at email headers

for header, value in spam_emails[0].items():
    print(header, ":", value)

Return-Path : <ilug-admin@linux.ie>
Delivered-To : yyyy@localhost.netnoteinc.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9E1F5441DD	for <jm@localhost>; Tue,  6 Aug 2002 06:48:09 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for jm@localhost (single-drop); Tue, 06 Aug 2002 11:48:09 +0100 (IST)
Received : from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72LqWv13294 for    <jm-ilug@jmason.org>; Fri, 2 Aug 2002 22:52:32 +0100
Received : from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id WAA31224; Fri, 2 Aug 2002 22:50:17 +0100
Received : from bettyjagessar.com (w142.z064000057.nyc-ny.dsl.cnc.net    [64.0.57.142]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id WAA31201 for    <ilug@linux.ie>; Fri, 2 Aug 2002 22:50:11 +0100
Received : from 64.0.57.142 [202.63.165.34] by bettyjagessa

In [41]:
# a lot of the headers look fishy,
# but we can limit ourselves to the subject

spam_emails[0]["Subject"]

'[ILUG] STOP THE MLM INSANITY'

### splitting to train and test

In [43]:
import numpy as np
from sklearn.model_selection import train_test_split

# Store the messages in a 1-D object array, filling one slot at a time.
# Email message objects are sized and iterable (over their headers), so letting
# np.array() infer the layout can make NumPy try to unpack them as nested
# sequences instead of keeping one message per element.
all_emails = ham_emails + spam_emails
X = np.empty(len(all_emails), dtype=object)
for i, message in enumerate(all_emails):
    X[i] = message
# Labels in the same order as X: 0 for ham, 1 for spam.
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

# Hold out 20% of the emails for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

### preprocessing

In [74]:
# using regex to parse html
# we could have used Beautiful Soup, but we stick to plain regexes here

import re
from html import unescape

def html_to_plain_text(html):
    """Convert an HTML document to rough plain text using regular expressions.

    The steps run in order: drop the ``<head>`` section, drop ``<style>``
    blocks, replace every opening ``<a ...>`` tag with the token
    `` HYPERLINK ``, strip all remaining tags, collapse runs of
    whitespace-terminated newlines into one newline, collapse tab runs into a
    single space, and finally decode HTML entities.

    Parameters
    ----------
    html : str
        The HTML source.

    Returns
    -------
    str
        The extracted text.
    """
    # remove head section
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.I | re.S)

    # remove style blocks; this must continue from `text` (not `html`),
    # otherwise the head removal above is thrown away
    text = re.sub(r'<style.*?>.*?</style>', '', text, flags=re.M | re.I | re.S)

    # replace opening anchor tags with a HYPERLINK marker
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)

    # remove all remaining html tags
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)

    # replace multiple new lines with a single newline
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)

    # replace runs of tabs with a single space
    text = re.sub(r'(\s*\t)+', ' ', text, flags=re.M | re.S)

    # decode entities such as &amp; last, so decoded "<" / ">" are not
    # mistaken for tags by the substitutions above
    return unescape(text)

In [61]:
# important line:
# the boolean mask X_train[y_train==1] selects the spam emails, and the
# `if` clause keeps only those whose structure is exactly "text/html"

html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]


In [76]:
sample_html_spam = html_spam_emails[8]
print(sample_html_spam.get_content().strip()[:1000], "...")

<html><body onload="window.open('http://202.101.163.34:81/ultimatehgh_run/')" bgColor="#CCFF66" topmargin=1 onMouseOver="window.status=''; return true" oncontextmenu="return false" ondragstart="return false" onselectstart="return false">
<div align="center">Hello, jm@netnoteinc.com<BR><BR></div><div align="center"></div><p align="center"><b><font face="Arial" size="4">Human Growth Hormone Therapy</font></b></p>
<p align="center"><b><font face="Arial" size="4">Lose weight while building lean muscle mass<br>and reversing the ravages of aging all at once.</font><font face="Arial" size="3"><br>
</font></b><font face="Arial" size="3"> <br>
As seen on NBC, CBS, and CNN, and even Oprah! The health<br>
discovery that actually reverses aging while burning fat,<br>
without dieting or exercise! This proven discovery has even<br>
been reported on by the New England Journal of Medicine.<br>
Forget aging and dieting forever! And it's Guaranteed!</font></p>
<center><table width="481"><tr><td height="

In [77]:
# after applying our function to create plaintext
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


Hello, jm@netnoteinc.comHuman Growth Hormone Therapy
Lose weight while building lean muscle massand reversing the ravages of aging all at once.
As seen on NBC, CBS, and CNN, and even Oprah! The health
discovery that actually reverses aging while burning fat,
without dieting or exercise! This proven discovery has even
been reported on by the New England Journal of Medicine.
Forget aging and dieting forever! And it's Guaranteed!
Lose WeightBuild Muscle ToneReverse Aging
Increased LibidoDuration Of Penile ErectionHealthier Bones
Improved MemoryImproved skinNew Hair GrowthWrinkle Disappearance
 HYPERLINK Visit
  Our Web Site and Learn The Facts : Click Here
   HYPERLINK OR
  Here
  You are receiving this email as a subscriber
  to the Opt-In America Mailing List.
To remove yourself from all related maillists,just  HYPERLINK Click Here
 ...


In [78]:
# a function to return an email's content as plain text,
# regardless of its content type

def email_to_text(email):
    """Return the textual content of a message as plain text.

    The message parts are visited in ``walk()`` order and only ``text/plain``
    and ``text/html`` parts are considered. The first ``text/plain`` part is
    returned as-is. If there is none, the last ``text/html`` part seen is
    converted with ``html_to_plain_text``.

    Returns
    -------
    str or None
        The text, or ``None`` when the message has no usable text part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Best-effort fallback to the raw payload when the content cannot
            # be decoded. Catching Exception instead of using a bare except
            # lets KeyboardInterrupt / SystemExit propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [79]:
print(email_to_text(sample_html_spam)[:100], "...")


Hello, jm@netnoteinc.comHuman Growth Hormone Therapy
Lose weight while building lean muscle massand ...


### stemming

In [80]:
import nltk

stemmer = nltk.PorterStemmer()
for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive" ):
    print(word, "=>", stemmer.stem(word))

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [81]:
#!pip install urlextract

Collecting urlextract
  Downloading https://files.pythonhosted.org/packages/06/db/23b47f32d990dea1d9852ace16d551a0003bdfc8be33094cfd208757466e/urlextract-0.14.0-py3-none-any.whl
Collecting appdirs (from urlextract)
  Downloading https://files.pythonhosted.org/packages/56/eb/810e700ed1349edde4cbdc1b2a21e28cdf115f9faf263f6bbf8447c1abf3/appdirs-1.4.3-py2.py3-none-any.whl
Collecting uritools (from urlextract)
  Downloading https://files.pythonhosted.org/packages/eb/1a/5995c0a000ef116111b9af9303349ba97ec2446d2c9a79d2df028a3e3b19/uritools-3.0.0-py3-none-any.whl
Installing collected packages: appdirs, uritools, urlextract
Successfully installed appdirs-1.4.3 uritools-3.0.0 urlextract-0.14.0


In [83]:
# need to replace URLs with "URL" word
import urlextract

url_extractor = urlextract.URLExtract()
print(url_extractor.find_urls("Will it detect github.com/sas/sa ?"))

['github.com/sas/sa']


### Word transformer

In [93]:
# putting it all into a transformer for converting emails
# to word counter

from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn each email into a ``Counter`` of its (optionally stemmed) words.

    Parameters
    ----------
    strip_headers : bool
        Stored on the instance but not used by ``transform``.
    lower_case : bool
        Lower-case the text before any other processing.
    remove_punctuation : bool
        Replace every run of non-word characters with a single space.
    replace_urls : bool
        Replace each URL found by the module-level ``url_extractor`` with
        `` URL ``.
    replace_numbers : bool
        Replace each number with the token ``NUMBER``.
    stemming : bool
        Merge word counts by the stem produced by the module-level ``stemmer``.
    """

    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an array holding one word ``Counter`` per email in ``X``."""
        X_transformed = []
        for email in X:
            # email_to_text returns None when a message has no text part
            text = email_to_text(email) or ""

            if self.lower_case:
                text = text.lower()

            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                # Replace the longest URLs first so a URL that is a prefix of
                # another one does not break the longer match.
                urls.sort(key=len, reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")

            if self.replace_numbers:
                # Integer part, optional fraction, optional exponent. The
                # previous pattern only accepted a fraction when an exponent
                # followed, so "3.14" was split into two NUMBER tokens.
                text = re.sub(r'\d+(?:\.\d+)?(?:[eE][+-]?\d+)?', 'NUMBER', text)

            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)

            word_counts = Counter(text.split())

            if self.stemming and stemmer is not None:
                # Merge the counts of words that share a stem.
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word_counts[stemmer.stem(word)] += count
                word_counts = stemmed_word_counts

            X_transformed.append(word_counts)

        return np.array(X_transformed)

In [94]:
# trying out the transformer
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

[<email.message.EmailMessage object at 0x000001704514EFC8>
 <email.message.EmailMessage object at 0x000001704489E408>
 <email.message.EmailMessage object at 0x0000017044FABE48>]


array([Counter({'to': 8, 'number': 8, 'you': 7, 'pleas': 7, 'servic': 5, 'the': 5, 'and': 5, 'if': 4, 'collect': 3, 'for': 3, 'we': 3, 'a': 3, 'your': 3, 'thi': 3, 'of': 3, 'in': 3, 'more': 3, 'inform': 3, 'fax': 3, 'telephon': 3, 'debt': 2, 'provid': 2, 'busi': 2, 'charg': 2, 'account': 2, 'are': 2, 'not': 2, 'us': 2, 'can': 2, 'repli': 2, 'remov': 2, 'mail': 2, 'email': 2, 'call': 2, 'time': 2, 'at': 2, 'profession': 1, 'effect': 1, 'avail': 1, 'last': 1, 'seventeen': 1, 'year': 1, 'nation': 1, 'credit': 1, 'system': 1, 'inc': 1, 'ha': 1, 'been': 1, 'top': 1, 'flight': 1, 'over': 1, 'institut': 1, 'healthcar': 1, 'onli': 1, 'low': 1, 'flat': 1, 'fee': 1, 'less': 1, 'than': 1, 'per': 1, 'all': 1, 'proce': 1, 'forward': 1, 'directli': 1, 'agenc': 1, 'wish': 1, 'will': 1, 'report': 1, 'unpaid': 1, 'experian': 1, 'formerli': 1, 'trw': 1, 'transunion': 1, 'equifax': 1, 'there': 1, 'is': 1, 'no': 1, 'import': 1, 'let': 1, 'know': 1, 'be': 1, 'simpli': 1, 'debt_collector': 1, 'btamail': 1, 

### word to vector

In [112]:
print(Counter())
#print(word_count)

Counter()


In [99]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert per-email word counters into a sparse count matrix.

    Column 0 accumulates the counts of words outside the learned vocabulary.
    Columns 1..vocabulary_size hold the counts of the vocabulary words.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Learn the ``vocabulary_size`` most common words across ``X``.

        Each email contributes at most 10 to a word's total, so a single
        message repeating a word many times cannot dominate the ranking.
        Sets ``most_common_`` (list of ``(word, total)`` pairs) and
        ``vocabulary_`` (word -> column index, starting at 1).
        """
        totals = Counter()
        for counts in X:
            totals.update({token: min(n, 10) for token, n in counts.items()})

        top_words = totals.most_common(self.vocabulary_size)
        self.most_common_ = top_words
        self.vocabulary_ = {
            token: column for column, (token, _) in enumerate(top_words, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Build a ``len(X) x (vocabulary_size + 1)`` CSR matrix of word counts."""
        # One (row, column, count) triple per distinct word of every email;
        # unknown words map to column 0 and are summed there by csr_matrix.
        entries = [
            (row, self.vocabulary_.get(token, 0), n)
            for row, counts in enumerate(X)
            for token, n in counts.items()
        ]
        rows = [row for row, _, _ in entries]
        cols = [col for _, col, _ in entries]
        data = [n for _, _, n in entries]
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [101]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
# calls fit then transform

X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.int32'>'
	with 18 stored elements in Compressed Sparse Row format>

In [102]:
X_few_vectors.toarray()

# column 0 counts words outside the vocabulary: 133 such occurrences in the second email
# the 8 in the first row, column 1 means the first vocabulary word ('number') is used 8 times in the first email

array([[161,   8,   5,   0,   0,   8,   7,   5,   7,   0,   3],
       [133,   4,   5,  15,  13,   1,   1,   2,   0,   0,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   7,   0]],
      dtype=int32)

In [103]:
vocab_transformer.vocabulary_

{'number': 1,
 'the': 2,
 'log': 3,
 'nonspam': 4,
 'to': 5,
 'you': 6,
 'and': 7,
 'pleas': 8,
 'hyperlink': 9,
 'of': 10}

In [110]:
vocab_transformer.most_common_

[('number', 12),
 ('the', 10),
 ('log', 10),
 ('nonspam', 10),
 ('to', 9),
 ('you', 8),
 ('and', 7),
 ('pleas', 7),
 ('hyperlink', 7),
 ('of', 6)]

### Pipeline and transformation

In [104]:
from sklearn.pipeline import Pipeline

# Chain the two custom transformers: emails -> word counters -> sparse count vectors.
# Both steps use their default parameters (vocabulary_size=1000 for the second one).
preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer())
])

# Fit on the training emails only, so the vocabulary is learned from the training set.
X_train_transformed = preprocess_pipeline.fit_transform(X_train)

### Logistic regression

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of a logistic regression on the transformed training set;
# the cell displays the mean of the three fold scores.
log_clf = LogisticRegression(solver="liblinear", random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3 )
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  ................................................................
[CV] .................................... , score=0.980, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.988, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.981, total=   0.1s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s finished


0.982893935108757

### precision and recall

In [107]:
from sklearn.metrics import precision_score, recall_score

# Reuse the already-fitted preprocessing pipeline: transform only, no refit on the test set.
X_test_transformed = preprocess_pipeline.transform(X_test)

# Train a fresh classifier on the whole transformed training set.
log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

# Compare the held-out predictions with the true labels (y uses 1 for spam, 0 for ham).
print("Precision: ", precision_score(y_test, y_pred))
print("recall: ", recall_score(y_test, y_pred))

Precision:  0.9808429118773946
recall:  0.9624060150375939
