# PLAN

- [x] Follow original code to the T.
- [ ] Use Beautiful Soup
- [ ] Use Word2Vec
- [x] Implement GridSearch
- [x] Use other classifiers
- [ ] TODO: fix array/sequence error

# ENVIRONMENT

In [1]:
# Based on the lessons on "Hands-On Machine Learning with Scikit-Learn & Tensorflow" by Aurelien Geron
# http://github.com/ageron/handson-ml

import os
import tarfile
from six.moves import urllib

import email
import email.policy

from collections import Counter

import numpy as np
from sklearn.model_selection import train_test_split

import re
from html import unescape

from sklearn.base import BaseEstimator, TransformerMixin

from scipy.sparse import csr_matrix

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_hard_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

# ACQUISITION

In [2]:
def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()

fetch_spam_data()

In [3]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [4]:
len(ham_filenames)

2500

In [5]:
len(spam_filenames)

500

In [6]:
def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [7]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

# WRANGLING

In [8]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [9]:
print(spam_emails[6].get_content().strip())

Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are looking for individuals who
want to work from home.

This is an opportunity to make an excellent income.  No experience
is required.  We will train you.

So if you are looking to be employed from home with a career that has
vast opportunities, then go:

http://www.basetel.com/wealthnow

We are looking for energetic and self motivated people.  If that is you
than click on the link and fill out the form, and one of our
employement specialist will contact you.

To be removed from our link simple go to:

http://www.basetel.com/remove.html


4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40


In [10]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [11]:
def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [12]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [13]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [14]:
for header, value in spam_emails[0].items():
    print(header,":",value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [15]:
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

In [16]:
X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=493)

In [17]:
def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [18]:
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">

<html>
<head>
<title>www.low-interest-rates.net</title>
<style>
<!--
td {
	font-family: Verdana, Tahoma, sans-serif;
	font-size: 12px;
	color: black;
}

text {
	font-family: Verdana, Tahoma, sans-serif;
	font-size: 11px;
	color: black;
}

text2 {
	font-family: Verdana, Tahoma, sans-serif;
	font-size: 12px;
	color: black;
}

header {
	font-family: Trebuchet MS, Verdana, Tahoma, sans-serif;
	font-size: 17px;
	color: white;
}

title {
	font-family: Trebuchet MS, Verdana, Tahoma, sans-serif;
	font-size: 20px;
	color: #D6CFAB;
	font-weight: bold;
}

h4 {
	color: #B5C943;
	font-family: Tahoma, sans-serif;
}

footer {
	font-family: Verdana, Tahoma, sans-serif;
	font-size: 10px;
	color: white;
	font-weight: bold;
}

a.footer {color: white; text-decoration: none;}
a.footer:visited {color: white; text-decoration: none;}
a.footer:hover {color: white; text-decoration: underline;}

button {
	font-size: 11px;
	border-width: 0;
	background-color: #47

In [19]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


      quotepoolmortgage™
We're different and it's a difference that can save you time and money.
We can help you:
	•Refinance to get a lower rate
	•Consolidate your high-interest debt
	•Lower your monthly payments
	•Get extra cash for a vacation
	•Pay for tuition
	•Start home improvements
	•Purchase a New Home
We understand that there are a lot of decisions to make when securing a mortgage loan or refinancing your current loan.  That's why we have mortgage EXPERTS with years of EXPERIENCE to help you make the right decisions.
Credit doesn't have to be an issue either.  Whether your credit's perfect or less than perfect, we can help you find the best deal on your home loan. Our quick, FREE, easy form will put you in contact with the top brokers in the business!
DON'T Miss The Opportunity to SAVE!
Now is the perfect time to secure your mortgage loan.  With our country in a recession, the federal government keeps LOWERING INTEREST RATES to help stimulate the economy.  It is a win-win sit

In [20]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [21]:
print(email_to_text(sample_html_spam)[:100], "...")


      quotepoolmortgage™
We're different and it's a difference that can save you time and money.
We ...


In [22]:
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [23]:
try:
    import urlextract # may require an Internet connection to download root domain names
    
    url_extractor = urlextract.URLExtract()
    print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [24]:
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [25]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'the': 4, 'have': 4, 'number': 3, 'more': 3, 'will': 3, 'if': 2, 'then': 2, 'folk': 2, 'that': 2, 'they': 2, 'still': 2, 'to': 2, 'like': 2, 'be': 2, 'pretti': 2, 'it': 2, 'of': 2, 'on': 1, 'mon': 1, 'sep': 1, 'tom': 1, 'wrote': 1, 'set': 1, 'pass': 1, 'around': 1, 'enough': 1, 'peopl': 1, 'these': 1, 'work': 1, 'them': 1, 'now': 1, 'while': 1, 'are': 1, 'legal': 1, 'left': 1, 'behind': 1, 'in': 1, 'possibl': 1, 'probabal': 1, 'copyright': 1, 'chillout': 1, 'and': 1, 'doesnt': 1, 'happen': 1, 'than': 1, 'not': 1, 'we': 1, 'get': 1, 'blacknet': 1, 'guerilla': 1, 'pnumberp': 1, 'soon': 1, 'packag': 1, 'into': 1, 'wormcod': 1, 'with': 1, 'an': 1, 'initi': 1, 'userbas': 1, 'a': 1, 'few': 1, 'k': 1, 'mnode': 1, 'give': 1, 'you': 1, 'bulletproof': 1, 'plausibl': 1, 'deniabl': 1, 'for': 1, 'use': 1, 'all': 1, 'manner': 1, 'shade': 1}),
       Counter({'number': 16, 'thi': 9, 'url': 9, 'perl': 8, 'the': 6, 'to': 6, 'on': 5, 'from': 5, 'septemb': 4, 'use': 3, 'week': 3, 'review'

In [26]:
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [27]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 31 stored elements in Compressed Sparse Row format>

In [28]:
X_few_vectors.toarray()

array([[ 79,   3,   4,   2,   1,   1,   2,   0,   1,   0,   4],
       [168,  16,   6,   6,   3,   3,   1,   9,   1,   9,   1],
       [293,   9,  16,  10,  12,   9,  10,   4,  11,   3,   6]],
      dtype=int64)

In [29]:
vocab_transformer.vocabulary_

{'number': 1,
 'the': 2,
 'to': 3,
 'you': 4,
 'and': 5,
 'of': 6,
 'url': 7,
 'a': 8,
 'thi': 9,
 'have': 10}

In [30]:
preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

# MODEL

## Logistic Regression

In [31]:
log_clf = LogisticRegression(solver="liblinear", random_state=493)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  ................................................................
[CV] .................................... , score=0.980, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.988, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.991, total=   0.1s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s finished


0.9862546725333425

In [32]:
# param_grid = [{'solver': ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],}]

# log_clf = LogisticRegression()
# grid_search = GridSearchCV(log_clf, param_grid)
# grid_search.fit(X_train,y_train)

In [33]:
# grid_search.best_params_

In [34]:
# grid_search.best_score_

In [35]:
# y_pred = grid_search.predict(X_test)
# accuracy_score(y_test, y_pred)

## Voting Classifier

In [36]:
# log_clf = LogisticRegression()
# rnd_clf = RandomForestClassifier()
# svm_clf = SVC()

In [37]:
# voting_clf = VotingClassifier(
#     estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
#     voting='hard'
#     )
# voting_clf.fit(X_train,y_train)

### TEST

In [38]:
X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="newton-cg", random_state=493)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 98.95%
Recall: 94.95%
