In [3]:
#Fetch data
import os
import tarfile
import urllib

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()

In [4]:
fetch_spam_data()

In [5]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [6]:
len(ham_filenames)

2500

In [7]:
len(spam_filenames)

500

In [8]:
#Using email module to parse emails
import email
import email.policy

def load_email(is_spam, filename, spam_path = SPAM_PATH):
    direc = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, direc, filename), "rb") as f:
        return email.parser.BytesParser(policy = email.policy.default).parse(f)

In [11]:
ham_emails = [load_email(is_spam = False, filename = name) for name in ham_filenames]
spam_emails = [load_email(is_spam = True, filename = name) for name in spam_filenames]

Return-Path: <exmh-workers-admin@spamassassin.taint.org>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36
	for <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)
Received: from listman.spamassassin.taint.org (listman.spamassassin.taint.org
 [66.187.233.211]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id
 g7MBYrZ04811 for    <zzzz-exmh@spamassassin.taint.org>; Thu, 22 Aug 2002
 12:34:53 +0100
Received: from listman.spamassassin.taint.org (localhost.localdomain
 [127.0.0.1]) by    listman.redhat.com (Postfix) with ESMTP id 8386540858;
 Thu, 22 Aug 2002    07:35:02 -0400 (EDT)
Delivered-To: exmh-workers@listman.spamassassin.taint.org
Received: from int-mx1.corp.spamassassin.taint.org
 (int-mx1.corp.spamassassin.taint.or

In [21]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [45]:
def get_email_structure(email):
    #return email if it is 
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join(
            [get_email_structure(sub_email) for sub_email in payload]))
    else: return email.get_content_type()

In [46]:
#Counter of each type of email
from collections import Counter
def structures_counter(emails):
    structures = Counter() #create a counter dictionary
    for email in emails:
        structure = get_email_structure(email) #get the stucture of the email
        structures[structure] += 1 #increment counter
    return structures

In [49]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [62]:
#Email headers
for header, value in spam_emails[0].items():
    print(header,":", value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [76]:
spam_emails[0]["Delivered-To"].split('.')

['zzzz@localhost', 'spamassassin', 'taint', 'org']

In [78]:
#Split out training and test sets
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1]*len(spam_emails))
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)

In [116]:
#Preprocess the data
#Get rid of HTML using BeautifulSoup instead of custom transformer
from bs4 import BeautifulSoup
html_spam_emails = [email for email in X_train[y_train == 1]
                   if get_email_structure(email) == "text/html"]
sample_html_email = html_spam_emails[7]
#print(sample_html_email.get_content())

soup = BeautifulSoup(sample_html_email.get_content(), 'lxml')
print(soup.text)





















ÿFFFFA9 
      Copyright 2002 - All rights reservedIf you would no longer like us 
      to contact you or feel that you havereceived this email in error, 
      please click here to 
      unsubscribe.
 



In [117]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: #if there's encoding issues
            content = str(part.get_payload())
        if ctype == 'test/plain':
            return content
        else:
            html = content
    if html:
        new_html = BeautifulSoup(html, 'lxml')
        return new_html.text

In [118]:
print(email_to_text(sample_html_email))





















ÿFFFFA9 
      Copyright 2002 - All rights reservedIf you would no longer like us 
      to contact you or feel that you havereceived this email in error, 
      please click here to 
      unsubscribe.
 



In [123]:
#Replace URLs with the word 'URL'
import urlextract
url_extractor = urlextract.URLExtract()
print(url_extractor.find_urls("check check https://google.com and yo.io"))

['https://google.com', 'yo.io']


In [136]:
#Custom transformer to convert emails into word counters
from sklearn.base import BaseEstimator, TransformerMixin
import re
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers = True, lower_case = True, remove_punctuation = True,
                replace_urls = True, replace_numbers = True, stemming = True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    
    def fit(self, X, y = None): return self
    def transform(self, X, y = None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key = lambda url: len(url), reverse = True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)


In [137]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'url': 2, 'for': 2, 'url:': 1, 'date:': 1, 'number-number-numbertnumber:number:number-numb': 1, '...': 1, 'not': 1, 'updating:': 1, "i'm": 1, 'do': 1, 'the': 1, 'guestblog': 1, 'at': 1, 'bo': 1, 'boing[number].': 1, 'now': 1, 'to': 1, 'find': 1, 'an': 1, 'excus': 1, 'miss': 1, 'last': 1, 'week.': 1, '[number]': 1}),
       Counter({'>': 16, 'a': 3, '"i': 2, 'not': 2, 'have': 2, 'that': 2, 'from:': 2, 'fork': 2, '-----': 2, 'onc': 2, 'said': 2, 'did': 1, 'sex': 1, 'with': 1, 'woman."': 1, '-----origin': 1, 'message-----': 1, 'fork-admin@xent.com': 1, '[mailto:fork-admin@xent.com]': 1, 'on': 1, 'behalf': 1, 'of': 1, 'mr.': 1, 'sent:': 1, 'monday,': 1, 'septemb': 1, 'number,': 1, 'number': 1, 'number:numb': 1, 'pm': 1, 'to:': 1, 'subject:': 1, 're:': 1, 'goodby': 1, 'global': 1, 'warm': 1, 'origin': 1, 'messag': 1, '"john': 1, 'hall"': 1, 'green': 1, 'if': 1, 'the': 1, 'spot': 1, 'owl': 1, "hadn't": 1, 'exist': 1, 'they': 1, 'would': 1, 'had': 1, 'to': 1, 'invent': 1, 'it.

In [129]:
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [141]:
#Vectorize the word count into a sparse matrix 
from scipy.sparse import csr_matrix

class VectorizeWordCount(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size = 10000):
        self.vocabulary_size = vocabulary_size
    
    #Fit method will build the vocab list by most frequent
    def fit(self, X, y = None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        
        return self
        
        
    #Transform method will convert counts to vectors
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [145]:
vocab_transformed = VectorizeWordCount(vocabulary_size=10)
X_few_vectors = vocab_transformed.fit_transform(X_few_wordcounts)
X_few_vectors.toarray()

array([[21,  0,  1,  2,  0,  0,  0,  1,  1,  0,  0],
       [72,  0,  4,  3,  4,  3,  3,  0,  1,  3,  3],
       [59, 16,  1,  0,  1,  1,  1,  2,  1,  0,  0]], dtype=int64)

In [146]:
vocab_transformed.vocabulary_

{'>': 1,
 'to': 2,
 'url': 3,
 'on': 4,
 'septemb': 5,
 'messag': 6,
 'not': 7,
 'the': 8,
 'use': 9,
 'thi': 10}

In [147]:
#Create pipeline
from sklearn.pipeline import Pipeline
preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", VectorizeWordCount()),
])
X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [148]:
#Train on logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver = "lbfgs", random_state = 0)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv = 3, verbose = 3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] .................................... , score=0.981, total=   0.3s
[CV]  ................................................................
[CV] .................................... , score=0.988, total=   0.2s
[CV]  ................................................................
[CV] .................................... , score=0.988, total=   0.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.7s finished


0.9854166666666666

In [153]:
#check precision and recall
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)
log_clf = LogisticRegression(solver = "lbfgs", random_state = 0)
log_clf.fit(X_train_transformed, y_train)

#fit the test set
y_pred = log_clf.predict(X_test_transformed)
print(y_pred.score())

#Precision and Recall
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))





" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


AttributeError: 'numpy.ndarray' object has no attribute 'score'