### Loading dataset

In [10]:
import os

# Dataset directories, relative to the notebook's working directory.
HAM_DIR = "easy_ham"
SPAM_DIR = "spam_2"

# Only directory entries whose names are longer than this many characters are
# kept; shorter entries are skipped (presumably non-message helper files —
# verify against the dataset).
MIN_EMAIL_FILENAME_LENGTH = 20


def list_email_filenames(directory, min_length=MIN_EMAIL_FILENAME_LENGTH):
    """Return the sorted entries of `directory` whose names exceed `min_length` characters."""
    return [name for name in sorted(os.listdir(directory)) if len(name) > min_length]


ham_filenames = list_email_filenames(HAM_DIR)
spam_filenames = list_email_filenames(SPAM_DIR)


In [11]:
# Sanity check: number of ham and spam file names that were kept
print(len(ham_filenames), len( spam_filenames))

2551 1396


In [26]:
# Python's email package parses the raw message files
import email
import email.parser  # imported explicitly: load_email uses email.parser.BytesParser
import email.policy

def load_email(directory, filename, spam_path=None):
    """Parse one raw email file into a message object.

    Parameters
    ----------
    directory : str
        Folder that contains the message file (e.g. ``"easy_ham"``).
    filename : str
        Name of the message file inside `directory`.
    spam_path : optional
        Unused. Kept only so existing calls that pass it keep working; the
        file is always resolved as ``directory/filename``. The default is
        ``None`` so that defining this function does not depend on a
        module-level constant being present.

    Returns
    -------
    The message parsed by ``email.parser.BytesParser`` with
    ``email.policy.default``.
    """
    # Open in binary mode: BytesParser works on bytes and handles the
    # decoding of headers and body itself.
    with open(os.path.join(directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [27]:
# Parse every message up front. Reuse the directory constants that produced
# the filename lists so listing and loading always point at the same folders.
ham_emails = [load_email(HAM_DIR, name) for name in ham_filenames]
spam_emails = [load_email(SPAM_DIR, name) for name in spam_filenames]

In [29]:
# Show the body of one ham message, with surrounding whitespace stripped
ham_emails[1].get_content().strip()

"Martin A posted:\nTassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n Mount Athos monastic community, was ideal for the patriotic sculpture. \n \n As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n museum, a restored amphitheatre and car park for admiring crowds are\nplanned\n---------------------\nSo is this mountain limestone or granite?\nIf it's limestone, it'll weather pretty fast.\n\n------------------------ Yahoo! Groups Sponsor ---------------------~-->\n4 DVDs Free +s&p Join Now\nhttp://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n---------------------------------------------------------------------~->\n\nTo unsubscribe from this group, send an email to:\nforteana-unsubscribe@egroups.com\n\n \n\nYour use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/"

### Data Exploration

In [35]:
# looking at email structure

def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain string is returned unchanged. A message whose payload is a list
    of sub-parts is rendered as ``multipart(<part>, <part>, ...)``, recursing
    into every sub-part. Any other message is described by its content type.
    """
    # Strings pass straight through
    if isinstance(email, str):
        return email

    parts = email.get_payload()

    # Leaf message: payload is not a list of sub-parts
    if not isinstance(parts, list):
        return email.get_content_type()

    # Container: describe each sub-part recursively and join them
    described = map(get_email_structure, parts)
    return "multipart({})".format(", ".join(described))

In [36]:
from collections import Counter

def structures_counter(emails):
    """Tally how many of `emails` share each structure string.

    Returns a `Counter` keyed by the value `get_email_structure` produces
    for each message.
    """
    return Counter(get_email_structure(message) for message in emails)

In [37]:
# Structure frequencies for the ham messages, most common first
structures_counter(ham_emails).most_common()

[('text/plain', 2453),
 ('multipart(text/plain, application/pgp-signature)', 72),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [38]:
structures_counter(spam_emails).most_common()

# Spam contains far more HTML than ham: text/html is almost as common as
# text/plain here, whereas ham is almost entirely text/plain.

[('text/plain', 597),
 ('text/html', 589),
 ('multipart(text/plain, text/html)', 114),
 ('multipart(text/html)', 29),
 ('multipart(text/plain)', 25),
 ('multipart(multipart(text/html))', 18),
 ('multipart(multipart(text/plain, text/html))', 5),
 ('multipart(text/plain, application/octet-stream, text/plain)', 3),
 ('multipart(text/html, text/plain)', 2),
 ('multipart(text/html, image/jpeg)', 2),
 ('multipart(multipart(text/plain), application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/jpeg)',
  1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/gif)',
  1),
 ('text/plain charset=us-ascii', 1),
 ('multipart(multipart(text/html), image/gif)', 1),
 ('multipart(multipart(text/plain, text/html), application/octet-stream, application/octet-stream, applic

In [39]:
# looking at email headers

# Print every header name/value pair of the first spam message
for header, value in spam_emails[0].items():
    print(header, ":", value)

Return-Path : <ilug-admin@linux.ie>
Delivered-To : yyyy@localhost.netnoteinc.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9E1F5441DD	for <jm@localhost>; Tue,  6 Aug 2002 06:48:09 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for jm@localhost (single-drop); Tue, 06 Aug 2002 11:48:09 +0100 (IST)
Received : from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72LqWv13294 for    <jm-ilug@jmason.org>; Fri, 2 Aug 2002 22:52:32 +0100
Received : from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id WAA31224; Fri, 2 Aug 2002 22:50:17 +0100
Received : from bettyjagessar.com (w142.z064000057.nyc-ny.dsl.cnc.net    [64.0.57.142]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id WAA31201 for    <ilug@linux.ie>; Fri, 2 Aug 2002 22:50:11 +0100
Received : from 64.0.57.142 [202.63.165.34] by bettyjagessa

In [41]:
# Many of the headers look suspicious,
# but we can limit ourselves to the subject.

spam_emails[0]["Subject"]

'[ILUG] STOP THE MLM INSANITY'

### Splitting into train and test sets

In [43]:
import numpy as np
from sklearn.model_selection import train_test_split

# Fill an object array element by element. Passing the list straight to
# np.array lets NumPy try to treat each message as a nested sequence (message
# objects support len() and iteration over their headers), which may raise a
# ragged-array error or produce a wrongly shaped array instead of a 1-D array
# of messages.
all_emails = ham_emails + spam_emails
X = np.empty(len(all_emails), dtype=object)
for i, message in enumerate(all_emails):
    X[i] = message

# Labels in the same order as X: 0 = ham, 1 = spam
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

# Hold out 20% of the messages for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

### Preprocessing

In [69]:
# Using regular expressions to strip HTML.
# BeautifulSoup would be the more robust choice, but we deliberately stick with regex here.

import re
from html import unescape

def html_to_plain_text(html):
    """Convert an HTML document to rough plain text using regular expressions.

    Steps, in order: drop the ``<head>`` section, drop ``<style>`` blocks,
    replace opening ``<a ...>`` tags with the marker `` HYPERLINK ``, strip
    every remaining tag, collapse newline and tab runs, and finally decode
    HTML entities.

    Parameters
    ----------
    html : str
        The HTML source.

    Returns
    -------
    str
        The text with markup removed and entities unescaped.
    """
    # remove the <head> section
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.I | re.S)

    # remove <style> blocks; this must operate on `text`, not `html`,
    # otherwise the <head> removal above is thrown away
    text = re.sub(r'<style.*?>.*?</style>', '', text, flags=re.M | re.I | re.S)

    # replace opening anchor tags with a HYPERLINK marker
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)

    # remove all remaining html tags
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)

    # replace runs of newlines (and the whitespace before them) with one newline
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)

    # replace runs of tabs (and the whitespace before them) with a single space
    text = re.sub(r'(\s*\t)+', ' ', text, flags=re.M | re.S)

    # decode entities last so a decoded '<' or '>' is not mistaken for a tag
    return unescape(text)

In [61]:
# Boolean-mask indexing (`X_train[y_train==1]`) selects the spam messages of
# the training set; the comprehension's `if` then keeps only those whose
# structure is exactly "text/html".

html_spam_emails = [
    message
    for message in X_train[y_train==1]
    if get_email_structure(message) == "text/html"
]


In [62]:
# Pick one HTML spam message and preview the first 1000 characters of its source
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<!-- saved from url=(0022)http://internet.e-mail -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html>
<head>
	<title></title>
</head>
<style type="text/css">
A.yellow:link {text-decoration: none; color: #FFF200}
A.yellow:visited {text-decoration: none; color: #FFF200}
A.yellow:active {text-decoration: none; color: #EFEFC8}
A.yellow:hover {text-decoration: none; color: #FFFFFF}
</style>
<body topmargin="0" bgcolor="#FFFFFF">
<div align="center"><br>
  <table width="650" border="1" cellspacing="0" cellpadding="2" bordercolor="#000000">
    <tr>
      <td>
        <table width="650" border="0" cellspacing="0" cellpadding="4" align="center">
          <tr> 
            <td bgcolor="003366"><font size="2" color="#FFFFFF" face="Arial, Helvetica, sans-serif">. 
              U N I V E R S I T Y . D I P L O M A S .</font></td>
          </tr>
          <tr> 
            <td bgcolor="000000" height="40" valign="middle"> 
              <div align="center"><font face="Verda

In [70]:
# The same message after html_to_plain_text (first 1000 characters)
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


            .
              U N I V E R S I T Y . D I P L O M A S .
              Do
                you want for a prosperous future, increased money earning power,
                and the respect of all?
                We
                  can assist with Diplomas from prestigious non-accredited universities
                  based on your present knowledge and life experience.
                No required tests, classes, books, or
                interviews.
              Bachelors,
                masters, MBA, and doctorate (PhD) diplomas available in the field
                of your choice - that's right, you can become a Doctor, Lawyer or Accountant and receive
                all the benefits and admiration that comes with it!
              No
                one is turned down!
              Confidentiality
                assured - Change your Life Today!
                  Either  HYPERLINK Click Here HYPERLINK  or
                  you can call us 24 hours a day, 7 days a 

In [58]:
# return an email's content as plain text,
# regardless of its content type

def email_to_text(email):
    """Return the text of `email` as plain text, or ``None`` if it has none.

    Every part of the message is visited via ``email.walk()``. The first
    ``text/plain`` part found is returned as is. If there is none, the last
    ``text/html`` part seen is converted with `html_to_plain_text`. Parts of
    any other content type are ignored.

    Parameters
    ----------
    email
        A message object providing ``walk()``; each part must provide
        ``get_content_type()``, ``get_content()`` and ``get_payload()``.

    Returns
    -------
    str or None
        The plain-text content, or ``None`` when the message has no usable
        text part (or only an empty HTML part).
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Best effort: if the content cannot be extracted (presumably a
            # decoding/charset problem), fall back to the raw payload.
            # Only Exception is caught so Ctrl-C and interpreter exit still
            # propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        # Remember the HTML but keep looking for a plain-text part
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [59]:
# First 100 characters of the plain-text version of the sample HTML spam
print(email_to_text(sample_html_spam)[:100], "...")


            .
              U N I V E R S I T Y . D I P L O M A S .
              Do
               ...
