In [1]:
import os
import tarfile
import urllib.request

In [3]:
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("data", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()

In [4]:
fetch_spam_data()


  tar_bz2_file.extractall(path=SPAM_PATH)


Next, let's load all the emails:

In [5]:
S_PATH = "../data"
H_PATH = "../data"

HAM_DIR = os.path.join(H_PATH, "easy_ham")
SPAM_DIR = os.path.join(S_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [6]:
len(ham_filenames)

2500

In [7]:
len(spam_filenames)

500

We can use Python's `email` module to parse these emails (this handles headers, encoding, and so on):

In [8]:
import email
import email.parser
import email.policy

def load_email(is_spam, filename, spam_path=S_PATH):

    directory = "spam" if is_spam else "easy_ham"

    with open (os.path.join(S_PATH, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)
    


In [9]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [10]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [11]:
print(ham_emails[76].get_content().strip())

from http://www.arstechnica.com/

"There has mostly been talk thus far and little action, but the Department
of Justice says it may be ready to file criminal lawsuits against
individuals [1] who distribute or receive unauthorized copyrighted
material over the Internet. Deputy Assistant Attorney General John Malcolm believes
that "criminal prosecutions of copyright offenders are now necessary to
preserve the viability of America's content industries." Malcolm also
believes that people who trade copyrighted material think they are
participating in a legal activity. I certainly think people who download
copyrighted works understand that such distribution--barring provisions
such as fair use--is not authorized, and it is not surprising to see
businesses continue to look for means to discourage distribution of
copyrighted works.


"Some prosecutions that make that clear could be very helpful...I think
they would think twice if they thought there was a risk of criminal
prosecution," said [RI

In [12]:
print(spam_emails[76].get_content().strip())

OWOLABI & ASSOCIATE,
FALOMO, IKOYI.
LAGOS - NIGERIA.


YOUR KIND ATTN.,



RE: REQUEST FOR MUTUALLY BENEFITTING ENDEAVOUR.

I humbly crave your indulgence in sending you this
mail, if the contents does not meet with your personal
and business ethics, I apologize in advance.

I am Barrister Kola Owolabi( attorney at law), I
represent Alhaji Ishmaila Ibrahim Gwarzo’s estates. Alhaji Gwarzo was the chief
security advicer of the then military leader of this country(Nigeria) in the
person of Late General Sani Abacha who died on the 8th of June 1998. With the
advent of a new democratic
dispensation in the country under the leadership of
Gen. Olusegun Obasanjo (Rtd), my client has come under
severe persecution due to the sensitive position he
held in the last military regime, presently he is
under house arrest restricted only to the confines of
his village.

The main purpose of this mail is to intimate you of a
business proposal that might be of interest to you. My
client has informed me of t

Some emails are actually multipart, with images and attachments (which can have their own attachments). Let's look at the various types of structure we have:

In [13]:
def get_email_structure(email):
    if isinstance(email,str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([get_email_structure(sub_email) for sub_email in payload]))
    else:
        return email.get_content_type()

In [14]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures



In [15]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]