# SPAM Classifier


In [3]:
import tarfile
from pathlib import Path
import urllib


def fetch_spam_data():
    """Download and extract the SpamAssassin ham/spam corpora if missing.

    Archives are saved and unpacked under ``datasets/spam`` relative to the
    current working directory. An archive is only downloaded when its
    extracted directory (``easy_ham`` or ``spam``) does not exist yet, so
    calling this function again is cheap and needs no network access.

    Returns:
        list[pathlib.Path]: ``[datasets/spam/easy_ham, datasets/spam/spam]``.

    Raises:
        urllib.error.URLError: if a download fails.
        tarfile.TarError: if a downloaded archive cannot be read.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly so the function also works on a fresh kernel.
    import urllib.request

    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / "datasets" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)

    # Where the interpreter provides extraction filters, use the "data" filter
    # so archive members cannot be written outside of `spam_path`.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("spam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(path) as tar_bz2_file:
                tar_bz2_file.extractall(path=spam_path, **extract_kwargs)
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [4]:
ham_dir, spam_dir = fetch_spam_data()

# List the files of each directory in sorted order. Entries whose name is
# 20 characters or shorter are skipped (presumably non-email helper files
# shipped in the archives -- verify against the extracted directories).
ham_filenames = sorted(path for path in ham_dir.iterdir() if len(path.name) > 20)
spam_filenames = sorted(path for path in spam_dir.iterdir() if len(path.name) > 20)

# Report how many files were kept for each class.
print(f"There are {len(ham_filenames)} regulars emails and {len(spam_filenames)} spams")

Downloading datasets/spam/ham.tar.bz2
Downloading datasets/spam/spam.tar.bz2
There are 2500 regulars emails and 500 spams


In [5]:
# Parse the emails
import email
import email.policy


def load_email(filepath):
    """Parse the file at ``filepath`` into an email message object.

    The file is read in binary mode and parsed with ``email.policy.default``.

    Args:
        filepath: path (``str`` or path-like) of the raw email file.

    Returns:
        The message object produced by ``BytesParser.parse``.

    Raises:
        OSError: if the file cannot be opened.
    """
    # `import email` / `import email.policy` do not load `email.parser`;
    # import it explicitly instead of relying on another module having done so.
    import email.parser
    import email.policy

    with open(filepath, "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)


# Parse every listed file into a message object, preserving filename order.
ham_emails = list(map(load_email, ham_filenames))
spam_emails = list(map(load_email, spam_filenames))

In [6]:
# Print the stripped body of one regular (ham) message to get a feel for
# what the emails look like.
ham_sample_body = ham_emails[1].get_content()
print(ham_sample_body.strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [7]:
# Print the stripped body of one spam message for inspection.
spam_sample_body = spam_emails[25].get_content()
print(spam_sample_body.strip())

DEAR FRIEND,I AM MRS.  SESE-SEKO WIDOW OF LATE PRESIDENT MOBUTU
SESE-SEKO OF ZAIRE? NOW KNOWN AS DEMOCRATIC REPUBLIC
OF CONGO (DRC).  I AM MOVED TO WRITE YOU THIS LETTER,
THIS WAS IN CONFIDENCE  CONSIDERING MY PRESENTCIRCUMSTANCE AND SITUATION.
I ESCAPED ALONG WITH MY HUSBAND AND TWO OF OUR SONS
GEORGE  KONGOLO  AND BASHER  OUT OF DEMOCRATIC REPUBLIC OF
CONGO (DRC) TO ABIDJAN, COTE D'IVOIRE WHERE MY FAMILY
AND I SETTLED, WHILE WE LATER MOVED  TO SETTLED IN
MORROCO WHERE MY HUSBAND LATER DIED OF CANCER
DISEASE. HOWEVER DUE TO THIS SITUATION WE DECIDED TO
CHANGED  MOST OF MY HUSBAND'S BILLIONS OF DOLLARS
DEPOSITED IN SWISS BANK AND OTHER COUNTRIES INTO OTHER
FORMS OF MONEY CODED FOR  SAFE PURPOSE BECAUSE THE NEW
HEAD OF STATE OF (DR) MR LAURENT  KABILA HAS MADE
ARRANGEMENT WITH THE SWISS GOVERNMENT AND OTHER
EUROPEAN COUNTRIES TO FREEZE ALL MY LATE HUSBAND'S
TREASURES  DEPOSITED IN SOME EUROPEAN COUNTRIES. HENCE
MY CHILDREN AND I DECIDED LAYING LOW IN AFRICA TO
STUDY THE SITUATION TILL  

In [11]:
# check the different types of email structures

def get_email_structure(email):
    """Return a string describing the nesting of content types in ``email``.

    A string argument is returned unchanged. A message whose payload is a
    list is rendered as ``multipart(<part>, <part>, ...)``, with each part
    described recursively; any other message yields its content type.
    """
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    if not isinstance(parts, list):
        # Payload is not a list of sub-messages: report the content type.
        return email.get_content_type()

    described_parts = ", ".join(get_email_structure(part) for part in parts)
    return f"multipart({described_parts})"


from collections import Counter

def structures_counter(emails):
    """Count how many of ``emails`` share each structure string.

    Returns a ``Counter`` keyed by the value of ``get_email_structure`` for
    each email.
    """
    return Counter(get_email_structure(message) for message in emails)

# Tally the structures of each class first, then print both tallies together.
ham_structures = structures_counter(ham_emails)
spam_structures = structures_counter(spam_emails)
print(f"Structure - REGULAR EMAILS: \n{ham_structures}\n\nSPAM EMAILS:\n {spam_structures}")


Structure - REGULAR EMAILS: 
Counter({'text/plain': 2408, 'multipart(text/plain, application/pgp-signature)': 66, 'multipart(text/plain, text/html)': 8, 'multipart(text/plain, text/plain)': 4, 'multipart(text/plain)': 3, 'multipart(text/plain, application/octet-stream)': 2, 'multipart(text/plain, text/enriched)': 1, 'multipart(text/plain, application/ms-tnef, text/plain)': 1, 'multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)': 1, 'multipart(text/plain, video/mng)': 1, 'multipart(text/plain, multipart(text/plain))': 1, 'multipart(text/plain, application/x-pkcs7-signature)': 1, 'multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)': 1, 'multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))': 1, 'multipart(text/plain, application/x-java-applet)': 1})

SPAM EMAILS:
 Counter({'text/plain': 218, 'text/html': 183, 'multipart(text/plain, text/html)': 45, 'multipart(te