In [1]:
# Importando bibliotecas

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import urlextract

# Extraction

In [2]:
# Baixando dados



In [3]:
# Descomprimindo



In [4]:
# Build the alphabetically sorted lists of ham and spam file names

ham_dir = '../raw_data/easy_ham/'
spam_dir = '../raw_data/spam/'

# sorted() already returns a new list, so no explicit append loop is needed
ham_email_names = sorted(os.listdir(ham_dir))
spam_email_names = sorted(os.listdir(spam_dir))

In [5]:
# Number of ham and spam files found, displayed as (ham, spam)

tuple(len(names) for names in (ham_email_names, spam_email_names))

(2500, 500)

In [6]:
# Load the ham and spam files as parsed email messages

import email
import email.parser  # explicit: `import email` alone does not load this submodule
import email.policy

email_policy = email.policy.default
parser = email.parser.BytesParser(policy=email_policy)


def _load_messages(directory, file_names):
    """Parse every file in `file_names` (relative to `directory`) into a message.

    Returns a list of parsed messages in the same order as `file_names`.
    """
    messages = []
    for file_name in file_names:
        # 'rb' hands raw bytes to BytesParser instead of decoding the file as text
        with open(os.path.join(directory, file_name), 'rb') as f:
            messages.append(parser.parse(f))
    return messages


ham_emails_messages = _load_messages(ham_dir, ham_email_names)
spam_emails_messages = _load_messages(spam_dir, spam_email_names)

# Transform

### Explorando partes dos emails

In [7]:
# Como algumas mensagens são multipart, temos que fazer uma função para parsear

def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain string is returned unchanged. A message whose payload is a list
    (i.e. multipart) is rendered as ``multipart(<part>, <part>, ...)`` by
    recursing into every sub-part; any other message yields its content type.
    """
    # Already a textual description: nothing to inspect
    if isinstance(email, str):
        return email

    parts = email.get_payload()

    # Single-part message: its content type is the whole structure
    if not isinstance(parts, list):
        return email.get_content_type()

    # Multipart message: describe each sub-part recursively
    described_parts = ", ".join(get_email_structure(part) for part in parts)
    return 'multipart(' + described_parts + ')'
    
    
# Try the helper on a multipart message (index 13) and then on a single-part one (index 0)
for sample_index in (13, 0):
    print(get_email_structure(ham_emails_messages[sample_index]))

multipart(text/plain, application/pgp-signature)
text/plain


## Explorando headers

In [9]:
# Print every header of the second spam message as "<name> <value>"
for header_pair in spam_emails_messages[1].items():
    print(*header_pair)

Return-Path <ilug-admin@linux.ie>
Delivered-To zzzz@localhost.spamassassin.taint.org
Received from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id A7FD7454F6	for <zzzz@localhost>; Thu, 22 Aug 2002 08:27:38 -0400 (EDT)
Received from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:27:38 +0100 (IST)
Received from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MCJiZ06043 for    <zzzz-ilug@jmason.org>; Thu, 22 Aug 2002 13:19:44 +0100
Received from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id NAA29323; Thu, 22 Aug 2002 13:18:52 +0100
Received from email.qves.com ([67.104.83.251]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id NAA29282 for <ilug@linux.ie>; Thu,    22 Aug 2002 13:18:37 +0100
Received from qvp0091 ([169.254.6.22]) by email.qves.com with Microsoft    SMTPSVC(5

In [10]:
# Subject header of the second spam message
spam_emails_messages[1]['Subject']

'[ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206'

### Beautiful Soup

In [11]:
def email_to_text(email):
    """Return the textual content of an email message.

    Every part yielded by ``email.walk()`` is inspected; parts that are
    neither ``text/plain`` nor ``text/html`` are ignored.

    Returns:
        The content of the first ``text/plain`` part if there is one.
        Otherwise the text extracted by BeautifulSoup from the last
        ``text/html`` part seen. ``None`` when the message has no usable
        text or HTML content.
    """
    html = None
    for part in email.walk():
        content_type = part.get_content_type()
        if content_type not in ("text/plain", "text/html"):
            continue

        # Best effort: if the content cannot be decoded, fall back to the raw
        # payload. Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit are not silently swallowed.
        try:
            content = part.get_content()
        except Exception:
            content = str(part.get_payload())

        if content_type == 'text/plain':
            return content
        html = content

    if html:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        return BeautifulSoup(html, "html.parser").get_text()
    return None

In [12]:
# Text returned by email_to_text for the second spam message, with surrounding whitespace stripped
email_to_text(spam_emails_messages[1]).strip()

"1) Fight The Risk of Cancer!\nhttp://www.adclick.ws/p.cfm?o=315&s=pk007\n\n2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days\nhttp://www.adclick.ws/p.cfm?o=249&s=pk007\n\n3) Get the Child Support You Deserve - Free Legal Advice\nhttp://www.adclick.ws/p.cfm?o=245&s=pk002\n\n4) Join the Web's Fastest Growing Singles Community\nhttp://www.adclick.ws/p.cfm?o=259&s=pk007\n\n5) Start Your Private Photo Album Online!\nhttp://www.adclick.ws/p.cfm?o=283&s=pk007\n\nHave a Wonderful Day,\nOffer Manager\nPrizeMama\n\n\n\n\n\n\n\n\n\n\n\n\n\nIf you wish to leave this list please use the link below.\nhttp://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258\n\n\n-- \nIrish Linux Users' Group: ilug@linux.ie\nhttp://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.\nList maintainer: listmaster@linux.ie"

## Removendo URLs

In [13]:
_URL_EXTRACTOR = None  # shared URLExtract instance, created on first use


def remove_url(email, url_extractor=None):
    """Replace every URL found in `email` with the literal token ``URL``.

    Args:
        email: Text to clean. Non-string input (for example ``None``) is
            returned unchanged.
        url_extractor: Optional object exposing ``find_urls(text)``. When
            omitted, a single ``urlextract.URLExtract`` instance is created
            on first use and reused by later calls.

    Returns:
        The text with each detected URL replaced by ``URL``.
    """
    global _URL_EXTRACTOR

    if not isinstance(email, str):
        return email

    if url_extractor is None:
        # Build the extractor once instead of on every call
        if _URL_EXTRACTOR is None:
            _URL_EXTRACTOR = urlextract.URLExtract()
        url_extractor = _URL_EXTRACTOR

    # Replace longer URLs first: otherwise a URL that is a prefix of another
    # one would be substituted inside it and leave fragments like "URL/page".
    unique_links = set(url_extractor.find_urls(email))
    for link in sorted(unique_links, key=len, reverse=True):
        email = email.replace(link, 'URL')

    return email

# Load

In [21]:
# One row per email: [file name, URL-free text, MIME structure, label]
col_names = ['id', 'email_content', 'email_structure', 'email_type']

spam_list = [
    [
        file_name,
        remove_url(email_to_text(spam_emails_messages[position])),
        get_email_structure(spam_emails_messages[position]),
        'spam',
    ]
    for position, file_name in enumerate(spam_email_names)
]

ham_list = [
    [
        file_name,
        remove_url(email_to_text(ham_emails_messages[position])),
        get_email_structure(ham_emails_messages[position]),
        'ham',
    ]
    for position, file_name in enumerate(ham_email_names)
]

df_spam = pd.DataFrame(spam_list, columns=col_names)
df_ham = pd.DataFrame(ham_list, columns=col_names)

In [22]:
# Stack spam and ham rows into one frame. ignore_index=True renumbers the rows
# 0..n-1; without it both halves keep their own 0-based labels and the result
# has duplicate index values.
df = pd.concat([df_spam, df_ham], ignore_index=True)

In [23]:
# Write the combined dataset to CSV, creating the target folder first so the
# cell also works on a fresh checkout where it does not exist yet.
output_path = '../data/treated_data/data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)