In [1]:
import ast
import os
import shutil
import tarfile
from urllib.parse import urljoin

from utils.pre_processing import email_text, text_tokenizer, get_text, email, parse_html

import requests 
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# Download

In [2]:
# Index pages that list the corpus archives: the main listing and its
# "obsolete/" sub-directory (resolved relative to the main listing).
url = 'https://spamassassin.apache.org/old/publiccorpus/'
url_obsolete = urljoin(url, 'obsolete/')

In [3]:
# Working directory that receives the downloaded archives and, later, their extracted contents.
root_dir = 'arquivos'

In [4]:
# Create the working directory. exist_ok=True keeps this cell re-runnable:
# a plain os.mkdir raises FileExistsError when the directory is already there.
os.makedirs(root_dir, exist_ok=True)

In [5]:
def download_arquivos(url, destino=None, timeout=60):
    """Download every ``.tar.bz2`` file linked from the index page at ``url``.

    Parameters
    ----------
    url : str
        Address of an HTML index page; links found in it are resolved
        relative to this address.
    destino : str, optional
        Directory that receives the downloaded files. When omitted, the
        notebook-level ``root_dir`` is used.
    timeout : float, optional
        Timeout, in seconds, handed to each ``requests.get`` call.

    Raises
    ------
    requests.HTTPError
        If the index page or one of the archives answers with an error status.
    """
    if destino is None:
        destino = root_dir

    resposta = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the index.
    resposta.raise_for_status()

    # Naming the parser avoids bs4's "guessed parser" warning and keeps the
    # result independent of which optional parsers are installed.
    soup = BeautifulSoup(resposta.text, 'html.parser')

    # The filter is also called with None for <a> tags that have no href,
    # so guard before calling str.endswith.
    links = soup.find_all(
        'a',
        attrs={'href': lambda href: bool(href) and href.endswith('tar.bz2')},
    )

    for tag in links:
        href = tag['href']
        url_arquivo = urljoin(url, href)
        # Keep only the final path component so an href that carries
        # directories cannot point outside the destination folder.
        nome_arquivo = os.path.basename(href)

        resposta_arquivo = requests.get(url_arquivo, timeout=timeout)
        # Without this check an HTTP error body would be saved as a .tar.bz2.
        resposta_arquivo.raise_for_status()

        with open(os.path.join(destino, nome_arquivo), 'wb') as file:
            file.write(resposta_arquivo.content)

In [6]:
# Fetch the obsolete listing first and the main listing second; a file name
# present in both ends up with the content that was downloaded last.
for indice in (url_obsolete, url):
    download_arquivos(indice)

# Dataset

In [7]:
# Unpack each downloaded archive into a folder named after it, then delete the archive.
for arquivo in tqdm(os.listdir(root_dir)):
    # Only archives are unpacked; anything else (for example folders left by
    # a previous run of this cell) is skipped instead of crashing tarfile.open.
    if not arquivo.endswith('.tar.bz2'):
        continue

    path_tar = os.path.join(root_dir, arquivo)
    pasta = arquivo.removesuffix('.tar.bz2')
    destino = os.path.join(root_dir, pasta)

    # The context manager closes the archive even when extraction raises.
    with tarfile.open(path_tar) as tar_file:
        if hasattr(tarfile, 'data_filter'):
            # The archives come from the network: use the 'data' extraction
            # filter where this Python version supports it, so members cannot
            # be written outside the destination folder.
            tar_file.extractall(path=destino, filter='data')
        else:
            tar_file.extractall(path=destino)

    # Remove the archive only after it was extracted successfully.
    os.remove(path_tar)

  0%|          | 0/18 [00:00<?, ?it/s]

In [8]:
# Flatten each extracted folder: move the files of every sub-folder one level
# up and remove the emptied sub-folder.
for pasta in tqdm(os.listdir(root_dir)):
    dir_pasta = os.path.join(root_dir, pasta)
    # Ignore stray files in the working directory; os.listdir would raise on them.
    if not os.path.isdir(dir_pasta):
        continue

    for subpasta in os.listdir(dir_pasta):
        dir_subpasta = os.path.join(dir_pasta, subpasta)
        # Entries that are already plain files (e.g. on a second run of this
        # cell) have nothing to flatten.
        if not os.path.isdir(dir_subpasta):
            continue

        for arquivo in os.listdir(dir_subpasta):
            filepath = os.path.join(dir_subpasta, arquivo)
            shutil.move(filepath, os.path.join(dir_pasta, arquivo))

        os.rmdir(dir_subpasta)

  0%|          | 0/18 [00:00<?, ?it/s]

In [9]:
# Collect one row per file: the folder name supplies the date and category,
# the file supplies the raw bytes.
dicio = {'date': [], 'category': [], 'email': [], 'is_spam': []}
for pasta in tqdm(os.listdir(root_dir), leave=True):
    # Folder names have the form "<YYYYMMDD>_<category>".
    date, categoria = pasta.split('_', 1)
    is_spam = 1 if 'spam' in categoria else 0
    # The date is the same for every file in the folder, so parse it once
    # per folder instead of once per file.
    data_pasta = pd.to_datetime(date, format='%Y%m%d')

    for arquivo in tqdm(os.listdir(os.path.join(root_dir, pasta)), leave=False):

        # Read as bytes; no decoding is attempted here.
        with open(os.path.join(root_dir, pasta, arquivo), 'rb') as file:
            content = file.read()

        dicio['date'].append(data_pasta)
        dicio['category'].append(categoria)
        dicio['email'].append(content)
        dicio['is_spam'].append(is_spam)

# Build the frame before deleting the files, so a failure while building it
# does not leave the notebook with neither the data nor the DataFrame.
df = pd.DataFrame(dicio)
shutil.rmtree(root_dir)

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/1400 [00:00<?, ?it/s]

  0%|          | 0/1401 [00:00<?, ?it/s]

  0%|          | 0/251 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2551 [00:00<?, ?it/s]

  0%|          | 0/501 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/501 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2501 [00:00<?, ?it/s]

  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/501 [00:00<?, ?it/s]

  0%|          | 0/1400 [00:00<?, ?it/s]

  0%|          | 0/1400 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/2551 [00:00<?, ?it/s]

  0%|          | 0/1397 [00:00<?, ?it/s]

In [10]:
# Preview the first rows of the assembled dataset.
df.head()

Unnamed: 0,date,category,email,is_spam
0,2002-12-04,spam_2,b'From lvi300702@free.fr Tue Aug 6 12:50:59 ...,1
1,2002-12-04,spam_2,b'Received: from hq.pro-ns.net (localhost [127...,1
2,2002-12-04,spam_2,b'From jm@netnoteinc.com Mon Jun 24 17:52:12 ...,1
3,2002-12-04,spam_2,b'From edum@hkem.com Mon Jun 24 17:04:26 2002...,1
4,2002-12-04,spam_2,b'From blissptht65@yahoo.com Thu Jul 12 06:33...,1


In [13]:
# Persist the assembled dataset to a Feather file.
df.to_feather('spamassassin.feather')

In [14]:
# Read the saved Feather file back into a separate frame.
# NOTE(review): df2 is not referenced by any other visible cell — confirm whether this read-back is still needed.
df2 = pd.read_feather('spamassassin.feather')

In [None]:
# Store the helper's result under its own name: assigning it to `email_text`
# would rebind the imported function to its output and make this cell fail
# with "object is not callable" on a second run.
emails_text = email_text(df['email'])
df['email_parsed'] = text_tokenizer(emails_text)
# Save the frame, now including the 'email_parsed' column, to its own file.
df.to_feather('spamassassin_parsed.feather')

Parsing emails...
Getting emails' text...
Cleaning emails' text...


100%|█████████████████████████████████████| 21503/21503 [04:26<00:00, 80.58it/s]
