In [35]:
# Apache Spam Classifier

# Get the Data

In [6]:
import requests
import os
from tqdm import tqdm

# Base URL of the SpamAssassin public corpus and the names of the two
# archives (ham and spam) that are appended to it to form download URLs.
url = 'https://spamassassin.apache.org/old/publiccorpus/'
files = ['20021010_easy_ham.tar.bz2','20021010_spam.tar.bz2']
    

    
def create_directory(cwd: str, path_extension: str):
    """Create the directory ``cwd + path_extension`` unless the path already exists.

    Args:
        cwd: Leading part of the path (the callers pass the working directory).
        path_extension: Trailing part, appended verbatim to ``cwd``; no path
            separator is inserted between the two pieces.

    Missing intermediate directories are created too. Nothing happens when
    the combined path already exists, whether it is a directory or a file.
    """
    target_dir = cwd + path_extension
    if os.path.exists(target_dir):
        return
    os.makedirs(target_dir)


        
def dl_from_URL(urls: list, target_path: str, timeout: float = 60):
    """Download every URL in ``urls`` into the directory ``target_path``.

    Each file is saved under the last ``/``-separated component of its URL.

    Args:
        urls: Iterable of fully qualified download URLs.
        target_path: Existing directory that receives the files.
        timeout: Seconds to wait for the server to connect/respond before
            giving up on a request.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    print(f"Start: {len(urls)} files to download")

    # Iterating the list directly (instead of enumerate) lets tqdm see the
    # total and render a real progress bar.
    for url in tqdm(urls):
        filename = url.split('/')[-1]
        destination_path = os.path.join(target_path, filename)
        response = requests.get(url, timeout=timeout)
        # Fail loudly rather than saving an HTML error page as an archive.
        response.raise_for_status()
        with open(destination_path, 'wb') as output_file:
            output_file.write(response.content)
        




In [7]:
# Working directory of the kernel; used as the prefix for the data folder.
cwd = os.getcwd()
# Despite the leading slash this is a suffix: it is string-concatenated onto
# cwd, so the archives end up in <cwd>/data/source/.
source_path = '/data/source/'
DATA_PATH = source_path 
# One full download URL per archive name.
urls = [url + file for file in files]



# Make sure the target folder exists, then download the archives into it.
create_directory(cwd, source_path)
dl_from_URL(urls, cwd+DATA_PATH)

Start: 2 files to download


2it [00:03,  1.90s/it]


In [8]:
import tarfile

def decompress_data(source_path: str, decmprsd_path: str):
    """Extract every ``.bz2`` tar archive in ``source_path`` into ``decmprsd_path``.

    Args:
        source_path: Directory that is scanned (non-recursively) for archives.
        decmprsd_path: Directory the archive contents are extracted into.

    Entries whose name does not end in ``.bz2`` are ignored.

    Raises:
        tarfile.ReadError: If a ``.bz2`` file is not a readable bzip2 tar archive.
    """
    for filename in os.listdir(source_path):
        if not filename.endswith(".bz2"):
            continue
        archive_path = os.path.join(source_path, filename)
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(archive_path, mode='r:bz2') as tar:
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive. If the minimum Python version allows it, pass an
            # extraction filter (e.g. filter='data') for downloaded archives.
            tar.extractall(decmprsd_path)


In [9]:
# Like the other path constants this is a suffix appended to cwd, so the
# archives are unpacked into <cwd>/data/decompressed/.
decmprsd_path = '/data/decompressed/'
create_directory(cwd, decmprsd_path)
decompress_data(cwd+DATA_PATH, cwd+decmprsd_path)

# Prepare the Data

In [10]:
import random
import glob
import shutil
import re




def split_train_test_by_id(spam_path: str, ham_path: str, seed: int, training_path: str, test_path: str ):
    """Copy spam and ham files into fresh training and test folders (80/20 by id).

    For each class, 20% of the integers in ``range(number_of_files)`` are drawn
    with ``random.sample`` after seeding with ``seed``. A file whose numeric
    name prefix (the text before the first ``.``) is one of the drawn ids is
    copied to ``test_path``; every other file goes to ``training_path``. Copies
    get ``.spam`` or ``.ham`` appended to their name.

    Args:
        spam_path: Directory holding the spam messages.
        ham_path: Directory holding the ham messages.
        seed: Seed for the ``random`` module, making the split reproducible.
        training_path: Output directory for training files; emptied first.
        test_path: Output directory for test files; emptied first.

    Raises:
        ValueError: If a file name does not start with an integer id.
    """
    random.seed(seed)
    num_spam = len(os.listdir(spam_path))
    num_ham = len(os.listdir(ham_path))
    # NOTE(review): ids are drawn from 0..n-1, which assumes the numeric file
    # prefixes cover that range -- confirm against the corpus naming.
    # Sets give O(1) membership tests in the copy loops.
    spam_test_ids = set(random.sample(range(num_spam), k=int(num_spam*.2)))
    ham_test_ids = set(random.sample(range(num_ham), k=int(num_ham*.2)))

    # Start from empty output folders; a missing folder is not an error.
    for path in (test_path, training_path):
        if os.path.isdir(path):
            shutil.rmtree(path)
        os.makedirs(path, exist_ok=True)

    _copy_labelled_files(spam_path, 'spam', spam_test_ids, training_path, test_path)
    _copy_labelled_files(ham_path, 'ham', ham_test_ids, training_path, test_path)


def _copy_labelled_files(source_dir, label, test_ids, training_path, test_path):
    """Copy each file in ``source_dir`` to the test or training folder, appending ``.<label>``."""
    for filename in os.listdir(source_dir):
        file_id = int(filename.split('.')[0])
        destination_dir = test_path if file_id in test_ids else training_path
        # shutil.copy runs synchronously and raises on failure, unlike the
        # fire-and-forget shell `cp` it replaces.
        shutil.copy(
            os.path.join(source_dir, filename),
            os.path.join(destination_dir, f"{filename}.{label}"),
        )










In [11]:
# Class folders inside the decompressed directory (presumably the top-level
# folder names stored in the two downloaded archives -- verify after extraction).
spam_path = cwd+decmprsd_path + 'spam/'
ham_path = cwd+decmprsd_path + 'easy_ham/'



# Output folders for the split; both are suffixes appended to cwd.
training_path = '/data/training/source/'
test_path = '/data/test/source/' 
create_directory(cwd, training_path)
create_directory(cwd, test_path)


# 50 is the random seed that makes the train/test split reproducible.
split_train_test_by_id(spam_path, ham_path, 50, cwd+training_path, cwd+test_path)

In [60]:
import nltk
import pandas as pd
# Download the NLTK 'punkt' and 'stopwords' packages; NLTK reports them as
# already up-to-date when they are present locally.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize


def process_text(text):
    """Convert raw text into a list of lowercase, stop-word-free, stemmed tokens.

    Args:
        text: The raw document as a string.

    Returns:
        A list of Porter-stemmed tokens in document order, with English stop
        words removed.
    """
    # Lowercase, then replace every character that is not a letter or digit
    # with a space so only alphanumeric tokens remain.
    text = re.sub(r"[^a-zA-Z0-9]", ' ', text.lower()) #alphanumeric lowercase
    tokens = word_tokenize(text)
    # Load the stop-word list and build the stemmer once per call instead of
    # once per token; a set also makes each membership test O(1).
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens if word not in stop_words]

def create_vocab(directory):
    """Tokenise every file in ``directory`` and collect the overall vocabulary.

    The label of a document is the text after the last ``.`` in its file name.

    Args:
        directory: Folder containing the labelled message files.

    Returns:
        A tuple ``(texts, vocab)``: ``texts`` is a list of
        ``(label, tokens)`` pairs, one per readable file, and ``vocab`` is the
        sorted list of distinct tokens across all of them.

    Files that cannot be opened or decoded are skipped and counted; a one-line
    summary is printed if any were skipped. Errors raised while processing
    the text itself are no longer swallowed.
    """
    texts = []
    vocab = set()  # set membership keeps vocabulary building linear
    skipped = 0

    for file in os.listdir(directory):
        target = file.split('.')[-1]
        try:
            # The context manager closes the handle even when decoding fails.
            with open(os.path.join(directory, file)) as handle:
                raw_text = handle.read()
        except (OSError, UnicodeDecodeError):
            skipped += 1
            continue

        prcsd_text = process_text(raw_text)
        texts.append((target, prcsd_text))
        vocab.update(prcsd_text)

    if skipped:
        print(f"create_vocab: skipped {skipped} unreadable file(s) in {directory}")

    return texts, sorted(vocab)
    


[nltk_data] Downloading package punkt to /Users/daltonsi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daltonsi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [61]:
# Tokenise the training files: texts holds (label, tokens) pairs and vocab
# the sorted list of distinct tokens.
texts, vocab = create_vocab(cwd+training_path)

In [91]:


def calculate_BOW(vocab,doc):
    """Count how often each vocabulary word occurs in ``doc`` (bag of words).

    Args:
        vocab: Iterable of words; each becomes a key with an initial count of 0.
        doc: Iterable of tokens making up one document.

    Returns:
        A dict mapping word to occurrence count, with the vocabulary words
        first in ``vocab`` order.

    NOTE(review): tokens that are not in ``vocab`` are added as extra keys
    rather than dropped -- confirm this is intended for unseen documents.
    """
    tf_diz = dict.fromkeys(vocab, 0)
    # One pass with a running counter instead of doc.count() per token.
    for word in doc:
        tf_diz[word] = tf_diz.get(word, 0) + 1
    return tf_diz

def create_training_data(texts, vocab, max_docs=10):
    """Build the label vector and bag-of-words feature matrix for ``texts``.

    Args:
        texts: Sequence of ``(label, tokens)`` pairs.
        vocab: Vocabulary words that become the feature columns.
        max_docs: Number of leading documents to use. Defaults to 10, the
            limit that was previously hard-coded; pass ``None`` to use every
            document.

    Returns:
        A tuple ``(y, X)``: ``y`` is the ``target_isSpam`` column (1 when the
        label is ``'spam'``, otherwise 0) and ``X`` is the DataFrame of word
        counts without that column.
    """
    selected = texts if max_docs is None else texts[:max_docs]

    bows = []
    for text in selected:
        target, doc = text[0], text[1]
        bow = calculate_BOW(vocab, doc)
        bow['target_isSpam'] = 1 if target == 'spam' else 0
        bows.append(bow)

    # Build the frame once from the list of row dicts.
    df = pd.DataFrame(bows)

    return df.loc[:,'target_isSpam'], df.drop('target_isSpam', axis=1)

 
        




In [92]:
# y is the spam/ham label column, X the bag-of-words count matrix
# (10 rows in the recorded output below).
y, X = create_training_data(texts,vocab)


(10,) (10, 56219)
