In [35]:
# Apache Spam Classifier

# Get the Data

In [6]:
import requests
import os
from tqdm import tqdm

# Base address of the public corpus; archive names are appended to it verbatim.
url = 'https://spamassassin.apache.org/old/publiccorpus/'
# Archives to fetch: one ham ("easy_ham") and one spam collection.
files = [
    '20021010_easy_ham.tar.bz2',
    '20021010_spam.tar.bz2',
]
    

    
def create_directory(cwd: str, path_extension: str):
    """Make sure the directory ``cwd + path_extension`` exists.

    The two arguments are joined by plain string concatenation, so
    ``path_extension`` has to bring its own leading separator.
    Missing parent directories are created as well. Nothing happens
    when the path already exists.

    Args:
        cwd: Prefix of the path (typically the working directory).
        path_extension: Suffix appended to ``cwd``.
    """
    target = f"{cwd}{path_extension}"
    if os.path.exists(target):
        return
    os.makedirs(target)


        
def dl_from_URL(urls: list, target_path: str, timeout: float = 60):
    """Download every URL in ``urls`` and store it under ``target_path``.

    The local file name is the last ``/``-separated segment of the URL. It is
    appended to ``target_path`` by plain string concatenation, so
    ``target_path`` must end with a path separator.

    Args:
        urls: URLs to download.
        target_path: Existing directory (with trailing separator) to write to.
        timeout: Seconds to wait for the server before giving up on a request.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    print(f"Start: {len(urls)} files to download")

    # Iterating the list directly lets tqdm see its length and show real progress.
    for url in tqdm(urls):
        filename = url.split('/')[-1]
        destination_path = target_path + filename
        r = requests.get(url, timeout=timeout)
        # Fail loudly instead of saving an error page under an archive name.
        r.raise_for_status()
        with open(destination_path, 'wb') as output_file:
            output_file.write(r.content)
        




In [7]:
# All data folders are addressed as <working directory> + <fragment>.
cwd = os.getcwd()
# Fragments are glued on with plain string concatenation, so each one carries
# both its leading and its trailing slash.
source_path = '/data/source/'
DATA_PATH = source_path 
# One full download URL per archive: base URL followed by the archive name.
urls = [url + file for file in files]



# Create the download folder first; dl_from_URL writes straight into it.
create_directory(cwd, source_path)
dl_from_URL(urls, cwd+DATA_PATH)

Start: 2 files to download


2it [00:03,  1.90s/it]


In [8]:
import tarfile

def decompress_data(source_path: str, decmprsd_path: str):
    """Extract every ``.bz2`` tar archive in ``source_path`` into ``decmprsd_path``.

    Entries whose name does not end in ``.bz2`` are ignored. Archive paths are
    built as ``source_path + filename``, so ``source_path`` must end with a
    path separator.

    Args:
        source_path: Directory (with trailing separator) holding the archives.
        decmprsd_path: Directory the archive contents are extracted into.
    """
    for filename in os.listdir(source_path):
        if not filename.endswith(".bz2"):
            continue
        # The context manager closes the archive handle even if extraction fails.
        with tarfile.open(source_path+filename, mode='r:bz2') as tar:
            # NOTE(review): extractall trusts the member paths stored in the
            # archive; only use this on archives from a trusted source.
            tar.extractall(decmprsd_path)


In [9]:
# Folder fragment (relative to cwd) that receives the unpacked archives.
decmprsd_path = '/data/decompressed/'
create_directory(cwd, decmprsd_path)
# Unpack everything that was downloaded into the folder created above.
decompress_data(cwd+DATA_PATH, cwd+decmprsd_path)

# Prepare the Data

In [10]:
import random
import glob
import shutil
import re




def _copy_labelled_files(source_dir: str, label: str, test_ids: set, training_path: str, test_path: str):
    """Copy each file of ``source_dir`` to the test or training folder, appending ``.<label>``."""
    for filename in os.listdir(source_dir):
        # The numeric prefix before the first dot is the message id.
        _id = int(filename.split('.')[0])
        destination_dir = test_path if _id in test_ids else training_path
        # shutil.copy returns only once the copy is complete and needs no
        # shell, so file names with spaces or shell metacharacters are safe.
        shutil.copy(source_dir + filename, destination_dir + filename + '.' + label)


def split_train_test_by_id(spam_path: str, ham_path: str, seed: int, training_path: str, test_path: str ):
    """Split the spam and ham folders into a training and a test folder.

    For each class, 20% of the ids in ``range(<number of files>)`` are drawn
    at random (seeded with ``seed``). Files whose numeric file-name prefix is
    one of the drawn ids are copied to ``test_path``, all others to
    ``training_path``. Copies get ``.spam`` or ``.ham`` appended to their name.

    ``training_path`` and ``test_path`` are DELETED and recreated first, so
    anything already in them is lost. All directory arguments are used by
    string concatenation and must end with a path separator.

    NOTE(review): the drawn values are positions in ``range(count)`` but are
    compared with the numeric file-name prefix. If the prefixes are not
    exactly ``0..count-1`` the test share will differ from 20% -- verify
    against the corpus.

    Args:
        spam_path: Directory holding the spam messages.
        ham_path: Directory holding the ham messages.
        seed: Seed for ``random`` so the split is reproducible.
        training_path: Output directory for the training share.
        test_path: Output directory for the test share.

    Raises:
        ValueError: If a file name does not start with an integer prefix.
    """
    random.seed(seed)
    num_spam = len(os.listdir(spam_path))
    num_ham = len(os.listdir(ham_path))
    # Draw spam first, then ham, so a given seed keeps producing the same split.
    # Sets make the per-file membership test constant time.
    spam_test_indicies = set(random.sample(list(range(num_spam)), k=int(num_spam*.2)))
    ham_test_indicies = set(random.sample(list(range(num_ham)), k=int(num_ham*.2)))

    for path in [test_path, training_path]:
        # Start from empty folders; tolerate a folder that does not exist yet.
        if os.path.isdir(path):
            shutil.rmtree(path)
        os.makedirs(path, exist_ok=True)

    _copy_labelled_files(spam_path, 'spam', spam_test_indicies, training_path, test_path)
    _copy_labelled_files(ham_path, 'ham', ham_test_indicies, training_path, test_path)










In [11]:
# Folders with the unpacked messages; 'spam/' and 'easy_ham/' are presumably
# the top-level folders inside the two archives -- verify after extraction.
spam_path = cwd+decmprsd_path + 'spam/'
ham_path = cwd+decmprsd_path + 'easy_ham/'



# Output folders (relative to cwd) for the raw training / test messages.
training_path = '/data/training/source/'
test_path = '/data/test/source/' 
create_directory(cwd, training_path)
create_directory(cwd, test_path)


# 50 is the random seed, so re-running the cell reproduces the same split.
split_train_test_by_id(spam_path, ham_path, 50, cwd+training_path, cwd+test_path)

In [109]:
import nltk
import pandas as pd
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize


def tokenize_text(text: str, stop=True, stemmer=True):
    """Turn raw text into a list of normalised tokens.

    The text is lower-cased, every character outside ``a-z``, ``A-Z`` and
    ``0-9`` is replaced by a space, and the result is split with NLTK's
    ``word_tokenize``.

    Args:
        text: Raw text to tokenise.
        stop: If true, drop English stop words.
        stemmer: If true, reduce each token with the Porter stemmer.

    Returns:
        The tokens in their original order.
    """
    text = re.sub(r"[^a-zA-Z0-9]", ' ', text.lower()) #alphanumeric lowercase
    tokens = word_tokenize(text)
    if stop:
        # Load the stop-word list once per call (not once per token) and use a
        # set so each membership test is constant time.
        english_stopwords = set(stopwords.words("english"))
        tokens = [word for word in tokens if word not in english_stopwords]
    if stemmer:
        # One stemmer instance is enough for the whole token list.
        porter = PorterStemmer()
        tokens = [porter.stem(w) for w in tokens]
    return tokens

def create_target_tokenset_pairs(directory: str):
    """Read and tokenise every file in ``directory``.

    The label of a file is whatever follows the last dot of its name. File
    paths are built as ``directory + file``, so ``directory`` must end with a
    path separator.

    Files that cannot be opened or decoded are skipped, and the number of
    skipped files is printed once at the end. Errors raised by the tokeniser
    itself are NOT swallowed, so a broken setup surfaces immediately instead
    of silently producing an empty dataset.

    Args:
        directory: Directory (with trailing separator) holding the messages.

    Returns:
        A list of ``(label, tokens)`` tuples, one per readable file.
    """
    target_tokenset_pairs = []
    skipped = 0

    for file in os.listdir(directory):
        target = file.split('.')[-1]
        try:
            # The context manager closes the handle as soon as the text is read.
            with open(directory+file) as handle:
                raw_text = handle.read()
        except (OSError, UnicodeDecodeError):
            # Best effort: unreadable or undecodable messages are left out.
            skipped += 1
            continue
        target_tokenset_pairs.append((target, tokenize_text(raw_text)))

    if skipped:
        print(f"Skipped {skipped} unreadable file(s) in {directory}")

    return target_tokenset_pairs


def create_wordset(list_of_target_tokensets:list):
    """Return the sorted list of distinct tokens found across all token lists.

    Args:
        list_of_target_tokensets: An iterable of token collections.

    Returns:
        Every distinct token exactly once, in ascending order.
    """
    vocabulary = set()
    for tokenset in list_of_target_tokensets:
        vocabulary.update(tokenset)
    return sorted(vocabulary)
    
    


[nltk_data] Downloading package punkt to /Users/daltonsi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daltonsi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [111]:
# Tokenise every message in the raw training and test folders.
training_target_tokenset_pairs = create_target_tokenset_pairs(cwd+training_path)
test_target_tokenset_pairs = create_target_tokenset_pairs(cwd+test_path)


# The vocabulary is built from the training tokens only (index 1 of each pair).
training_wordset = create_wordset([t[1] for t in training_target_tokenset_pairs])

In [115]:


def calculate_BOW(vocab:list, tokens: list):
    """Count how often each vocabulary word occurs in ``tokens``.

    Tokens that are not part of ``vocab`` are ignored, so every result built
    from the same vocabulary has exactly the same keys in the same order.

    Args:
        vocab: The words to count; defines the keys of the result.
        tokens: The tokens of one document.

    Returns:
        A dict mapping every vocabulary word to its count (0 if absent).
    """
    tf_diz = dict.fromkeys(vocab,0)
    for token in tokens:
        # A single pass with increments replaces the repeated tokens.count().
        if token in tf_diz:
            tf_diz[token] += 1
    return tf_diz

def create_bow_df(target_token_pairs:list, vocab:list, max_docs=10):
    """Build the label vector and bag-of-words matrix for a set of documents.

    Args:
        target_token_pairs: ``(label, tokens)`` pairs; the label ``'spam'``
            maps to 1, every other label to 0.
        vocab: Vocabulary handed to ``calculate_BOW`` for every document.
        max_docs: Only the first ``max_docs`` pairs are used. The default of
            10 keeps the previous hard-coded cap. Pass ``None`` to use every
            pair; note that one dict per document is held in memory, so a
            full run with a large vocabulary can be expensive.

    Returns:
        A tuple ``(y, X)``: ``y`` is a Series named ``target_isSpam`` and
        ``X`` a DataFrame with one row per document. Both are empty when no
        documents are given.
    """
    labels = []
    bows = []
    # Slicing with None keeps the whole list.
    for pair in target_token_pairs[:max_docs]:
        target = pair[0]
        tokens = pair[1]
        bows.append(calculate_BOW(vocab, tokens))
        labels.append(1 if target == 'spam' else 0)

    # Labels are kept apart from the features, so there is no helper column to
    # drop afterwards and no KeyError on empty input.
    y = pd.Series(labels, name='target_isSpam', dtype='int64')
    X = pd.DataFrame(bows)

    return y, X

 
        




In [116]:
# `vocab` was never defined in this notebook, so the cell failed on a fresh
# kernel. Bind it to the training vocabulary so both matrices are built
# against the words seen during training.
vocab = training_wordset
y_train, X_train = create_bow_df(training_target_tokenset_pairs, vocab)
y_test, X_test = create_bow_df(test_target_tokenset_pairs, vocab)





In [125]:
# NOTE(review): this rebinds training_path / test_path, which earlier cells
# use for the raw message folders. Re-running those cells after this one will
# make them read the processed folders instead.
training_path = '/data/training/processed/'
test_path = '/data/test/processed/' 
create_directory(cwd, training_path)
create_directory(cwd, test_path)


# Persist the training labels and features.
y_train.to_csv(cwd+training_path+'y_train.csv')
X_train.to_csv(cwd+training_path+'X_train.csv')

# Persist the test features and labels.
X_test.to_csv(cwd+test_path+'X_test.csv')
y_test.to_csv(cwd+test_path+'y_test.csv')

TypeError: 'tuple' object is not callable

Unnamed: 0,0,00,000,0000,00000,000000,000000000,0000006f,00000073,00000094,...,zzzzail,zzzzason,zzzzcard,zzzzcc,zzzzn,zzzzrubin,zzzzteana,zzzzv,zzzzvf,zzzzzv
0,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,16,0,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,12,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,8,1,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,7,0,0,0
7,8,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,7,11,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,7,0,0,0


Unnamed: 0,0,00,000,0000,00000,000000,000000000,0000006f,00000073,00000094,...,zzzzail,zzzzason,zzzzcard,zzzzcc,zzzzn,zzzzrubin,zzzzteana,zzzzv,zzzzvf,zzzzzv
0,10,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,25,11,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,10,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,12,1,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,6,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,11,6,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,13,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
