In [1]:
#Apache Spam Classifier

# Get the Data

In [18]:
import requests
import os
import shutil

URL_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
HAM_FILE = '20021010_easy_ham.tar.bz2'
SPAM_FILE = '20021010_spam.tar.bz2'


MAIN_PATH = os.getcwd()
SOURCE_PATH = os.path.join(MAIN_PATH, 'data/source/')

def reset_directory(directory):
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory) 

def download_spam():
    """ Download Spam/Ham sources into local directory """
    reset_directory(SOURCE_PATH)
    
    for filename in [HAM_FILE, SPAM_FILE]:
        r = requests.get(URL_ROOT + filename)
        filepath = os.path.join(SOURCE_PATH, filename)
        with open(filepath, 'wb') as destination:
            destination.write(r.content)



In [3]:
download_spam()

In [19]:
import tarfile

EMAILS_PATH = os.path.join(SOURCE_PATH, 'emails/')

def extract_spam():
    """ Extracts spam email files from compressed directory """
    reset_directory(EMAILS_PATH)
    
    for file in os.listdir(SOURCE_PATH):
        if file.endswith(".bz2"): 
            filepath = os.path.join(SOURCE_PATH, file)
            tar = tarfile.open(filepath, mode='r:bz2')
            tar.extractall(EMAILS_PATH)
        else:
             continue


In [5]:
extract_spam()

# Prepare the Data

In [22]:
import random
import glob
import re


TRAINING_PATH = os.path.join(MAIN_PATH, 'data/training/source/')
TEST_PATH = os.path.join(MAIN_PATH, 'data/test/source/')

EMAIL_SPAM_PATH = os.path.join(EMAILS_PATH, 'spam/')
EMAIL_HAM_PATH = os.path.join(EMAILS_PATH, 'easy_ham/')

NUM_SPAM = len(os.listdir(EMAIL_SPAM_PATH))
NUM_HAM = len(os.listdir(EMAIL_HAM_PATH))



def split_spam_train_test(seed=50, percent_test=20):
    """ Splits spam email files into test/train directories """
    reset_directory(TRAINING_PATH)
    reset_directory(TEST_PATH)
    random.seed(seed)
    
    spam_test_indcs = random.sample([i for i in range(NUM_SPAM)], k=int(NUM_SPAM*percent_test/100))
    ham_test_indcs = random.sample([i for i in range(NUM_HAM)], k=int(NUM_HAM*percent_test/100))
    
    for file in os.listdir(EMAIL_SPAM_PATH):
        _id = int(file.split('.')[0])
        source_path = os.path.join(EMAIL_SPAM_PATH, file)

        if _id in spam_test_indcs:
            destination = os.path.join(TEST_PATH, file + '.spam')
            os.popen(f"cp {source_path} {destination}") 

        else:
            destination = os.path.join(TRAINING_PATH, file + '.spam')
            os.popen(f"cp {source_path} {destination}") 

        
    for file in os.listdir(EMAIL_HAM_PATH):
        _id = int(file.split('.')[0])
        source_path = os.path.join(EMAIL_HAM_PATH, file)

        if _id in ham_test_indcs:
            destination = os.path.join(TEST_PATH, file + '.ham')
            os.popen(f"cp {source_path} {destination}") 

        else:
            destination = os.path.join(TRAINING_PATH, file + '.ham')
            os.popen(f"cp {source_path} {destination}") 




In [23]:
split_spam_train_test()

In [50]:
import nltk
import pandas as pd
import email
import urlextract
from bs4 import BeautifulSoup
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter
nltk.download('punkt')
nltk.download('stopwords')




# REF: https://wkirgsn.github.io/2018/02/15/pandas-pipelines/    
class SpamTokenCounter(BaseEstimator, TransformerMixin):
    """ Transforms each email of a spam or ham directory into a Counter of counted tokens """
    def __init__(self, remove_urls=True, remove_stop=True, agg_ints=True, agg_leading_zeros=True, stemmer=True):
        self.remove_urls = remove_urls
        self.remove_stop = remove_stop
        self.agg_ints = agg_ints
        self.agg_leading_zeros = agg_leading_zeros
        self.stemmer = stemmer
        return None

    def fit(self, X, y=None):
        return self
    
    def process_content(self, mail_objc):
        part_type = mail_objc.get_content_type()
        if part_type == 'text/plain':
            return mail_objc.get_payload()
        elif part_type == 'text/html':
            html = mail_objc.get_payload()
            soup = BeautifulSoup(html, features="html.parser")
            for script in soup(["script", "style"]):
                script.extract() 
            if soup.body == None:
                return ''
            return soup.body.get_text()
        else:
            return ''
    
    def retrieve_text_content(self, x_msg):
         if x_msg.is_multipart():
            prcssd_msg = ''
            for msg in x_msg.walk():
                payload = self.process_content(msg)
                if payload:
                    prcssd_msg += payload
            
            return prcssd_msg
         else:
            return self.process_content(x_msg)       
  
    def tokenize_and_count(self, text: str):
        
        if self.remove_urls:
            url_extractor = urlextract.URLExtract()
            urls = list(set(url_extractor.find_urls(text)))
            for url in urls:
                text = text.replace(url, " URL ")
        
        text = re.sub(r"[^a-zA-Z0-9]", ' ', text) #alphanumeric lowercase
        tokens = word_tokenize(text)
        if self.agg_ints:
            tokens = [re.sub(r"^[0-9]+$", 'INTEGER', token) for token in tokens]
        if self.agg_leading_zeros:
            tokens = [re.sub(r"^[0]+.*", 'LEADING_ZERO', token) for token in tokens]
        if self.remove_stop:
            tokens = [token for token in tokens if token not in stopwords.words("english")]
        if self.stemmer:
            tokens = [PorterStemmer().stem(w) for w in tokens]

        return Counter(tokens)  


    
    def transform(self, X):
        X_transformed = []
        for file in os.listdir(X):
            if not file.startswith('.'):
                isSpam = file.split('.')[-1] == 'spam'
                msg_objc = email.message_from_string(open(os.path.join(X,file), encoding='latin1').read())
                msg_content = self.retrieve_text_content(msg_objc)
                token_counter = self.tokenize_and_count(msg_content)                
                X_transformed.append([isSpam,token_counter])
        return X_transformed


            

[nltk_data] Downloading package punkt to /Users/daltonsi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daltonsi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
spam_token_counter = SpamTokenCounter()
training_spam_token_counters = spam_token_counter.fit_transform(TRAINING_PATH)
test_spam_token_counters = spam_token_counter.fit_transform(TEST_PATH)

In [49]:
spam_token_counters[:10]

[[False,
  Counter({'how': 1,
           'easi': 2,
           'would': 1,
           'code': 4,
           'menu': 2,
           'item': 1,
           'mark': 1,
           'read': 3,
           'messag': 5,
           'current': 2,
           'i': 9,
           'often': 1,
           'problem': 1,
           'heavi': 1,
           'traffic': 1,
           'mail': 2,
           'list': 3,
           'exmh': 11,
           'want': 2,
           'subset': 1,
           'lose': 1,
           'track': 1,
           'particular': 2,
           'folder': 2,
           'thi': 1,
           'could': 1,
           'also': 1,
           'gener': 1,
           'new': 1,
           'sequenc': 2,
           'way': 1,
           'remov': 1,
           'first': 1,
           'excus': 1,
           'still': 2,
           'use': 1,
           'integ': 7,
           'may': 1,
           'play': 1,
           'well': 1,
           'recent': 1,
           'chang': 1,
           'look': 2,
           'and

In [158]:
# REF: https://wkirgsn.github.io/2018/02/15/pandas-pipelines/    
class BOWTransformer(BaseEstimator, TransformerMixin):
    """ Transforms each email of a spam or ham directory into a Counter of counted tokens """
    def __init__(self, vocab=None, vocab_size=1000, min_count=7):
        self.vocab = vocab
        self.vocab_size = vocab_size
        self.min_count = min_count
        return None

    def fit(self, X, y=None):
        return self

    def create_vocab(self, counters):
        
        full_vocab = sorted(sum(counters, Counter()).items(), key=lambda item: item[1], reverse=True)
        full_vocab_notail = [w[0] for w in full_vocab if w[1] >= self.min_count]
        vocab_size = min(self.vocab_size, len(full_vocab_notail))
        return full_vocab_notail[:vocab_size]

    def calculate_BOW(self, tokens: Counter):
        bow_dict = dict.fromkeys(self.vocab,0)
        for count, token in tokens.items():
            if token in bow_dict:
                bow_dict[token] += count
        return bow_dict

    
    def transform(self, X):
        bows = []
        
        if self.vocab == None:
            self.vocab = self.create_vocab([x[1] for x in X])

        
        for x in X:
            counter = x[1]
            bow = self.calculate_BOW(counter)
            bow['target_isSpam'] = x[0]
            bows.append(bow)

        df = pd.DataFrame(bows) 
        y, X = df.loc[:,'target_isSpam'], df.drop('target_isSpam', axis=1)
        return y, X
        

         

        
        
        

In [157]:
training_bow_transformer = BOWTransformer()
y_train, X_train = training_bow_transformer.fit_transform(training_spam_token_counters)

test_bow_transformer = BOWTransformer(vocab=list(X_train.columns))
y_test, X_test = test_bow_transformer.fit_transform(test_spam_token_counters)




<class 'NoneType'>
<class 'list'>


In [163]:
X_train

Unnamed: 0,integ,i,url,the,list,use,mail,get,one,time,...,player,cat,pattern,replac,declar,choos,challeng,washington,rebuild,avoid
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2439,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2440,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


0       False
1       False
2       False
3       False
4       False
        ...  
2437     True
2438    False
2439    False
2440    False
2441    False
Name: target_isSpam, Length: 2442, dtype: bool

['10acpi',
 '10t03',
 '11b',
 '11th',
 '1990',
 '19th',
 '1mb',
 '1pttran',
 '1st',
 '2c',
 '2e',
 '2nd',
 '3d',
 '3g',
 '3k',
 '3rd',
 '4cnc7',
 '4th',
 '5a',
 '5th',
 '6b',
 '8e',
 '8th',
 '90',
 'a',
 'a0',
 'a1',
 'a2',
 'a3',
 'a4',
 'a52',
 'aa',
 'aachen',
 'aalib',
 'aaxin',
 'ab',
 'abandon',
 'abidjan',
 'abil',
 'abiword',
 'abl',
 'abonn',
 'about',
 'abroad',
 'absenc',
 'absolut',
 'abstract',
 'abus',
 'ac',
 'academ',
 'academi',
 'acceler',
 'accept',
 'access',
 'accid',
 'accommod',
 'accompagn',
 'accompani',
 'accomplish',
 'accord',
 'accordingli',
 'account',
 'accumul',
 'accur',
 'accuraci',
 'accus',
 'achiev',
 'acknowledg',
 'acm',
 'acquir',
 'acquisit',
 'across',
 'act',
 'action',
 'activ',
 'activist',
 'actual',
 'acuit',
 'ad',
 'adam',
 'adapt',
 'add',
 'addict',
 'addit',
 'addr',
 'address',
 'adequ',
 'adjust',
 'admin',
 'administr',
 'admiss',
 'admit',
 'adopt',
 'adress',
 'adult',
 'advanc',
 'advantag',
 'adventur',
 'advers',
 'adversari',

Unnamed: 0,10acpi,10t03,11b,11th,1990,19th,1mb,1pttran,1st,2c,...,zdnet,zealand,zealot,zero,ziggi,zimbabw,zip,zone,zope,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
