In [1]:
#Apache Spam Classifier

# Get the Data

In [18]:
import requests
import os
import shutil

URL_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
HAM_FILE = '20021010_easy_ham.tar.bz2'
SPAM_FILE = '20021010_spam.tar.bz2'


MAIN_PATH = os.getcwd()
SOURCE_PATH = os.path.join(MAIN_PATH, 'data/source/')

def reset_directory(directory):
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory) 

def download_spam():
    """ Download Spam/Ham sources into local directory """
    reset_directory(SOURCE_PATH)
    
    for filename in [HAM_FILE, SPAM_FILE]:
        r = requests.get(URL_ROOT + filename)
        filepath = os.path.join(SOURCE_PATH, filename)
        with open(filepath, 'wb') as destination:
            destination.write(r.content)



In [3]:
download_spam()

In [19]:
import tarfile

EMAILS_PATH = os.path.join(SOURCE_PATH, 'emails/')

def extract_spam():
    """ Extracts spam email files from compressed directory """
    reset_directory(EMAILS_PATH)
    
    for file in os.listdir(SOURCE_PATH):
        if file.endswith(".bz2"): 
            filepath = os.path.join(SOURCE_PATH, file)
            tar = tarfile.open(filepath, mode='r:bz2')
            tar.extractall(EMAILS_PATH)
        else:
             continue


In [5]:
extract_spam()

# Prepare the Data

In [22]:
import random
import glob
import re


TRAINING_PATH = os.path.join(MAIN_PATH, 'data/training/source/')
TEST_PATH = os.path.join(MAIN_PATH, 'data/test/source/')

EMAIL_SPAM_PATH = os.path.join(EMAILS_PATH, 'spam/')
EMAIL_HAM_PATH = os.path.join(EMAILS_PATH, 'easy_ham/')

NUM_SPAM = len(os.listdir(EMAIL_SPAM_PATH))
NUM_HAM = len(os.listdir(EMAIL_HAM_PATH))



def split_spam_train_test(seed=50, percent_test=20):
    """ Splits spam email files into test/train directories """
    reset_directory(TRAINING_PATH)
    reset_directory(TEST_PATH)
    random.seed(seed)
    
    spam_test_indcs = random.sample([i for i in range(NUM_SPAM)], k=int(NUM_SPAM*percent_test/100))
    ham_test_indcs = random.sample([i for i in range(NUM_HAM)], k=int(NUM_HAM*percent_test/100))
    
    for file in os.listdir(EMAIL_SPAM_PATH):
        _id = int(file.split('.')[0])
        source_path = os.path.join(EMAIL_SPAM_PATH, file)

        if _id in spam_test_indcs:
            destination = os.path.join(TEST_PATH, file + '.spam')
            os.popen(f"cp {source_path} {destination}") 

        else:
            destination = os.path.join(TRAINING_PATH, file + '.spam')
            os.popen(f"cp {source_path} {destination}") 

        
    for file in os.listdir(EMAIL_HAM_PATH):
        _id = int(file.split('.')[0])
        source_path = os.path.join(EMAIL_HAM_PATH, file)

        if _id in ham_test_indcs:
            destination = os.path.join(TEST_PATH, file + '.ham')
            os.popen(f"cp {source_path} {destination}") 

        else:
            destination = os.path.join(TRAINING_PATH, file + '.ham')
            os.popen(f"cp {source_path} {destination}") 




In [23]:
split_spam_train_test()

In [None]:
import nltk
import pandas as pd
import email
import urlextract
from bs4 import BeautifulSoup
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter
nltk.download('punkt')
nltk.download('stopwords')




# REF: https://wkirgsn.github.io/2018/02/15/pandas-pipelines/    
class SpamTokenCounter(BaseEstimator, TransformerMixin):
    def __init__(self, remove_urls=True, remove_stop=True, agg_ints=True, agg_leading_zeros=True, stemmer=True):
        self.remove_urls = remove_urls
        self.remove_stop = remove_stop
        self.agg_ints = agg_ints
        self.agg_leading_zeros = agg_leading_zeros
        self.stemmer = stemmer
        return None

    def fit(self, X, y=None):
        return self
    
    def process_content(self, mail_objc):
        part_type = mail_objc.get_content_type()
        if part_type == 'text/plain':
            return mail_objc.get_payload()
        elif part_type == 'text/html':
            html = mail_objc.get_payload()
            soup = BeautifulSoup(html, features="html.parser")
            for script in soup(["script", "style"]):
                script.extract() 
            if soup.body == None:
                return ''
            return soup.body.get_text()
        else:
            return ''
    
    def retrieve_text_content(self, x_msg):
         if x_msg.is_multipart():
            prcssd_msg = ''
            for msg in x_msg.walk():
                payload = self.process_content(msg)
                if payload:
                    prcssd_msg += payload
            
            return prcssd_msg
         else:
            return self.process_content(x_msg)       
  
    def tokenize_and_count(self, text: str):
        
        if self.remove_urls:
            url_extractor = urlextract.URLExtract()
            urls = list(set(url_extractor.find_urls(text)))
            for url in urls:
                text = text.replace(url, " URL ")
        
        text = re.sub(r"[^a-zA-Z0-9]", ' ', text) #alphanumeric lowercase
        tokens = word_tokenize(text)
        if self.agg_ints:
            tokens = [re.sub(r"^[0-9]+$", 'INTEGER', token) for token in tokens]
        if self.agg_leading_zeros:
            tokens = [re.sub(r"^[0]+.*", 'LEADING_ZERO', token) for token in tokens]
        if self.remove_stop:
            tokens = [token for token in tokens if token not in stopwords.words("english")]
        if self.stemmer:
            tokens = [PorterStemmer().stem(w) for w in tokens]

        return Counter(tokens)  


    
    def transform(self, X):
        X_transformed = []
        for file in os.listdir(X):
            if not file.startswith('.'):
                isSpam = file.split('.')[-1] == 'spam'
                msg_objc = email.message_from_string(open(os.path.join(X,file), encoding='latin1').read())
                msg_content = self.retrieve_text_content(msg_objc)
                token_counter = self.tokenize_and_count(msg_content)                
                X_transformed.append([isSpam,token_counter])
        return X_transformed


            

In [None]:
spam_token_counters = SpamTokenCounter().fit_transform(TRAINING_PATH)

In [45]:
spam_token_counters[:10]

[Counter({'how': 1,
          'easi': 2,
          'would': 1,
          'code': 4,
          'menu': 2,
          'item': 1,
          'mark': 1,
          'read': 3,
          'messag': 5,
          'current': 2,
          'i': 9,
          'often': 1,
          'problem': 1,
          'heavi': 1,
          'traffic': 1,
          'mail': 2,
          'list': 3,
          'exmh': 11,
          'want': 2,
          'subset': 1,
          'lose': 1,
          'track': 1,
          'particular': 2,
          'folder': 2,
          'thi': 1,
          'could': 1,
          'also': 1,
          'gener': 1,
          'new': 1,
          'sequenc': 2,
          'way': 1,
          'remov': 1,
          'first': 1,
          'excus': 1,
          'still': 2,
          'use': 1,
          'integ': 7,
          'may': 1,
          'play': 1,
          'well': 1,
          'recent': 1,
          'chang': 1,
          'look': 2,
          'and': 1,
          'top': 1,
          'head': 2,
      

In [168]:
import nltk
import pandas as pd
import email
import urlextract
from bs4 import BeautifulSoup
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize



class EmailParser():
    """ Class to load and process spam email content """
    def __init__(self, file_path):
        self.is_spam = self.check_Spam(file_path.split('.')[-1])
        self.mail_objc = self.open_mail_objc(file_path)
        
    
    def check_Spam(self, spam_string):
        return spam_string == 'spam'
    
    def open_mail_objc(self, file_path):
        msg_str = open(file_path, encoding='latin1').read()
        return email.message_from_string(msg_str)
        
    def process_content(self, mail_objc):
        part_type = mail_objc.get_content_type()
        if part_type == 'text/plain':
            return mail_objc.get_payload()
        elif part_type == 'text/html':
            html = mail_objc.get_payload()
            soup = BeautifulSoup(html, features="html.parser")
            for script in soup(["script", "style"]):
                script.extract() 
            if soup.body == None:
                return ''
            text = soup.body.get_text()
            return text
        else:
            return ''
        
        
    def process_email(self):
        if self.mail_objc.is_multipart():
            prcssd_msg = ''
            for msg in self.mail_objc.walk():
                
                payload = self.process_content(msg)
                if payload:
                    prcssd_msg += payload
            
            return self.is_spam, prcssd_msg
        else:
            return self.is_spam, self.process_content(self.mail_objc)
            
    
def tokenize_text(text: str, remove_urls=True, remove_stop=True, agg_ints=True, agg_leading_zeros=True, stemmer=True):
        
        if remove_urls:
            url_extractor = urlextract.URLExtract()
            urls = list(set(url_extractor.find_urls(text)))
            for url in urls:
                text = text.replace(url, " URL ")
        
        text, count = re.subn(r"[^a-zA-Z0-9]", ' ', text) #alphanumeric lowercase
        tokens = word_tokenize(text)
        if agg_ints:
            tokens = [re.sub(r"^[0-9]+$", 'INTEGER', token) for token in tokens]
        if agg_leading_zeros:
            tokens = [re.sub(r"^[0]+.*", 'LEADING_ZERO', token) for token in tokens]
        if remove_stop:
            tokens = [token for token in tokens if token not in stopwords.words("english")]
        if stemmer:
            tokens = [PorterStemmer().stem(w) for w in tokens]
        
        return tokens           


def create_spam_token_sets(directory):
    spam_token_sets = []
    
    for file in os.listdir(directory):
        if not file.startswith('.'):
            file_path = os.path.join(directory + file)
            isSpam, content = EmailParser(file_path).process_email()
            tokens = tokenize_text(content)
            spam_token_sets.append([isSpam,tokens])
    return spam_token_sets





[nltk_data] Downloading package punkt to /Users/daltonsi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daltonsi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [148]:
training_tokensets = create_spam_token_sets(TRAINING_PATH)
test_tokensets = create_spam_token_sets(TEST_PATH)

mv 1 00001.bfc8d64d12b325ff385cca8d07b84288
mv 10 00010.7f5fb525755c45eb78efc18d7c9ea5aa
mv 100 00100.c60d1c697136b07c947fa180ba3e0441
mv 101 00101.2dfd7ee79ae439b8d9c38e783a137efa
mv 102 00102.2e3969075728dde7a328e05d19b35976
mv 103 00103.8c39bfed2079f865e9dfb75f4416a468
mv 104 00104.886f4a22362f4d3528c3e675878f17f7
mv 105 00105.9790e1c57fcbf7885b7cd1719fb4681b
mv 106 00106.fa6df8609cebb6f0f37aec3f70aa5b9a
mv 107 00107.f1d4194b57840ea6587b9a73ed88e075
mv 108 00108.4506c2ef846b80b9a7beb90315b22701
mv 109 00109.601a9cd8272f22236b27e95dbe2fa22d
mv 11 00011.2a1247254a535bac29c476b86c708901
mv 110 00110.20934dc65c9a88fc9c6afda9952ce2c5
mv 111 00111.a163d41592b3a52747d7521341a961af
mv 112 00112.ec411d26d1f4decc16af7ef73e69a227
mv 113 00113.ff113297f0ed07536d288c7b2193a8ec
mv 114 00114.c104ada3a249e1e1846c0cd156a303e9
mv 115 00115.d7c257361675ee5d45baa552205fb472
mv 116 00116.8e13644b995f98dbab198b71e26f67ec
mv 117 00117.33011fddf61efe5f453a14468ff7e629
mv 118 00118.4be8b50c2a818c62b62e70c4b

In [175]:
from collections import Counter


def create_wordset(training_tokensets:list, min_token_count=7):
    list_of_tokensets = [t[1] for t in training_tokensets]
    token_counter =  Counter(list([token for tokenset in list_of_tokensets for token in tokenset]))
    return sorted(key for key, value in token_counter.items() if value >= min_token_count)



In [176]:
training_wordset = create_wordset(training_tokensets)

In [178]:
training_wordset

['10acpi',
 '10t03',
 '11b',
 '11th',
 '1990',
 '19th',
 '1mb',
 '1pttran',
 '1st',
 '2c',
 '2e',
 '2nd',
 '3d',
 '3g',
 '3k',
 '3rd',
 '4cnc7',
 '4th',
 '5a',
 '5th',
 '6b',
 '8e',
 '8th',
 '90',
 'a',
 'a0',
 'a1',
 'a2',
 'a3',
 'a4',
 'a52',
 'aa',
 'aachen',
 'aalib',
 'aaxin',
 'ab',
 'abandon',
 'abidjan',
 'abil',
 'abiword',
 'abl',
 'abonn',
 'about',
 'abroad',
 'absenc',
 'absolut',
 'abstract',
 'abus',
 'ac',
 'academ',
 'academi',
 'acceler',
 'accept',
 'access',
 'accid',
 'accommod',
 'accompagn',
 'accompani',
 'accomplish',
 'accord',
 'accordingli',
 'account',
 'accumul',
 'accur',
 'accuraci',
 'accus',
 'achiev',
 'acknowledg',
 'acm',
 'acquir',
 'acquisit',
 'across',
 'act',
 'action',
 'activ',
 'activist',
 'actual',
 'acuit',
 'ad',
 'adam',
 'adapt',
 'add',
 'addict',
 'addit',
 'addr',
 'address',
 'adequ',
 'adjust',
 'admin',
 'administr',
 'admiss',
 'admit',
 'adopt',
 'adress',
 'adult',
 'advanc',
 'advantag',
 'adventur',
 'advers',
 'adversari',

In [186]:

def calculate_BOW(tokens: list, wordset:list):
    bow_dict = dict.fromkeys(wordset,0)
    for token in tokens:
        if token in bow_dict:
            bow_dict[token]=tokens.count(token)
    return bow_dict

def create_bow_df(target_token_pairs:list, wordset:list):
    bows = []
    for pair in target_token_pairs[:10]:
        target = pair[0]
        tokens = pair[1]
        bow = calculate_BOW(tokens, wordset)
        if target:
            bow['target_isSpam'] = 1
        else:
            bow['target_isSpam'] = 0
        bows.append(bow)
    
    df = pd.DataFrame(bows) 
    
    return df.loc[:,'target_isSpam'], df.drop('target_isSpam', axis=1)

 
        




In [187]:
y_train, X_train = create_bow_df(training_tokensets, training_wordset)
y_test, X_test = create_bow_df(test_tokensets, training_wordset)




In [189]:
X_train


Unnamed: 0,10acpi,10t03,11b,11th,1990,19th,1mb,1pttran,1st,2c,...,zdnet,zealand,zealot,zero,ziggi,zimbabw,zip,zone,zope,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
training_path = '/data/training/processed/'
test_path = '/data/test/processed/' 
create_directory(cwd, training_path)
create_directory(cwd, test_path)


y_train.to_csv(cwd+training_path+'y_train.csv')
X_train.to_csv(cwd+training_path+'X_train.csv')

X_test.to_csv(cwd+test_path+'X_test.csv')
y_test.to_csv(cwd+test_path+'y_test.csv')