In [35]:
#Apache Spam Classifier

# Get the Data

In [561]:
import requests
import os
import shutil

URL_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
HAM_FILE = '20021010_easy_ham.tar.bz2'
SPAM_FILE = '20021010_spam.tar.bz2'


MAIN_PATH = os.getcwd()
SOURCE_PATH = os.path.join(MAIN_PATH, '/data/source/')

def reset_directory(directory):
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory) 

def download_spam():
    """ Download Spam/Ham sources into local directory """
    reset_directory(SOURCE_PATH)
    
    for filename in [HAM_FILE, SPAM_FILE]:
        r = requests.get(URL_ROOT + filename)
        filepath = os.path.join(SOURCE_PATH, filename)
        with open(filepath, 'wb') as destination:
            destination.write(r.content)

print(SOURCE_PATH)

/data/source/


In [562]:
download_spam()

OSError: [Errno 30] Read-only file system: '/data'

In [560]:
import tarfile

EMAILS_PATH = os.path.join(SOURCE_PATH, 'emails/')

def extract_spam():
    """ Extracts spam email files from compressed directory """
    reset_directory(EMAILS_PATH)
    
    for file in os.listdir(SOURCE_PATH):
        if file.endswith(".bz2"): 
            filepath = os.path.join(SOURCE_PATH, file)
            tar = tarfile.open(filepath, mode='r:bz2')
            tar.extractall(EMAILS_PATH)
        else:
             continue
print(EMAILS_PATH)

/data/source/emails/


In [540]:
extract_spam()

# Prepare the Data

In [558]:
import random
import glob
import re



TRAINING_PATH = os.path.join(MAIN_PATH, '/data/training/source/')
print(TRAINING_PATH)


TEST_PATH = os.path.join(MAIN_PATH, '/data/test/source/')
EMAIL_SPAM_PATH = os.path.join(EMAILS_PATH, 'spam/')
EMAIL_HAM_PATH = os.path.join(EMAILS_PATH, 'easy_ham/')

NUM_SPAM = len(os.listdir(EMAIL_SPAM_PATH))
NUM_HAM = len(os.listdir(EMAIL_HAM_PATH))

print(MAIN_PATH)
print(TRAINING_PATH)

def split_spam_train_test(seed=50, percent_test=20):
    reset_directory(TRAINING_PATH)
    reset_directory(TEST_PATH)
    random.seed(seed)
    
    spam_test_indicies = random.sample([i for i in range(NUM_SPAM)], k=int(NUM_SPAM*percent_test/100))
    ham_test_indicies = random.sample([i for i in range(NUM_HAM)], k=int(NUM_HAM*percent_test/100))
    
    for file in os.listdir(EMAIL_SPAM_PATH):
        filename = os.fsdecode(file)
        _id = int(filename.split('.')[0])
        source_path = os.path.join(EMAIL_SPAM_PATH, filename)

        if _id in spam_test_indicies:
            destination = os.path.join(TEST_PATH, filename, '.spam')
            os.popen(f"cp {source_path} {destination}") 

        else:
            destination = os.path.join(TRAINING_PATH, filename, '.spam')
            os.popen(f"cp {spam_path+filename} {training_path+filename+'.spam'}") 

        
    for file in os.listdir(EMAIL_HAM_PATH):
        filename = os.fsdecode(file)
        _id = int(filename.split('.')[0])
        source_path = os.path.join(EMAIL_HAM_PATH, filename)

        if _id in ham_test_indicies:
            destination = os.path.join(TEST_PATH, filename, '.ham')
            os.popen(f"cp {source_path} {destination}") 

        else:
            destination = os.path.join(TRAINING_PATH, filename, '.ham')
            os.popen(f"cp {source_path} {training_path+filename+'.ham'}") 










/data/training/source/
/Users/daltonsi/Analysis/Projects/spam-classifier
/data/training/source/


In [549]:
print(MAIN_PATH)
print(TRAINING_PATH)
os.makedirs(TRAINING_PATH) 


#split_spam_train_test()

/Users/daltonsi/Analysis/Projects/spam-classifier
/data/training/source/


OSError: [Errno 30] Read-only file system: '/data'

In [450]:
import nltk
import pandas as pd
import email
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize


def tokenize_text(text: str, remove_stop=True, agg_ints=True, agg_leading_zeros=True, stemmer=True):
        
        text, count = re.subn(r"[^a-zA-Z0-9]", ' ', text) #alphanumeric lowercase


        tokens = word_tokenize(text)
        if agg_ints:
            tokens = [re.sub(r"^[0-9]+$", 'INTEGER', token) for token in tokens]
        if agg_leading_zeros:
            tokens = [re.sub(r"^[0]+.*", 'LEADING_ZERO', token) for token in tokens]
        if remove_stop:
            tokens = [token for token in tokens if token not in stopwords.words("english")]
        if stemmer:
            tokens = [PorterStemmer().stem(w) for w in tokens]
        return tokens

    
def parse_msg_list(msg_list: list):
    full_msg = ''
    for msg in msg_list:
        msg = msg.get_payload()
        if type(msg) == list:
            full_msg += parse_msg_list(msg)
        else:
            full_msg += msg
    return full_msg


    
def create_target_tokenset_pairs(directory: str):
    target_tokenset_pairs = []
    
    for file_path in os.listdir(directory):
        if not file_path.startswith('.'):
            target = file_path.split('.')[-1]
            msg_str = open(directory+file_path, encoding='latin1').read()
            payload = email.message_from_string(msg_str).get_payload()
            if payload == list:
                full_msg = parse_msg_list(payload)
            else:
                full_msg = payload
            try:
                msg_tokens = tokenize_text(full_msg)
            except:
                print(type(full_msg))
                print(full_msg)
                print()
            target_tokenset_pairs.append((target, msg_tokens))
    return target_tokenset_pairs



def create_wordset(list_of_target_tokensets:list):
    return sorted(list(set([token for tokenset in list_of_target_tokensets for token in tokenset])))
    
    


[nltk_data] Downloading package punkt to /Users/daltonsi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daltonsi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [451]:
training_target_tokenset_pairs = create_target_tokenset_pairs(cwd+training_path)
test_target_tokenset_pairs = create_target_tokenset_pairs(cwd+test_path)


training_wordset = create_wordset([t[1] for t in training_target_tokenset_pairs])

<class 'list'>
[<email.message.Message object at 0x141a73d00>, <email.message.Message object at 0x141a73b20>]

<class 'list'>
[<email.message.Message object at 0x141a73160>, <email.message.Message object at 0x141a73ca0>]

<class 'list'>
[<email.message.Message object at 0x141a73b20>]

<class 'list'>
[<email.message.Message object at 0x141a73490>, <email.message.Message object at 0x141a734c0>, <email.message.Message object at 0x141a73fd0>]

<class 'str'>

[can't think of how I'd be running
afoul of the spam filters with this
post, so here's the second try...]

Kissinger's book _Does America Need
a Foreign Policy?_ provides a few
handy abstractions:

> The ultimate dilemma of the statesman is to strike a balance between
> values ["idealism"] and interests ["realism"] and, occasionally,
> between peace and justice.

Also, he views historical American
approaches to foreign policy as a
bundle of three fibers:

Hamiltonian - We should only get
  involved in foreign adventures
  to preserve b

<class 'list'>
[<email.message.Message object at 0x1416dcaf0>, <email.message.Message object at 0x141a736d0>]

<class 'list'>
[<email.message.Message object at 0x1416e7df0>, <email.message.Message object at 0x141a734f0>]

<class 'list'>
[<email.message.Message object at 0x1416dcaf0>, <email.message.Message object at 0x141a734c0>]

<class 'list'>
[<email.message.Message object at 0x1416e7850>, <email.message.Message object at 0x141a734f0>]

<class 'list'>
[<email.message.Message object at 0x1416e7520>]

<class 'list'>
[<email.message.Message object at 0x1416dcaf0>, <email.message.Message object at 0x141a734f0>]

<class 'list'>
[<email.message.Message object at 0x1416e7df0>]

<class 'list'>
[<email.message.Message object at 0x1416e7520>]

<class 'list'>
[<email.message.Message object at 0x1416e7f70>, <email.message.Message object at 0x141a734f0>]

<class 'list'>
[<email.message.Message object at 0x1416e7f70>]

<class 'list'>
[<email.message.Message object at 0x1416e7df0>]

<class 'list'>

<class 'list'>
[<email.message.Message object at 0x1416e7af0>, <email.message.Message object at 0x141a734f0>]

<class 'list'>
[<email.message.Message object at 0x1416e7df0>, <email.message.Message object at 0x141a73ca0>]

<class 'list'>
[<email.message.Message object at 0x1416e7f70>, <email.message.Message object at 0x141a73ca0>]

<class 'list'>
[<email.message.Message object at 0x1416e7520>, <email.message.Message object at 0x141a73ca0>]

<class 'list'>
[<email.message.Message object at 0x1416e7df0>, <email.message.Message object at 0x141a738e0>, <email.message.Message object at 0x141a73ca0>]

<class 'list'>
[<email.message.Message object at 0x1416dcaf0>]

<class 'list'>
[<email.message.Message object at 0x141a73490>]

<class 'list'>
[<email.message.Message object at 0x1416dcac0>, <email.message.Message object at 0x141a73fd0>]

<class 'list'>
[<email.message.Message object at 0x1416e7850>, <email.message.Message object at 0x141a73490>]

<class 'list'>
[<email.message.Message object at

In [421]:
training_wordset

['100',
 '100802b',
 '100gb',
 '100k',
 '100mbp',
 '100x',
 '1010ff',
 '101a',
 '101aug',
 '101pbkg3860zion2',
 '1024x',
 '1024x768',
 '102a',
 '102aug',
 '1035kb',
 '103a',
 '103aug',
 '104a',
 '104vokj8349uvae9',
 '105th',
 '10798aa48d25b3f61778f379964a57e5',
 '108b',
 '10acpi',
 '10adc',
 '10c7',
 '10frie',
 '10gig',
 '10k',
 '10lz2e2pp',
 '10mb',
 '10p',
 '10pt',
 '10px',
 '10smp',
 '10str',
 '10t02',
 '10t03',
 '10th',
 '10wpm',
 '10xit6txslh4q39nbmrq8kesb7ccfo3t',
 '11',
 '111mph',
 '1120a7d868b23e83b91ad00ec8b79e08',
 '112b010a30255d7d14ee9465d4fe804c',
 '112th',
 '1139xcyb6',
 '113b',
 '113ec7122d4046a2754bcf70b9fb5299',
 '115200n81',
 '115mhzg9267xwnfl40',
 '115px',
 '115qsjm1944hylu8',
 '116cjyf2258aegj1',
 '118c',
 '119a135',
 '11a',
 '11a23',
 '11am',
 '11b',
 '11bc31540055c320b62e5886ef27c4b2',
 '11c3',
 '11cf',
 '11ge8kmidapp7fcz',
 '11honi2cticrkqiyx',
 '11k',
 '11mb',
 '11mbp',
 '11mxwo6z',
 '11pm',
 '11pt',
 '11px',
 '11t22',
 '11taoxofzjh',
 '11th',
 '1200bp',
 '1245e

In [152]:


def calculate_BOW(vocab:list, tokens: list):
    bow_dict = dict.fromkeys(vocab,0)
    for token in tokens:
        bow_dict[token]=tokens.count(token)
    return bow_dict

def create_bow_df(target_token_pairs:list, vocab:list):
    bows = []
    for pair in target_token_pairs[:10]:
        target = pair[0]
        tokens = pair[1]
        bow = calculate_BOW(vocab, tokens)
        if target == 'spam':
            bow['target_isSpam'] = 1
        else:
            bow['target_isSpam'] = 0
        bows.append(bow)
    
    df = pd.DataFrame(bows) 
    
    return df.loc[:,'target_isSpam'], df.drop('target_isSpam', axis=1)

 
        




In [131]:
y_train, X_train = create_bow_df(training_target_tokenset_pairs, vocab)
y_test, X_test = create_bow_df(test_target_tokenset_pairs, vocab)





KeyError: 'target_isSpam'

In [125]:
training_path = '/data/training/processed/'
test_path = '/data/test/processed/' 
create_directory(cwd, training_path)
create_directory(cwd, test_path)


y_train.to_csv(cwd+training_path+'y_train.csv')
X_train.to_csv(cwd+training_path+'X_train.csv')

X_test.to_csv(cwd+test_path+'X_test.csv')
y_test.to_csv(cwd+test_path+'y_test.csv')

TypeError: 'tuple' object is not callable

Unnamed: 0,0,00,000,0000,00000,000000,000000000,0000006f,00000073,00000094,...,zzzzail,zzzzason,zzzzcard,zzzzcc,zzzzn,zzzzrubin,zzzzteana,zzzzv,zzzzvf,zzzzzv
0,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,16,0,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,12,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,8,1,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,7,0,0,0
7,8,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,7,11,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,7,0,0,0


Unnamed: 0,0,00,000,0000,00000,000000,000000000,0000006f,00000073,00000094,...,zzzzail,zzzzason,zzzzcard,zzzzcc,zzzzn,zzzzrubin,zzzzteana,zzzzv,zzzzvf,zzzzzv
0,10,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,25,11,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,10,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,12,1,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,6,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,11,6,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,13,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
