### We will use lingspam corpus for our mail spam filtering.
#### Let's start with function to get the data.

In [1]:
import pandas as pd
import os
import math
import re

def get_directory_path( set_lemmatizer : bool, use_stop_list: bool ):
    base_path = './lingspam_public/'
    directory_map = { (False, False):'bare',(True, False):'lemm',
              (True, True):'lemm_stop',(False,True):'stop' };
    directory = directory_map[(set_lemmatizer, use_stop_list)];
    return base_path + directory;

def mail_content_to_word_freq_map( email_body : str ):
    mail_content_to_word_f = {}
    for word in email_body.split(' '):
        if not ( word in mail_content_to_word_f.keys() ):
            mail_content_to_word_f[ word ] = 0;
        mail_content_to_word_f[ word ] = mail_content_to_word_f[ word ] + 1
    return mail_content_to_word_f;

def get_datasets( set_lemmatizer : bool, use_stop_list : bool, include_subject=False ):
    dirIs = get_directory_path( set_lemmatizer, use_stop_list );
    datasets = [];
    regex = re.compile('[^a-zA-Z ]')
    for i in range(1,11):
        mailsToIsSpam = {}
        tempDir = dirIs + '/part'+str(i)+'/';
        for r, d, f in os.walk(tempDir):
            for file in f:
                isSpam = False;
                if( 'spmsg' in file ):
                    isSpam = True;
                fileData = open(tempDir +file,'r').read()
                re.sub("[^a-zA-Z ]",' ', fileData);
                re.sub("[ ]+", ' ', fileData);
                if not include_subject:
                    fileData = '\n'.join(fileData.split('\n')[2:])
                mailsToIsSpam[ fileData ] = isSpam;
        datasets.append( mailsToIsSpam );
    return datasets;

### Question is do we need to weed out single characters like *, \n, etc that get into our input before we train/test our learning algorithm ? Those words may be occuring with very very low probability, so let's try it out without that.

In [2]:
#Assumption that 10% of emails we receive are spam
prob_email_is_spam = 0.1; #10 percent of mails
prob_email_is_non_spam = 1 - prob_email_is_spam;
spam = 'spam'
notspam = 'nspam'

def train( training_data_list ):
    #we need to learn P( word = w | mail = spam )
    category_to_word_to_count = {spam:{}, notspam:{}};
    category_count = {spam:0, notspam:0};
    word_map = {};
    for training_data in training_data_list:
        for data in training_data.keys():
            isSpam = training_data[ data ]
            freq_map = mail_content_to_word_freq_map( data )
            total_words = sum( freq_map.values() )
            if isSpam:
                category = 'spam'
            else:
                category = 'nspam'
            category_count[category] = category_count[category] + 1;
            for word in freq_map.keys():
                if( not( word in category_to_word_to_count[category]) ):
                    category_to_word_to_count[category][word] = 0;
                category_to_word_to_count[category][word] = category_to_word_to_count[category][word] + 1;
                word_map[ word ] = True;
    
    prob_of_word_in_spam_mail = {};
    prob_of_word_in_non_spam_mail = {};
    vocabsize = len( word_map.keys() )
    for word in word_map.keys():
        wc_in_spam = 0;
        wc_in_non_spam = 0;
        if word in category_to_word_to_count[spam].keys():
            wc_in_spam = category_to_word_to_count[spam][word];
        if word in category_to_word_to_count[notspam].keys():
            wc_in_non_spam = category_to_word_to_count[notspam][word];
        prob_of_word_in_spam_mail[word] = ( 1 + wc_in_spam )/(wc_in_spam + wc_in_non_spam + vocabsize);
        prob_of_word_in_non_spam_mail[word] = ( 1 + wc_in_non_spam)/(wc_in_spam + wc_in_non_spam + vocabsize);
    return [ prob_of_word_in_spam_mail, prob_of_word_in_non_spam_mail ]

def test( test_data_list, train_metadata ):
    [ prob_word_in_spam, prob_word_in_non_spam ] = train_metadata;
    testing_accuracy = [ 0.0, 0.0 ];
    for test_data in test_data_list:
        for data in test_data.keys():
            freq_map = mail_content_to_word_freq_map( data );
            pm_is_spam = math.log(prob_email_is_spam);
            pm_is_non_spam = math.log(prob_email_is_non_spam);
            for word in freq_map.keys():
                if word in prob_word_in_spam.keys():
                    pm_is_spam += math.log( prob_word_in_spam[ word ] );
                if word in prob_word_in_non_spam.keys():
                    pm_is_non_spam += math.log( prob_word_in_non_spam[ word ] );
            
            isActuallySpam = test_data[ data ];
            if ( pm_is_spam >= pm_is_non_spam ) and isActuallySpam :
                    testing_accuracy[0] = testing_accuracy[0] + 1;
            elif ( pm_is_spam < pm_is_non_spam ) and ( not isActuallySpam ) :
                    testing_accuracy[0] = testing_accuracy[0] + 1;
            testing_accuracy[1] = testing_accuracy[1] + 1;
    return testing_accuracy[0]/testing_accuracy[1];

#P( spam | words in email ) = P( words in email | spam )*P( email is spam ) / P( words in email ) 
#P( words in email ) = P( words in email | email is spam )*P( email is spam ) + P( words in email | email is not spam )*P( email is not spam)

#P( words in email | spam ) = Multiply( for each( P( word in email | spam )))

In [3]:
for lemmatizer in [True, False]:
    for stop in [True, False]:
        datasets = get_datasets( lemmatizer, stop );
        train_set = datasets[0:7];
        test_set = datasets[7:10];
        train_metadata = train( train_set );
        test_acc = test( test_set, train_metadata )
        print('Lemmatizer {} and Stop {} -> {}'.format(lemmatizer, stop, test_acc));


Lemmatizer True and Stop True -> 0.8434579439252337
Lemmatizer True and Stop False -> 0.8422897196261683
Lemmatizer False and Stop True -> 0.8422897196261683
Lemmatizer False and Stop False -> 0.8422897196261683
