### We will use lingspam corpus for our mail spam filtering.
#### Let's start with function to get the data.

In [1]:
import pandas as pd
import os

def get_directory_path( set_lemmatizer : bool, use_stop_list: bool ):
    base_path = './lingspam_public/'
    directory_map = { (False, False):'bare',(True, False):'lemm',
              (True, True):'lemm_stop',(False,True):'stop' };
    directory = directory_map[(set_lemmatizer, use_stop_list)];
    return base_path + directory;

def mail_content_to_word_freq_map( email_body : str ):
    mail_content_to_word_f = {}
    for word in email_body.split(' '):
        if not ( word in mail_content_to_word_f.keys() ):
            mail_content_to_word_f[ word ] = 0;
        mail_content_to_word_f[ word ] = mail_content_to_word_f[ word ] + 1
    return mail_content_to_word_f;

def get_datasets( set_lemmatizer : bool, use_stop_list : bool, include_subject=False ):
    dirIs = get_directory_path( set_lemmatizer, use_stop_list );
    datasets = [];
    for i in range(1,11):
        mailsToIsSpam = {}
        tempDir = dirIs + '/part'+str(i)+'/';
        for r, d, f in os.walk(tempDir):
            for file in f:
                isSpam = False;
                if( 'spmsg' in file ):
                    isSpam = True;
                fileData = open(tempDir +file,'r').read()
                if not include_subject:
                    fileData = '\n'.join(fileData.split('\n')[2:])
                mailsToIsSpam[ fileData ] = isSpam;
        datasets.append( mailsToIsSpam );
    return datasets;

### Question is do we need to weed out single characters like *, \n, etc that get into our input before we train/test our learning algorithm ? Those words may be occuring with very very low probability, so let's try it out without that.

In [2]:
#Assumption that 10% of emails we receive are spam
prob_email_is_spam = 0.1; #10 percent of mails
prob_email_is_non_spam = 1 - prob_email_is_spam;

def train( training_data_list ):
    #we need to learn P( word = w | mail = spam )
    word_to_mail_spam_count = {}
    word_to_mail_non_spam_count = {}
    total_words_in_spam = 0;
    total_words_in_non_spam = 0;
    for training_data in training_data_list:
        for data in training_data.keys():
            freq_map = mail_content_to_word_freq_map( data );
            total_words = sum( freq_map.values() )
            isSpam = training_data[ data ]
            if isSpam:
                for word in freq_map.keys():
                    if( not( word in word_to_mail_spam_count.keys() ) ):
                        word_to_mail_spam_count[ word ] = 0;
                    word_to_mail_spam_count[ word ] = word_to_mail_spam_count[ word ] + 1;
            else:
                for word in freq_map.keys():
                    if( not( word in word_to_mail_non_spam_count.keys() ) ):
                        word_to_mail_non_spam_count[ word ] = 0;
                    word_to_mail_non_spam_count[ word ] = word_to_mail_non_spam_count[ word] + 1;
            if isSpam:
                total_words_in_spam += sum( freq_map.values() )
            else:
                total_words_in_non_spam += sum( freq_map.values() )
    prob_of_word_in_spam_mail = {};
    prob_of_word_in_non_spam_mail = {};
    for word in word_to_mail_spam_count.keys():
        prob_of_word_in_spam_mail[ word ] = word_to_mail_spam_count[ word ]/total_words_in_spam;
    for word in word_to_mail_non_spam_count.keys():
        prob_of_word_in_non_spam_mail[ word ] = word_to_mail_non_spam_count[ word ]/total_words_in_non_spam;
    return [ prob_of_word_in_spam_mail, prob_of_word_in_non_spam_mail ]

def test( test_data_list, train_metadata ):
    [ prob_word_in_spam, prob_word_in_non_spam ] = train_metadata;
    testing_accuracy = [ 0.0, 0.0 ];
    for test_data in test_data_list:
        for data in test_data.keys():
            freq_map = mail_content_to_word_freq_map( data );
            pm_is_spam = prob_email_is_spam;
            pm_is_non_spam = prob_email_is_non_spam;
            for word in freq_map.keys():
                if word in prob_word_in_spam.keys():
                    pm_is_spam *= prob_word_in_spam[ word ]
                if word in prob_word_in_non_spam.keys():
                    pm_is_non_spam *= prob_word_in_non_spam[ word ];
            if ( pm_is_spam > pm_is_non_spam ):
                if test_data[data]:
                    testing_accuracy[0] = testing_accuracy[0] + 1;
            else:
                if not test_data[data]:
                    testing_accuracy[0] = testing_accuracy[0] + 1;
            testing_accuracy[1] = testing_accuracy[1] + 1;
    return testing_accuracy[0]/testing_accuracy[1];

#P( spam | words in email ) = P( words in email | spam )*P( email is spam ) / P( words in email ) 
#P( words in email ) = P( words in email | email is spam )*P( email is spam ) + P( words in email | email is not spam )*P( email is not spam)

#P( words in email | spam ) = Multiply( for each( P( word in email | spam )))

In [4]:
for lemmatizer in [True, False]:
    for stop in [True, False]:
        datasets = get_datasets( lemmatizer, stop );
        train_set = datasets[0:7];
        test_set = datasets[7:10];
        train_metadata = train( train_set );
        test_acc = test( test_set, train_metadata )
        print('Lemmatizer {} and Stop {} -> {}'.format(lemmatizer, stop, test_acc));


Lemmatizer True and Stop True -> 0.5969626168224299
Lemmatizer True and Stop False -> 0.6740654205607477
Lemmatizer False and Stop True -> 0.6086448598130841
Lemmatizer False and Stop False -> 0.677570093457944
