In [5]:
import glob
import math
import os

ALPHA = 1
V = 172000

def count_words(directory):
    """Count how often each token occurs across every file matching a glob.

    Each line of each file is treated as one token; only newline characters
    are stripped from it, so a blank line is counted as the empty-string
    token.

    Args:
        directory: Glob pattern (e.g. ``"./data/ham/*"``) handed to
            ``glob.glob`` to select the files to read.

    Returns:
        A ``(words, count)`` tuple where ``words`` maps each token to the
        number of times it was seen and ``count`` is the total number of
        lines read across all files.
    """
    words = {}
    count = 0

    for filename in glob.glob(directory):
        # The context manager closes each file as soon as it has been read,
        # so a large corpus cannot exhaust the process's file descriptors.
        with open(filename) as f:
            for line in f:
                token = line.strip("\n")
                words[token] = words.get(token, 0) + 1
                count += 1
    return words, count

def run_model(directory, spam, prob_spam, prob_unseen_spam, ham, prob_ham,
              prob_unseen_ham, truth_table):
    """Classify every email matching a glob as "Spam" or "Ham" and score it.

    For each email the log-score of a class is the class log-prior plus the
    sum of the per-word log-probabilities; words missing from a class's
    table contribute that class's "unseen" log-probability. The email is
    labelled "Ham" only when the ham score is strictly greater, so ties go
    to "Spam".

    Args:
        directory: Glob pattern selecting the email files; each line of a
            file is treated as one word.
        spam: Mapping of word -> log-probability under the spam class.
        prob_spam: Log-prior of the spam class.
        prob_unseen_spam: Log-probability used for words not in ``spam``.
        ham: Mapping of word -> log-probability under the ham class.
        prob_ham: Log-prior of the ham class.
        prob_unseen_ham: Log-probability used for words not in ``ham``.
        truth_table: Container of email identifiers that are truly spam;
            any identifier not in it is treated as ham.

    Returns:
        A ``(classification, accuracy)`` tuple. ``classification`` maps the
        email identifier (file name without its ``.words`` suffix) to a dict
        with keys ``"classification"`` and ``"truth"``. ``accuracy`` is the
        fraction of emails whose label matches the truth, or ``0.0`` when no
        file matched the pattern.
    """
    suffix = ".words"
    classification = {}
    differences = 0
    total_email = 0

    for path in glob.glob(directory):
        # Seed each score with the class log-prior so it takes part in the
        # decision; the per-word terms are added on top.
        total_prob_ham = prob_ham
        total_prob_spam = prob_spam

        with open(path) as f:
            for line in f:
                word = line.strip("\n")
                total_prob_ham += ham.get(word, prob_unseen_ham)
                total_prob_spam += spam.get(word, prob_unseen_spam)

        # Remove the literal ".words" suffix. str.strip(".words") would
        # instead strip any of the characters . w o r d s from both ends and
        # mangle names such as "sword.words".
        email = os.path.basename(path)
        if email.endswith(suffix):
            email = email[:-len(suffix)]

        truth = "Spam" if email in truth_table else "Ham"
        generated = "Ham" if total_prob_ham > total_prob_spam else "Spam"
        classification[email] = {"classification": generated, "truth": truth}

        if generated != truth:
            differences += 1
        total_email += 1

    # No matching files: report zero accuracy instead of dividing by zero.
    if total_email == 0:
        return classification, 0.0

    return classification, 1 - (differences/total_email)

def populate_truth(directory):
    """Collect every line of every file matching a glob into a set.

    Args:
        directory: Glob pattern selecting the truth file(s); each line is
            stored with its newline characters stripped.

    Returns:
        A set containing the distinct lines found across all matching files.
    """
    truth = set()

    for filename in glob.glob(directory):
        # Close each file deterministically rather than leaking the handle.
        with open(filename) as f:
            truth.update(line.strip("\n") for line in f)
    return truth

def calculate_probabilities(total_words, total_class, words):
    """Turn one class's raw word counts into smoothed log-probabilities.

    Each count in ``words`` is replaced IN PLACE by
    ``log((count + ALPHA) / (total_class + V * ALPHA))``.

    Args:
        total_words: Number of words counted across all classes.
        total_class: Number of words counted for this class.
        words: Mapping of word -> raw count for this class; mutated so that
            each value becomes the word's log-probability.

    Returns:
        A ``(prob_class, prob_unseen_class)`` tuple: the log of
        ``total_class / total_words``, and the log-probability assigned to a
        word that has no entry in ``words``.
    """
    # Smoothing denominator shared by every seen and unseen word.
    smoothed_total = total_class + (V * ALPHA)

    # Only values of existing keys are reassigned, so iterating is safe.
    for token, raw_count in words.items():
        words[token] = math.log((raw_count + ALPHA)/smoothed_total)

    log_prior = math.log(total_class/total_words)
    log_unseen = math.log(ALPHA/smoothed_total)

    return log_prior, log_unseen

def main():
    """Train on the ham and spam corpora, classify the test emails, print results.

    Prints one line per test email (identifier and its classification/truth
    record) followed by the overall accuracy.
    """
    ham_counts, ham_total = count_words("./data/ham/*")
    spam_counts, spam_total = count_words("./data/spam/*")
    corpus_total = ham_total + spam_total

    # calculate_probabilities rewrites each dict's counts into
    # log-probabilities in place, so the same dicts are passed to run_model.
    ham_prior, ham_unseen = calculate_probabilities(corpus_total, ham_total, ham_counts)
    spam_prior, spam_unseen = calculate_probabilities(corpus_total, spam_total, spam_counts)

    spam_ids = populate_truth("./data/truthfile*")

    results, accuracy = run_model(
        "./data/test/*", spam_counts, spam_prior, spam_unseen,
        ham_counts, ham_prior, ham_unseen, spam_ids,
    )
    for email_id in results:
        print(email_id, results[email_id])
    print(accuracy)
# Run the full train/classify/report pipeline when this cell or script executes.
main()
    
    


89 {'classification': 'Ham', 'truth': 'Ham'}
74 {'classification': 'Spam', 'truth': 'Ham'}
31 {'classification': 'Spam', 'truth': 'Spam'}
49 {'classification': 'Ham', 'truth': 'Ham'}
90 {'classification': 'Ham', 'truth': 'Ham'}
28 {'classification': 'Spam', 'truth': 'Spam'}
50 {'classification': 'Spam', 'truth': 'Spam'}
15 {'classification': 'Spam', 'truth': 'Spam'}
9 {'classification': 'Ham', 'truth': 'Ham'}
100 {'classification': 'Ham', 'truth': 'Ham'}
52 {'classification': 'Ham', 'truth': 'Ham'}
17 {'classification': 'Spam', 'truth': 'Spam'}
92 {'classification': 'Ham', 'truth': 'Ham'}
76 {'classification': 'Spam', 'truth': 'Ham'}
33 {'classification': 'Ham', 'truth': 'Ham'}
72 {'classification': 'Spam', 'truth': 'Ham'}
37 {'classification': 'Spam', 'truth': 'Spam'}
56 {'classification': 'Ham', 'truth': 'Ham'}
13 {'classification': 'Spam', 'truth': 'Spam'}
96 {'classification': 'Ham', 'truth': 'Ham'}
69 {'classification': 'Spam', 'truth': 'Ham'}
94 {'classification': 'Ham', 'truth':