# Homework 2

In [3]:
# Toy training data: one tokenized message per inner list.
# spam_corpus holds the spam messages, ham_corpus the non-spam ones.
spam_corpus = [
    ["i", "am", "spam", "spam", "i", "am"],
    ["i", "do", "not", "like", "that", "spamiam"],
]
ham_corpus = [
    ["do", "i", "like", "green", "eggs", "and", "ham"],
    ["i", "do"],
]

# Receives corpora of spam and nonspam mails respectively
# Returns a hashtable (dictionary) with probabilities given a corpus
def spam_filter(spam_mail_corpus, nonspam_mail_corpus):
    """Train the filter: build the token -> spam-probability table.

    Both arguments are corpora given as a list of messages, each message
    being a list of tokens. The result is whatever
    get_token_spam_probability_map returns for the per-corpus token counts.
    """
    # Message counts per corpus; the probability step divides by these.
    ham_message_count = len(nonspam_mail_corpus)
    spam_message_count = len(spam_mail_corpus)

    # Flatten each corpus into a single list of tokens (duplicates kept,
    # since they are what gets counted below).
    ham_tokens = []
    for message in nonspam_mail_corpus:
        ham_tokens.extend(message)

    spam_tokens = []
    for message in spam_mail_corpus:
        spam_tokens.extend(message)

    # Vocabulary: every distinct token seen in either corpus.
    vocabulary = list(set(ham_tokens) | set(spam_tokens))

    # First and second tables: occurrences of each vocabulary token in the
    # non-spam and spam corpora respectively.
    ham_counts = get_token_occurrences_map(
        nonspam_mail_corpus, ham_tokens, vocabulary
    )
    spam_counts = get_token_occurrences_map(
        spam_mail_corpus, spam_tokens, vocabulary
    )

    # Third table: per-token spam probability.
    return get_token_spam_probability_map(
        ham_counts,
        spam_counts,
        ham_message_count,
        spam_message_count,
        vocabulary,
    )


# Helper function to get the first and second hashtables
# Returns a dictionary with each token(word) mapping to occurrences
def get_token_occurrences_map(mail_corpus, message_tokens, tokens):
    """Count how often each token of ``tokens`` occurs in ``message_tokens``.

    Args:
        mail_corpus: Unused; kept so existing call sites keep working.
        message_tokens: Flat iterable of tokens from one corpus
            (duplicates included).
        tokens: The tokens to report on.

    Returns:
        A dict mapping every token in ``tokens`` to its number of
        occurrences in ``message_tokens`` (0 when it never occurs).
    """
    # Function-scope import so the block stays self-contained.
    from collections import Counter

    # Tally the corpus once instead of rescanning it for every token.
    occurrences = Counter(message_tokens)

    # Counter yields 0 for tokens it has not seen.
    return {token: occurrences[token] for token in tokens}

# Helper function to get the third hashtable
# Returns a dictionary mapping each token to the probability that an email containing it is spam
def get_token_spam_probability_map(good_token_occurrences_map, bad_token_occurrences_map, ngood, nbad, tokens):
    """Map each sufficiently frequent token to its spam probability.

    Args:
        good_token_occurrences_map: token -> occurrences in non-spam mail.
        bad_token_occurrences_map: token -> occurrences in spam mail.
        ngood: Number of non-spam messages.
        nbad: Number of spam messages.
        tokens: Tokens to score; each must be a key of both maps.

    Returns:
        A dict of token -> probability, clamped to [0.01, 0.99]. Tokens
        whose weighted count does not exceed the minimum threshold are
        left out of the dict instead of being stored as 0: a stored 0
        would force the combined score of any message containing that
        token to 0, whereas a missing token lets the classifier fall back
        to its never-seen probability.
    """
    token_spam_probability_map = {}
    for token in tokens:
        # Double the non-spam count to bias slightly against false positives.
        g = float(2 * good_token_occurrences_map[token])
        b = float(bad_token_occurrences_map[token])

        # Too little evidence for this token: skip it entirely.
        if g + b <= 1:
            continue

        # Per-message frequencies, capped at 1. An empty corpus contributes
        # a frequency of 0 instead of raising ZeroDivisionError.
        good_frequency = min(1.0, g / ngood) if ngood else 0.0
        bad_frequency = min(1.0, b / nbad) if nbad else 0.0

        total_frequency = good_frequency + bad_frequency
        if total_frequency == 0:
            # Only reachable when the counts disagree with the corpus sizes;
            # there is nothing meaningful to store.
            continue

        token_spam_probability_map[token] = max(
            0.01, min(0.99, bad_frequency / total_frequency)
        )

    return token_spam_probability_map

# Do the actual filtering given a probability map of words
def is_spam(message, probability_map):
    """Score a tokenized message with Paul Graham's combining formula.

    ``message`` is an iterable of tokens and ``probability_map`` maps known
    tokens to their spam probability. Despite the name, the return value is
    a number, not a bool: prod(p) / (prod(p) + prod(1 - p)) over the
    message's tokens.
    """
    # Probability assumed for tokens that are missing from the map.
    never_seen_probability = 0.4

    # Running products of p and of (1 - p) across the message.
    spam_evidence = 1.0
    ham_evidence = 1.0

    for word in message:
        word_probability = probability_map.get(word, never_seen_probability)
        spam_evidence *= word_probability
        ham_evidence *= (1.0 - word_probability)

    return spam_evidence / (spam_evidence + ham_evidence)

# Get the probability map
prob_map = spam_filter(spam_corpus, ham_corpus)

# Print the spam score of a few sample messages: an unseen token,
# a token seen only in spam, and a mix of a known and an unseen token.
for sample_message in (["hehe"], ["spam"], ["do", "it"]):
    print(is_spam(sample_message, prob_map))

# TODO
# Actual spam mail message with data from my spam inbox (including a tokenizer)
# Actual working system (with aggregate filter additions and filter reruns)

0.4
0.99
0.25


In [2]:
from aima.probability import BayesNet, enumeration_ask, elimination_ask, gibbs_ask

T, F = True, False

# Sprinkler network. Each node is given as (name, parent names, probabilities);
# for nodes with parents the last entry is keyed by the parents' truth values.
grass = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.1, F: 0.5}),
    ('Rain', 'Cloudy', {T: 0.8, F: 0.2}),
    ('WetGrass', 'Sprinkler Rain',
     {(T, T): 0.99, (T, F): 0.9, (F, T): 0.9, (F, F): 0.0}),
])

# (query variable, evidence) pairs, answered in order by exact enumeration.
queries = [
    # P(Cloudy)
    ('Cloudy', {}),
    # P(Sprinkler | cloudy)
    ('Sprinkler', {'Cloudy': T}),
    # P(Cloudy | the sprinkler is running and it's not raining)
    ('Cloudy', {'Sprinkler': T, 'Rain': F}),
    # P(WetGrass | it's cloudy, the sprinkler is running and it's raining)
    ('WetGrass', {'Cloudy': T, 'Sprinkler': T, 'Rain': T}),
    # P(Cloudy | the grass is not wet)
    ('Cloudy', {'WetGrass': F}),
]

for variable, evidence in queries:
    print(enumeration_ask(variable, evidence, grass).show_approx())

False: 0.5, True: 0.5
False: 0.9, True: 0.1
False: 0.952, True: 0.0476
False: 0.01, True: 0.99
False: 0.639, True: 0.361
