# Homework 2: CS344
#### Ian Park

## 1

In [4]:
# Toy training data: one list of tokens per message.
# spam_corpus holds the spam messages, ham_corpus the non-spam ones.
spam_corpus = [
    "i am spam spam i am".split(),
    "i do not like that spamiam".split(),
]
ham_corpus = [
    "do i like green eggs and ham".split(),
    "i do".split(),
]

# Trains the filter from a spam corpus and a non-spam corpus.
# Each corpus is a list of messages; each message is a list of tokens.
# Returns a dictionary mapping every token seen in either corpus to the
# probability that a message containing it is spam.
def spam_filter(spam_mail_corpus, nonspam_mail_corpus):
    # Corpus sizes, i.e. how many messages of each kind we were given
    ham_message_count = len(nonspam_mail_corpus)
    spam_message_count = len(spam_mail_corpus)

    # Flatten each corpus into one long list of tokens (duplicates kept,
    # because the occurrence counts below depend on them)
    ham_words = []
    for mail in nonspam_mail_corpus:
        ham_words.extend(mail)

    spam_words = []
    for mail in spam_mail_corpus:
        spam_words.extend(mail)

    # Vocabulary: every distinct token that appears in either corpus
    vocabulary = list(set(ham_words) | set(spam_words))

    # Hashtable 1 ("good"): token -> number of occurrences in non-spam mail
    ham_counts = get_token_occurrences_map(nonspam_mail_corpus, ham_words, vocabulary)

    # Hashtable 2 ("bad"): token -> number of occurrences in spam mail
    spam_counts = get_token_occurrences_map(spam_mail_corpus, spam_words, vocabulary)

    # Hashtable 3: token -> probability that a mail containing it is spam
    return get_token_spam_probability_map(
        ham_counts,
        spam_counts,
        ham_message_count,
        spam_message_count,
        vocabulary,
    )


# Helper function to get the first and second hashtables
# Returns a dictionary with each token(word) mapping to occurrences
#
# mail_corpus:    not used by the computation; kept so existing callers
#                 that pass it positionally keep working
# message_tokens: flat list of every token in one corpus (duplicates kept)
# tokens:         the tokens that must appear as keys in the result; a token
#                 that never occurs in message_tokens maps to 0
def get_token_occurrences_map(mail_corpus, message_tokens, tokens):
    from collections import Counter

    # Count every token in a single pass instead of calling list.count()
    # once per vocabulary entry (which rescans the whole list each time)
    occurrences = Counter(message_tokens)

    # Counter returns 0 for missing keys, so unseen tokens need no special case
    return {token: occurrences[token] for token in tokens}

# Helper function to get the third hashtable
# Returns a dictionary mapping each token to the probability that an email containing it is spam
#
# good_token_occurrences_map: token -> occurrence count in non-spam mail
# bad_token_occurrences_map:  token -> occurrence count in spam mail
#                             (both maps must have a key for every entry of tokens)
# ngood / nbad:               number of non-spam / spam messages; either may be 0
# tokens:                     the tokens to score
def get_token_spam_probability_map(good_token_occurrences_map, bad_token_occurrences_map, ngood, nbad, tokens):
    token_spam_probability_map = {}
    for token in tokens:
        # Double good map to slightly avoid false-positives
        g = float(2 * good_token_occurrences_map[token])
        b = float(bad_token_occurrences_map[token])

        # Tokens below the minimum-occurrence threshold keep a score of 0
        # NOTE(review): a score of 0 forces is_spam() towards "not spam";
        # confirm that is intended rather than leaving the token unscored.
        probability = 0

        # Check against min threshold
        if g + b > 0.9:
            # Divide by number of emails, capping each ratio at 1.0.
            # An empty corpus contributes a ratio of 0 instead of dividing by zero.
            good_ratio = min(1.0, g / ngood) if ngood else 0.0
            bad_ratio = min(1.0, b / nbad) if nbad else 0.0
            ratio_sum = good_ratio + bad_ratio
            # ratio_sum is only 0 when the counts and corpus sizes disagree
            # (occurrences reported for a corpus of size 0)
            if ratio_sum > 0:
                # Clamp so that no single token is ever treated as certain
                probability = max(0.01, min(0.99, bad_ratio / ratio_sum))

        token_spam_probability_map[token] = probability

    return token_spam_probability_map

# Do the actual filtering given a probability map of words
#
# message:         iterable of tokens making up the mail to classify
# probability_map: token -> probability that a mail containing it is spam
# Returns the combined probability (0.0 - 1.0) that the message is spam;
# an empty message yields 0.5.
def is_spam(message, probability_map):
    import math

    # Probability of tokens never seen previously
    never_seen_probability = 0.4

    # Per-token spam probabilities, falling back to the default for unknown tokens
    probabilities = [probability_map.get(token, never_seen_probability) for token in message]

    # Algorithm given by Paul Graham:
    # prod(p) / (prod(p) + prod(1 - p))
    product = 1.0
    complement_product = 1.0
    for probability in probabilities:
        product *= probability
        complement_product *= (1.0 - probability)

    denominator = product + complement_product
    if denominator != 0:
        return product / denominator

    # Both products are 0.0 here: either they underflowed (long message with
    # mixed evidence) or the map holds contradictory certainties (0 and 1).
    # Dividing would raise ZeroDivisionError, so resolve the case explicitly.
    certainly_ham = any(p <= 0.0 for p in probabilities)
    certainly_spam = any(p >= 1.0 for p in probabilities)
    if certainly_ham and certainly_spam:
        # Contradictory certain evidence: no basis to lean either way
        return 0.5
    if certainly_spam:
        return 1.0
    if certainly_ham:
        return 0.0

    # Same formula evaluated as log-odds, which cannot underflow:
    # odds = prod(p / (1 - p)), probability = odds / (1 + odds)
    log_odds = sum(math.log(p) - math.log(1.0 - p) for p in probabilities)
    if log_odds >= 0:
        # exp() of a large negative number quietly becomes 0.0
        return 1.0 / (1.0 + math.exp(-log_odds))
    odds = math.exp(log_odds)
    return odds / (1.0 + odds)

# Train the filter on the two toy corpora
prob_map = spam_filter(spam_corpus, ham_corpus)

# Report the spam probability for a handful of messages.
# Each entry pairs the heading to print with the tokenized message to score.
demo_messages = [
    ("Spam:", ["spam", "spamiam"]),
    ("Ham:", ["green", "eggs"]),
    ("Mix Ham and Spam:", ["spam", "ham"]),
    ("Unseen word:", ["hehe"]),
    ("Unseen word with ham:", ["do", "hehe"]),
    ("Unseen word with spam:", ["spam", "hehe"]),
    ("Spam message 1:", spam_corpus[0]),
    ("Spam message 2:", spam_corpus[1]),
    ("Non-Spam message 1:", ham_corpus[0]),
    ("Non-Spam message 2:", ham_corpus[1]),
]
for heading, demo_message in demo_messages:
    print(heading)
    print(is_spam(demo_message, prob_map))

print("DONE")

# TODO
# Actual spam mail message with data from my spam inbox (including a token filter)
# Actual working system (with aggregate filter additions and filter reruns)

Spam:
0.9998979800040808
Ham:
0.00010201999591920018
Mix Ham and Spam:
0.4999999999999997
Unseen word:
0.4
Unseen word with ham:
0.25
Unseen word with spam:
0.9850746268656716
Spam message 1:
0.9999999895897965
Spam message 2:
0.999995877576386
Non-Spam message 1:
2.6025508824397714e-09
Non-Spam message 2:
0.3333333333333333
DONE


**What makes this approach to SPAM Bayesian?**

We approach this problem with probabilities estimated from data (the observed frequency of each word in spam and non-spam mail) instead of with good old-fashioned programming that applies a fixed set of rules to reach a deterministic answer. As Graham explains, each word is assigned a probability that a message containing it is spam, and Bayes' rule combines those per-word probabilities into an overall probability for the message; this gives us much better accuracy in deciding whether or not a mail is spam.

## 2.a

In [2]:
from aima.probability import BayesNet, enumeration_ask, elimination_ask, gibbs_ask

T, F = True, False

# Sprinkler network. Each entry is (variable, parents, conditional probability table):
# Cloudy has no parents, Sprinkler and Rain depend on Cloudy,
# and WetGrass depends on both Sprinkler and Rain.
grass = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.1, F: 0.5}),
    ('Rain', 'Cloudy', {T: 0.8, F: 0.2}),
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.90, (F, T): 0.90, (F, F): 0.00}),
])

# (query variable, evidence) pairs, answered in this order by exact enumeration
queries = [
    # P(Cloudy)
    ('Cloudy', {}),
    # P(Sprinkler | cloudy)
    ('Sprinkler', {'Cloudy': T}),
    # P(Cloudy | the sprinkler is running and it is not raining)
    ('Cloudy', {'Sprinkler': T, 'Rain': F}),
    # P(WetGrass | it is cloudy, the sprinkler is running and it is raining)
    ('WetGrass', {'Cloudy': T, 'Sprinkler': T, 'Rain': T}),
    # P(Cloudy | the grass is not wet)
    ('Cloudy', {'WetGrass': F}),
]
for query_variable, evidence in queries:
    print(enumeration_ask(query_variable, evidence, grass).show_approx())

False: 0.5, True: 0.5
False: 0.9, True: 0.1
False: 0.952, True: 0.0476
False: 0.01, True: 0.99
False: 0.639, True: 0.361


## 2.b
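$2^4 = 16$ values in the full joint distribution, of which $2^4 - 1 = 15$ are independent because the entries must sum to $1$.
We have four different variables that can each be either $True$ or $False$.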

## 2.c
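$9$ independent values.
This is attained by looking at the Bayesian network figure and counting one independent value per row of each conditional probability table: $1$ (Cloudy) $+ 2$ (Sprinkler) $+ 2$ (Rain) $+ 4$ (WetGrass) $= 9$.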

## 2.d

### i.
From the Graph

$P(\text{Cloudy}) = \langle 0.5, 0.5 \rangle$

### ii. 
From the Graph

$P(\text{Sprinkler  |  Cloudy}) = \langle 0.1, 0.9 \rangle$

### iii. 
\begin{align}
\ P(\text{Cloudy  |  the sprinkler is running and it’s not raining}) & = P(C | s \land \neg r) \\
\ & = \alpha  \times \langle 0.5 \times 0.1 \times 0.2, 0.5 \times 0.5 \times 0.8 \rangle \\
\ & = \alpha \times \langle 0.01, 0.2 \rangle \\
\ & = \langle 0.0476, 0.9524 \rangle \\
\end{align}

### iv. 
\begin{align}
\ P(\text{WetGrass  |  it’s cloudy, the sprinkler is running and it’s raining}) & = \alpha \times \langle P(W \land c \land s \land r), P(\neg W \land c \land s \land r) \rangle \\
\ & = \alpha \times \langle 0.5 \times 0.1 \times 0.8 \times 0.99, 0.5 \times 0.1 \times 0.8 \times 0.01 \rangle \\
\ & = \alpha \times \langle 0.0396, 0.0004 \rangle \\
\ & = \langle 0.99, 0.01 \rangle \\
\end{align}

### v. 
\begin{align}
\ P(\text{Cloudy  |  the grass is not wet}) & = P(C | \neg w) \\
\ & = \alpha \times \langle \sum_s ( \sum_r ( P(C) \times P(s \land r | C) \times P(\neg w | s \land r) ) ) \rangle \\
\ & = \alpha \times \langle 0.5 \times 0.08 \times 0.01 + 0.5 \times 0.02 \times 0.10 + 0.5 \times 0.72 \times 0.10 + 0.5 \times 0.18 \times 1.00,0.5 \times 0.10 \times 0.01 + 0.5 \times 0.40 \times 0.10 + 0.5 \times 0.10 \times 0.10 + 0.5 \times 0.40 \times 1.00 \rangle \\
\ & = \alpha \times \langle 0.1274, 0.2255 \rangle \\
\ & = \langle 0.361, 0.639 \rangle \\
\end{align}