# Homework 2
## Charles Kornoelje
### CS 344

## Problem 1: Spam filter

In [3]:
from collections import Counter

# Filter based on Paul Graham’s A Plan for Spam, and Prof. VL's email.

# Occurrence cutoff: a word's weighted count must be strictly greater than
# this value before it is given a non-zero spam probability.
THRESHOLD = 1

# Tokenized training emails, one list of words per email.
spam_corpus = [
    ["I", "am", "spam", "spam", "I", "am"],
    ["I", "do", "not", "like", "that", "spamiam"],
]
ham_corpus = [
    ["do", "i", "like", "green", "eggs", "and", "ham"],
    ["i", "do"],
]

def count_occurrences(corpus, count):
    """Count case-insensitive word occurrences across every email in a corpus.

    Referenced: https://www.geeksforgeeks.org/python-combine-two-dictionary-adding-values-for-common-keys/
    https://stackoverflow.com/questions/764235/dictionary-to-lowercase-in-python

    Args:
        corpus: Iterable of emails, each an iterable of word strings.
        count: Counter holding any starting tallies; the words of ``corpus``
            are added to it in place.

    Returns:
        A plain dict mapping each lowercased word to its total number of
        occurrences.
    """
    for email in corpus:
        count += Counter(email)

    # Fold case by summing: rebuilding the dict from (k.lower(), v) pairs would
    # let "I" and "i" overwrite each other instead of adding their counts.
    folded = Counter()
    for word, occurrences in count.items():
        folded[word.lower()] += occurrences
    return dict(folded)
    
def probability_email_containing_word_is_spam(word):
    """
    Adapted from the article.
    Estimates how likely an email is to be spam given that it contains `word`.
    Ham occurrences are counted double, and the result is clamped to
    [0.01, 0.99]; values near 1 point toward spam.
    Returns 0 when the weighted count does not exceed THRESHOLD.
    """
    ham_weight = 2 * good.get(word, 0)
    spam_weight = bad.get(word, 0)

    # Too few sightings to say anything about this word.
    if ham_weight + spam_weight <= THRESHOLD:
        return 0

    spam_ratio = min(1.0, spam_weight / nbad)
    ham_ratio = min(1.0, ham_weight / ngood)
    return max(0.01, min(0.99, spam_ratio / (ham_ratio + spam_ratio)))

def calculate_spam_prob(email):
    """
    Combines the per-word probabilities in `probs` into a single spam
    probability for the email (an iterable of word strings).
    Values near 1 mean the email is more likely to be spam.
    """
    spam_product = 1
    ham_product = 1
    for token in email:
        word_prob = probs[token.lower()]
        spam_product *= word_prob
        ham_product *= (1 - word_prob)
    return spam_product / (spam_product + ham_product)

# Tally word occurrences per corpus and record how many emails each one holds.
bad = count_occurrences(spam_corpus, Counter())
good = count_occurrences(ham_corpus, Counter())
nbad, ngood = len(spam_corpus), len(ham_corpus)

g_keys = list(good)
b_keys = list(bad)

# Merge the vocabularies: spam words first, then ham words not already present.
# https://stackoverflow.com/questions/1319338/combining-two-lists-and-removing-duplicates-without-removing-duplicates-in-orig
combined_keys = b_keys  # b_keys and combined_keys name the same list
already_listed = set(combined_keys)
combined_keys += [k for k in g_keys if k not in already_listed]

# Third hash: every word mapped to its per-word spam probability.
probs = {key: probability_email_containing_word_is_spam(key) for key in combined_keys}

print(probs)



{'i': 0.5, 'am': 0.99, 'spam': 0.99, 'do': 0.3333333333333333, 'not': 0, 'like': 0.3333333333333333, 'that': 0, 'spamiam': 0, 'green': 0.01, 'eggs': 0.01, 'and': 0.01, 'ham': 0.01}


This dictionary contains the words of the corpora followed by their probabilities that the email
is spam if it contains that word. The closer to 1, the more probable the email
is spam.

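Because the threshold is set to 1, words that occur only once in the spam
corpus and never in the ham corpus, like "not", have a probability of zero:
there is not enough information about the word to know whether it will show up
more in spam than in ham. (Ham counts are doubled, so a word seen once in ham,
like "green", still clears the threshold.)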

(Note: I wasn't sure whether the threshold test could be `b + g >= 1`. If it can,
the output changes slightly, as shown below. But I think using `b + g > 1` makes
more sense, because it shows that we can't decide on a probability when the word
occurrence is low. Output with `b + g >= 1`:
`{'i': 0.5, 'am': 0.99, 'spam': 0.99, 'do': 0.3333333333333333, 'not': 0.99, 'like': 0.3333333333333333, 'that': 0.99, 'spamiam': 0.99, 'green': 0.01, 'eggs': 0.01, 'and': 0.01, 'ham': 0.01}
`)


In [4]:
# Score each training email: the two ham emails first, then the two spam emails.
for email in (ham_corpus[0], ham_corpus[1], spam_corpus[0], spam_corpus[1]):
    print(calculate_spam_prob(email))

2.6025508824397714e-09
0.3333333333333333
0.9999999895897965
0.0


It is interesting to note that the first two emails (ham emails) have very
low probabilities of being spam. Of the second two emails, the first one
has a high probability of being labeled spam, but the last email
is scored as not spam: there is not enough information about several of the
words it contains (their probability is 0, which makes the whole product 0),
so the filter considers it not spam.

This is Bayesian because it uses a Bayesian combination of probabilities to 
decide if an email is spam or not, rather than a score. This spam filter
is Bayesian in its approach because it is able to describe the probability
of an event based on prior knowledge. The prior knowledge is the probability
of an email being spam if it contains a particular word. Using the combined
probabilities of emails containing words being spam, this filter can label 
incoming mail as spam or not thanks to the training from prior emails labeled 
spam and not spam.

## Problem 2: Bayesian Network


In [5]:
'''
This implements the Bayesian network shown in the text, Figure 14.12a
adapted from Prof. VL's network.py.
'''

from probability import BayesNet, enumeration_ask

# Shorthand for the Boolean values used in the probability tables
T = True
F = False

# From AIMA code (probability.py) - Fig. 14.12a - rain/wet grass example.
# Each entry lists a variable, its space-separated parents, and the
# probability that the variable is true for each combination of parent values.
wet_grass_nodes = [
    ('Cloudy', '', 0.50),
    ('Sprinkler', 'Cloudy', {T: 0.10, F: 0.50}),
    ('Rain', 'Cloudy', {T: 0.80, F: 0.20}),
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.90, (F, T): 0.90, (F, F): 0.00}),
]
cloudy = BayesNet(wet_grass_nodes)


False: 0.5, True: 0.5
False: 0.9, True: 0.1
False: 0.952, True: 0.0476
False: 0.01, True: 0.99
False: 0.639, True: 0.361


### 2.b)
The full joint probability distribution has 2^4 = 16 entries: there are four
variables, each with a T/F value. Because the entries must sum to 1, the number
of independent values is 2^4 - 1 = 15.

### 2.c)
The number of independent values in the Bayesian network for this domain
is 9. This is due to the structure of the Bayesian network. We are able to make 
simplifications due to the assumptions based on how the network is structured.

Because Bayesian networks represent variables and their conditional probabilities,
fewer values need to be specified than in a full joint probability distribution:
here 1 (Cloudy) + 2 (Sprinkler) + 2 (Rain) + 4 (WetGrass) = 9.


In [6]:
# Each query is computed with enumeration_ask on the `cloudy` network. The
# string above each print is the hand derivation; distributions are written
# as <True, False> and `a` is the normalization constant.

# *P*(Cloudy)
"""
*P*(C) = <0.5, 0.5> (Note: <True, False>) (Given)
"""
print("i.", enumeration_ask('Cloudy', dict(), cloudy).show_approx())

# *P*(Sprinkler | cloudy)
"""
*P*(Sprinkler | cloudy) = <0.1, 0.9> (Given)
"""
print("ii.", enumeration_ask('Sprinkler', dict(Cloudy=T), cloudy).show_approx())

# *P*(Cloudy| the sprinkler is running and it’s not raining)
"""
*P*(C | s^~r)
    = a*P*(C, s, ~r)
    = a<P(s^~r | c)P(c), P(s^~r | ~c)P(~c)>
    = a<P(s | c)P(~r | c)P(c), P(s | ~c)P(~r | ~c)P(~c)>
    = a<(.1)(.2)(.5), (.5)(.8)(.5)>
    = a<0.01, 0.2>
    = 4.76<0.01, 0.2>
    = <.048, .952>
"""
print("iii.", enumeration_ask('Cloudy', dict(Sprinkler=T, Rain=F), cloudy).show_approx())

# *P*(WetGrass | it’s cloudy, the sprinkler is running and it’s raining)
"""
*P*(W | c^s^r)
    = a*P*(W, c, s, r)
    = a<P(w, c, s, r), P(~w, c, s, r)>
    = a<P(w^c^s^r), P(~w^c^s^r)>
    = a<(.99)(.5)(.1)(.8), (.01)(.5)(.1)(.8)>
    = a<0.0396, 0.0004>
    = 25<0.0396, 0.0004>
    = <0.99, 0.01>
"""
print("iv.", enumeration_ask('WetGrass', dict(Cloudy=T, Sprinkler=T, Rain=T), cloudy).show_approx())

# *P*(Cloudy | the grass is not wet)
"""
*P*(C | ~w)
    = a*P*(~w | C)*P*(C)
    = aΣ(s, r)*P*(C, s, r, ~w)
    = skipped the writing out of P(events), very long.
    = a<(.5)[(.1)(.8)(.01)+(.1)(.2)(.1)+(.9)(.8)(.1)+(.9)(.2)(1)], (.5)[(.5)(.2)(.01)+(.5)(.8)(.1)+(.5)(.2)(.1)+(.5)(.8)(1)]>
    = a<0.1274, 0.2255>
    = 2.83<0.1274, 0.2255>
    = <.361, .639> 
"""
print("v.", enumeration_ask('Cloudy', dict(WetGrass=F), cloudy).show_approx())



i. False: 0.5, True: 0.5
ii. False: 0.9, True: 0.1
iii. False: 0.952, True: 0.0476
iv. False: 0.01, True: 0.99
v. False: 0.639, True: 0.361
