In [17]:
import heapq
from collections import Counter

In [18]:
def score_word(word, good, bad, ngood, nbad):
    """
    given a word, return the probability that an email containing it is spam
    """
    g = 2 * good.get(word, 0)
    b = bad.get(word, 0)
    if g + b > 1:
        return max(0.01, min(0.99, min(1.0, b/nbad) / (min(1.0, g/ngood) + min(1.0, b/nbad))))
    return 0

In [27]:
def choose_words(words, good, bad, ngood, nbad, num_tokens=15, neutral=0.5, debug=False):
    """
    chose the num_tokens most interesting words and their scores (from score_word),
        where "interesting" means the score is different from the neutral score
    return a heap (essentially a list) of (word, score) pairs
    """
    chosen = []
    for word in words:
        score = abs(score_word(word, good, bad, ngood, nbad) - neutral)
        if len(chosen) < num_tokens:
            heapq.heappush(chosen, (score, word))  # sorted first by score, then by word
        else:
            # add (score, word) to the heap, then remove the smallest item (which could be the (score, word) we just added)
            heapq.pushpop(chosen, (score, word))
    if debug:
        print(chosen)
    return chosen

In [29]:
def get_spam_prob(pairs):
    """
    pairs is the output from choose_words
    
    accumulator pattern: 
    prod = the product of the elements of probs
    comp = the product of the complement of the elements of probs
    """
    prod = 1
    comp = 1
    for prob, word in pairs:
        prod *= prob
        comp *= (1 - prob)
    return prod / (prod + comp)

In [21]:
def count_good_bad(good_corpus, bad_corpus):
    ngood = len(good_corpus)
    nbad = len(bad_corpus)
    good = Counter()
    bad = Counter()
    for doc in good_corpus:
        for word in doc:
            good[word] += 1
    for doc in bad_corpus:
        for word in doc:
            bad[word] += 1
    return good, bad, ngood, nbad

In [22]:
def evaluate_doc(doc, good, bad, ngood, nbad, debug=False):
    pairs = choose_words(doc, good, bad, ngood, nbad, debug=debug)
    return get_spam_prob(pairs)

In [23]:
spam_corpus = [["I", "am", "spam", "spam", "I", "am"], ["I", "do", "not", "like", "that", "spamiam"]]
ham_corpus = [["do", "i", "like", "green", "eggs", "and", "ham"], ["i", "do"]]

In [31]:
good, bad, ngood, nbad = count_good_bad(ham_corpus, spam_corpus)
print('evaluating ham')
for doc in ham_corpus:
    print(doc)
    print(evaluate_doc(doc, good, bad, ngood, nbad, debug=True))
    print()
print('evaluating spam')
for doc in spam_corpus:
    print(doc)
    print(evaluate_doc(doc, good, bad, ngood, nbad, debug=True))
    print()

evaluating ham
['do', 'i', 'like', 'green', 'eggs', 'and', 'ham']
[(0.16666666666666669, 'do'), (0.49, 'eggs'), (0.16666666666666669, 'like'), (0.49, 'i'), (0.49, 'green'), (0.49, 'and'), (0.49, 'ham')]
0.031709909197758505

['i', 'do']
[(0.16666666666666669, 'do'), (0.49, 'i')]
0.1611842105263158

evaluating spam
['I', 'am', 'spam', 'spam', 'I', 'am']
[(0.49, 'I'), (0.49, 'I'), (0.49, 'am'), (0.49, 'spam'), (0.49, 'am'), (0.49, 'spam')]
0.4402784629577644

['I', 'do', 'not', 'like', 'that', 'spamiam']
[(0.16666666666666669, 'do'), (0.16666666666666669, 'like'), (0.5, 'not'), (0.49, 'I'), (0.5, 'that'), (0.5, 'spamiam')]
0.03700906344410877



## 2. Bayesian Network

In [3]:
import sys
sys.path.append('/home/cmd38/344/cs344-code/tools/aima')  # need to be able to find the module

In [4]:
from probability import BayesNet, JointProbDist, enumeration_ask, enumerate_joint_ask

In [6]:
T = True
F = False
# wet_joint = BayesNet([
#     ('Cloudy', '', 0.5),
#     ('Sprinkler Rain')
#     ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.90, (F, T): 0.90, (F, F): 0.00}),
#     ])
wet_bayes = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.10, F:0.50}),
    ('Rain', 'Cloudy', {T: 0.80, F:0.20}),
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.90, (F, T): 0.90, (F, F): 0.00}),
    ])

In [7]:
print(enumeration_ask('Cloudy', dict(), wet_bayes).show_approx())

False: 0.5, True: 0.5


In [12]:
(0.5, 0.5)  # P(Cloudy is given to us)

(0.5, 0.5)

In [8]:
print(enumeration_ask('Sprinkler', dict(Cloudy=T), wet_bayes).show_approx())

False: 0.9, True: 0.1


In [14]:
(1 - 0.1, 0.1)  # P(Sprinkler|Cloudy) is given as 0.1

(0.9, 0.1)

In [9]:
print(enumeration_ask('Cloudy', dict(Sprinkler=T, Rain=F), wet_bayes).show_approx())

False: 0.952, True: 0.0476


In [10]:
print(enumeration_ask('WetGrass', dict(Cloudy=T, Sprinkler=T, Rain=T), wet_bayes).show_approx())

False: 0.01, True: 0.99


In [11]:
print(enumeration_ask('Cloudy', dict(WetGrass=F), wet_bayes).show_approx())

False: 0.639, True: 0.361
