In [11]:
# Probability function
# Parameters: List of words, num of ham and spam messages, the ham and spam dictionary
# Return: Dictionary of probabilities
def probability(word_list, ham_n, spam_n, ham_d, spam_d):
    """Estimate, for each word, the probability that a message containing it is spam.

    The weighting used here (ham counts doubled, per-corpus ratios capped at
    1.0, result clamped to [0.01, 0.99]) appears to follow Paul Graham's
    "A Plan for Spam".

    Parameters:
        word_list: iterable of words to score; each word is looked up as-is
            in ``ham_d`` and ``spam_d``.
        ham_n: number of ham messages in the training corpus.
        spam_n: number of spam messages in the training corpus.
        ham_d: dict mapping word -> number of occurrences in ham messages.
        spam_d: dict mapping word -> number of occurrences in spam messages.

    Returns:
        Dict mapping each word to its spam probability in [0.01, 0.99], or to
        0 when there is no usable evidence for the word (it was never seen,
        or the corpus it was counted in is reported as empty).
    """
    probabilities = {}
    for word in word_list:
        # Ham occurrences are weighted double; missing words count as zero.
        g = 2 * ham_d.get(word, 0)
        b = spam_d.get(word, 0)
        # The original note mentions a `g + b < 5` cutoff; the threshold is
        # lowered to 1 here to make up for how few words the example
        # messages contain.
        if (g + b) >= 1:
            # An empty corpus contributes no evidence. Guarding the divisions
            # avoids a ZeroDivisionError when ham_n or spam_n is 0.
            spam_ratio = min(1.0, b / spam_n) if spam_n else 0.0
            ham_ratio = min(1.0, g / ham_n) if ham_n else 0.0
            total = ham_ratio + spam_ratio
            if total > 0:
                probabilities[word] = max(0.01, min(0.99, spam_ratio / total))
            else:
                probabilities[word] = 0
        else:
            probabilities[word] = 0
    return probabilities

# Function to make email list
# Parameters: Ham and Spam corpus
# Return: List of all the words in both emails
def make_email_list(spam, ham):
    """Collect the distinct lower-cased words of both corpora in first-seen order.

    Parameters:
        spam: iterable of spam messages, each an iterable of word strings.
        ham: iterable of ham messages, each an iterable of word strings.

    Returns:
        List of unique lower-cased words. Ham words come first, followed by
        any spam words that were not already seen.
    """
    words = []
    # A set gives O(1) membership checks; the list alone keeps the order.
    # This replaces the quadratic `not in list` scan and avoids shadowing
    # the built-in `list`.
    seen = set()
    for corpus in (ham, spam):
        for mesg in corpus:
            for word in mesg:
                key = word.lower()
                if key not in seen:
                    seen.add(key)
                    words.append(key)
    return words


# Function to print out probabilities for entered email
# Parameter: The dictionary of the words and calculated ham or spam probability
# Return: void
def print_email_values(prob_table):
    """Print one ``word: probability`` line for every entry of ``prob_table``.

    Parameters:
        prob_table: dict mapping each word (a string) to its calculated
            ham-or-spam probability.

    Returns:
        None; the lines are written to standard output.
    """
    for token, spam_prob in prob_table.items():
        line = token + ': ' + str(spam_prob)
        print(line)

# Function spam_or_ham to check an email
# Parameters: Spam and Ham corpus, optionally the email (list of words) to check
# Return: void
def spam_or_ham(spam, ham, email=None):
    """Train on the two corpora, then print per-word probabilities and a verdict.

    Parameters:
        spam: iterable of spam messages, each an iterable of word strings.
        ham: iterable of ham messages, each an iterable of word strings.
        email: optional iterable of word strings to classify. When omitted,
            every distinct word of the two corpora is scored, which is the
            behavior this function had before the parameter existed.

    Returns:
        None. The per-word probabilities and the line "It's spam!!" or
        "It's ham!!" are printed to standard output.
    """
    # Probability substituted for words that have no evidence in the corpora.
    unknown_word_prob = 0.4
    # Combined probability above which the words are reported as spam.
    spam_threshold = 0.9

    # Count messages and lower-cased word occurrences per corpus.
    ham_dict = {}
    ham_n = 0
    for mesg in ham:
        ham_n += 1
        for word in mesg:
            key = word.lower()
            ham_dict[key] = ham_dict.get(key, 0) + 1

    spam_dict = {}
    spam_n = 0
    for mesg in spam:
        spam_n += 1
        for word in mesg:
            key = word.lower()
            spam_dict[key] = spam_dict.get(key, 0) + 1

    if email is None:
        email_words = make_email_list(spam, ham)
    else:
        # The count dictionaries are keyed by lower-cased words.
        email_words = [word.lower() for word in email]

    email_prob = probability(email_words, ham_n, spam_n, ham_dict, spam_dict)
    print_email_values(email_prob)

    # Combine the word probabilities: prod(p) / (prod(p) + prod(1 - p)).
    product = 1
    comps = 1
    for word_prob in email_prob.values():
        if word_prob == 0:
            # Use the same neutral value in both products; previously the
            # complement was taken from the raw 0 (i.e. 1 instead of 0.6).
            word_prob = unknown_word_prob
        product *= word_prob
        comps *= 1 - word_prob
    answer = product / (product + comps)

    if answer > spam_threshold:
        print("It's spam!!")
    else:
        print("It's ham!!")


# Toy training corpora: each message is a list of word tokens.
spam_corpus = [
    ["I", "am", "spam", "spam", "I", "am"],
    ["I", "do", "not", "like", "that", "spamiam"],
]
ham_corpus = [
    ["do", "i", "like", "green", "eggs", "and", "ham"],
    ["i", "do"],
]

# Print the per-word probabilities and the spam/ham verdict.
spam_or_ham(spam_corpus, ham_corpus)

    

do: 0.3333333333333333
i: 0.5
like: 0.3333333333333333
green: 0.01
eggs: 0.01
and: 0.01
ham: 0.01
am: 0.99
spam: 0.99
not: 0.99
that: 0.99
spamiam: 0.99
It's spam!!


The spam filter behaves similarly to a Bayesian network: you give the
filter training messages that are labeled as either spam or ham, and from these a table of probabilities is built
that gives you the probability that a message is spam given that it contains a certain word.

A given message is then classified according to its words and their probabilities as being spam or not.

# 2

Bayesian Network

In [12]:
from probability import BayesNet, enumeration_ask, elimination_ask, gibbs_ask
# Utility variables
# Short aliases so the conditional probability tables below stay compact.
T, F = True, False

# From AIMA code (probability.py)
# Sprinkler network. Each node is given as (variable, parents, CPT), where
# parents is a space-separated string of parent names and the CPT holds
# P(variable = True) for each combination of parent values
# (per the AIMA BayesNet node spec).
grass = BayesNet([
    # Root node: P(Cloudy = True) = 0.5.
    ('Cloudy', '', 0.5),
    # P(Rain = True | Cloudy).
    ('Rain', 'Cloudy', {T: 0.80, F: 0.20}),
    # P(Sprinkler = True | Cloudy).
    ('Sprinkler', 'Cloudy', {T: 0.10, F: 0.50}),
    # P(WetGrass = True | Sprinkler, Rain); keys are (Sprinkler, Rain) pairs.
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.9, (F, T): 0.9, (F,F): 0})
    ])

b.  The full joint probability distribution has one entry per assignment of the 4 Boolean variables, i.e. 2^4 = 16 entries; because the entries must sum to 1, 15 of these values are independent.

c. The number of independent values in the Bayesian Network for this domain would be 9 (1 for Cloudy, 2 each for Rain and Sprinkler, and 4 for WetGrass). From this we can see that using a Bayesian network simplifies the problem, and gives us fewer numbers to work with.

P(Cloudy) = <0.5, 0.5>


In [13]:
# Prior distribution of Cloudy: the evidence dict is empty.
print('\nProbability of Cloudy')
print(enumeration_ask('Cloudy', {}, grass).show_approx())


Probability of Cloudy
False: 0.5, True: 0.5


P(Sprinkler | Cloudy) = <0.10, 0.90>

In [14]:
# Distribution of Sprinkler with Cloudy observed to be true.
print('\nProbability of Sprinkler given its cloudy')
print(enumeration_ask('Sprinkler', {'Cloudy': T}, grass).show_approx())


Probability of Sprinkler given its cloudy
False: 0.9, True: 0.1


P(Cloudy | Sprinkler ^ nRain)

= <P(Cloudy) * P(Sprinkler | Cloudy) * P(nRain | Cloudy), P(nCloudy) * P(Sprinkler | nCloudy) * P(nRain | nCloudy)>

= <0.5 * 0.1 * 0.2, 0.5 * 0.5 * 0.8>

= <0.01, 0.20>

0.01 + 0.20 = 0.21

<0.01, 0.20> / 0.21

= <0.048, 0.952>

In [15]:
# Distribution of Cloudy with Sprinkler observed true and Rain observed false.
print('\nProbability of given sprinklers are on and its not raining')
print(enumeration_ask('Cloudy', {'Sprinkler': T, 'Rain': F}, grass).show_approx())


Probability of given sprinklers are on and its not raining
False: 0.952, True: 0.0476


P(WetGrass | Sprinkler ^ Cloudy ^ Rain) 

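= <P(WetGrass | Sprinkler ^ Rain), P(nWetGrass | Sprinkler ^ Rain)>  (WetGrass is independent of Cloudy given its parents Sprinkler and Rain)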

<0.99, 0.01>

In [16]:
# Distribution of WetGrass with Cloudy, Sprinkler and Rain all observed true.
print('\nProbability of WetGrass given the sprinklers are on and its cloudy and its raining')
print(enumeration_ask('WetGrass', {'Cloudy': T, 'Sprinkler': T, 'Rain': T}, grass).show_approx())


Probability of WetGrass given the sprinklers are on and its cloudy and its raining
False: 0.01, True: 0.99


P(Cloudy | nWetGrass) = 

< P(Cloudy) * P(Sprinkler | Cloudy) * P(Rain | Cloudy) * P(nWG | Sprinkler ^ Rain)
 \+ P(Cloudy) * P(nSprinkler | Cloudy) * P(nRain | Cloudy) * P(nWG | nSprinkler ^ nRain)
 \+ P(Cloudy) * P(nSprinkler | Cloudy) * P(Rain | Cloudy) * P(nWG | nSprinkler ^ Rain)
 \+ P(Cloudy) * P(Sprinkler | Cloudy) * P(nRain | Cloudy) * P(nWG | Sprinkler ^ nRain)
 
 , P(nCloudy) * P(Sprinkler | nCloudy) * P(Rain | nCloudy) * P(nWG | Sprinkler ^ Rain) 
  \+ P(nCloudy) * P(nSprinkler | nCloudy) * P(nRain | nCloudy) * P(nWG | nSprinkler ^ nRain)
 \+ P(nCloudy) * P(nSprinkler | nCloudy) * P(Rain | nCloudy) * P(nWG | nSprinkler ^ Rain)
 \+ P(nCloudy) * P(Sprinkler | nCloudy) * P(nRain | nCloudy) * P(nWG | Sprinkler ^ nRain) >
 
 = < 0.5 * 0.1 * 0.8 * 0.01 + 0.5 * 0.9 * 0.2 * 1 + 0.5 * 0.9 * 0.8 * 0.1 + 0.5 * 0.1 * 0.2 * 0.1
 
 , 0.5 * 0.5 * 0.2 * 0.01 + 0.5 * 0.5 * 0.8 * 1 + 0.5 * 0.5 * 0.2 * 0.1 + 0.5 * 0.5 * 0.8 * 0.1 >
 
 = <0.0004 + 0.09 + 0.036 + 0.001, 0.0005 + 0.2 + 0.005 + 0.02>
 
 = <0.1274, 0.2255>
 
 0.1274 + 0.2255 = 0.3529
 
 <0.1274, 0.2255> / 0.3529
 
 = <0.361, 0.639> 
 

In [17]:
# Distribution of Cloudy with WetGrass observed to be false.
print('\nProbability of Cloudy given you do not have wet grass')
print(enumeration_ask('Cloudy', {'WetGrass': F}, grass).show_approx())


Probability of Cloudy given you do not have wet grass
False: 0.639, True: 0.361
