In [2]:
'''
Bayesian Spam Filter
Based on an algorithm by Paul Graham

Written by: Chris Walstra
'''

# Spam probability for a word the filter has no usable statistics for;
# isEmailSpam applies this same value (0.4) to such words.
DEFAULT_PROBABILITY = 0.4

def ParseEmails(freqDict, emails, spam):
    """Tally word occurrences from a batch of emails into freqDict.

    freqDict maps a lower-cased word to a (good, bad) pair of counts and is
    updated in place.  emails is a list of emails, each one a list of words.
    spam marks the whole batch: 1 means every email is spam (the bad count
    is incremented), any other value means legitimate mail (the good count
    is incremented).

    Returns the same freqDict object that was passed in.
    """
    batchIsSpam = spam == 1
    for message in emails:
        for token in message:
            key = token.lower()
            entry = freqDict.get(key, (0, 0))
            goodCount, badCount = entry[0], entry[1]
            if batchIsSpam:
                badCount += 1
            else:
                goodCount += 1
            freqDict[key] = (goodCount, badCount)
    return freqDict

def _cappedRatio(count, total):
    """Return count/total capped at 1.0, without dividing by zero."""
    if total <= 0:
        # No emails of this class: use the limiting value of min(1, count/total)
        return 1.0 if count > 0 else 0.0
    return min(1.0, count / total)


def findProbabilityDict(freqDict, ngood, nbad, threshold=5):
    """Convert per-word (good, bad) counts into per-word spam probabilities.

    freqDict maps each word to a (good, bad) pair of occurrence counts.
    ngood and nbad are the number of legitimate and spam emails the counts
    were collected from.  threshold is the minimum weighted occurrence count
    (2 * good + bad) a word needs in order to be scored.

    Returns a dict mapping each word to its spam probability, clamped to
    [0.01, 0.99].  Words that fall below the threshold, or that have no
    evidence at all, map to 0, meaning "no usable probability".

    An empty good or bad corpus (ngood == 0 or nbad == 0) is handled instead
    of raising ZeroDivisionError.
    """
    probabilityDict = {}
    for word, (g, b) in freqDict.items():
        # Good occurrences are counted double to bias against false positives
        g = 2 * g

        if g + b < threshold:
            probabilityDict[word] = 0
            continue

        goodRatio = _cappedRatio(g, ngood)
        badRatio = _cappedRatio(b, nbad)
        evidence = goodRatio + badRatio
        if evidence == 0:
            # Only reachable with a non-positive threshold and zero counts
            probabilityDict[word] = 0
            continue

        probabilityDict[word] = max(0.01, min(0.99, badRatio / evidence))

    return probabilityDict

def isEmailSpam(probDict, email, unknownProb=None):
    """Decide whether an email is spam from per-word spam probabilities.

    probDict maps lower-cased words to their spam probability; a value of 0
    means "no usable probability".  email is a list of words.  unknownProb
    is the probability used for words that are missing from probDict or
    mapped to 0; when omitted it falls back to DEFAULT_PROBABILITY.

    Returns True when the combined spam probability exceeds 0.9.

    A complete implementation would consider only the words whose
    probabilities are most extreme; this version uses every word.
    """
    if unknownProb is None:
        unknownProb = DEFAULT_PROBABILITY

    prob = 1
    complement = 1
    for word in email:
        wordProb = probDict.get(word.lower(), 0)
        if wordProb == 0:
            wordProb = unknownProb

        prob = prob * wordProb
        complement = complement * (1 - wordProb)

        # Only the ratio of the two products matters.  Rescale before they
        # get close to the smallest representable float, otherwise a long
        # email drives both to 0.0 and the final division fails.
        largest = max(prob, complement)
        if 0 < largest < 1e-150:
            prob = prob / largest
            complement = complement / largest

    total = prob + complement
    if total == 0:
        # Degenerate evidence (both products are exactly zero): do not flag
        return False
    return prob / total > 0.9

def SpamFilter(testSet):
    """Train the filter on a built-in test set and print the results.

    testSet names the data to use: "spam", "cars" or "gibberish".  The
    function prints the dictionary of per-word spam probabilities and, when
    the test set includes a sample email, whether that email is classified
    as spam.

    Raises ValueError when testSet is not one of the known names.
    """
    frequencyDict = {}

    # Each corpus must be a list of emails, where each email is a list of
    # words; in a true implementation, this would be replaced by a much
    # larger list of words
    if testSet == "spam":
        spamEmails = [["I", "am", "spam", "spam", "I", "am"], ["I", "do", "not", "like", "that", "spamiam"]]
        safeEmails = [["do", "i", "like", "green", "eggs", "and", "ham"], ["i", "do"]]
        myEmail = ["I", "am", "spam"]
        threshold = 1
    elif testSet == "cars":
        spamEmails = [["Toyota", "is", "super", "fast", "and", "drives", "super", "good"],
                      ["Honda", "drives", "like", "cloud"],
                      ["Nissan", "is", "bad"]]
        safeEmails = [["You", "should", "buy", "a", "Mazda"],
                      ["BMWs", "are", "very", "luxurious", "cars"],
                      ["You", "should", "purchase", "a", "Corvette", "and", "drive", "it", "like", "a",
                       "race", "car", "driver"]]
        myEmail = ["You", "should", "buy", "an", "Aston", "Martin", "like", "James", "Bond"]
        threshold = 1
    elif testSet == "gibberish":
        spamEmails = [["spam"], ["spam"], ["spam"], ["not", "not", "spam"],
                      ["ham"], ["ham"], ["ham"], ["ham"]]
        safeEmails = [["not", "spam"], ["not", "spam"], ["one", "million", "voices"], ["one", "million", "voices"]]
        threshold = 3
        # This test set has no sample email to classify
        myEmail = None
    else:
        # Without this branch an unknown name fell through and crashed later
        # on the unassigned corpus variables
        raise ValueError(
            "Unknown test set {!r}; expected 'spam', 'cars' or 'gibberish'".format(testSet)
        )

    frequencyDict = ParseEmails(frequencyDict, spamEmails, 1)
    frequencyDict = ParseEmails(frequencyDict, safeEmails, 0)

    probabilityDict = findProbabilityDict(frequencyDict, len(safeEmails), len(spamEmails), threshold)

    print(probabilityDict)

    # If an example email is given, check if it is likely to be spam
    if myEmail:
        if isEmailSpam(probabilityDict, myEmail):
            print("This email is spam!")
        else:
            print("This email is not spam!")


def Main():
    """Run the spam filter once for each built-in test set, in order."""
    for testSetName in ("spam", "cars", "gibberish"):
        SpamFilter(testSetName)


# Run the demonstration when this code is executed directly
if __name__ == "__main__":
    Main()




{'i': 0.5, 'am': 0.99, 'spam': 0.99, 'do': 0.3333333333333333, 'not': 0.99, 'like': 0.3333333333333333, 'that': 0.99, 'spamiam': 0.99, 'green': 0.01, 'eggs': 0.01, 'and': 0.01, 'ham': 0.01}
This email is spam!
{'toyota': 0.99, 'is': 0.99, 'super': 0.99, 'fast': 0.99, 'and': 0.3333333333333333, 'drives': 0.99, 'good': 0.99, 'honda': 0.99, 'like': 0.3333333333333333, 'cloud': 0.99, 'nissan': 0.99, 'bad': 0.99, 'you': 0.01, 'should': 0.01, 'buy': 0.01, 'a': 0.01, 'mazda': 0.01, 'bmws': 0.01, 'are': 0.01, 'very': 0.01, 'luxurious': 0.01, 'cars': 0.01, 'purchase': 0.01, 'corvette': 0.01, 'drive': 0.01, 'it': 0.01, 'race': 0.01, 'car': 0.01, 'driver': 0.01}
This email is not spam!
{'spam': 0.3333333333333333, 'not': 0.2, 'ham': 0.99, 'one': 0.01, 'million': 0.01, 'voices': 0.01}


Filtering spam in this way can be referred to as Bayesian because it relies on a probabilistic, as opposed to binary, approach: it estimates, for each word, the probability that an email containing that word is spam, and then combines those per-word probabilities using Bayes' rule into a single probability for the whole email.  Rather than follow rules that define something as either spam or not spam, this filter examines the likelihood of a given word appearing in a spam email versus the likelihood of the same word appearing in a legitimate email.  This allows for a more nuanced view of what makes up spam, allowing the filter to better react to the real world.


In [3]:
'''
This section uses the AIMA Bayes network algorithms 
to find conditional probabilities for the given network.

Written by: Chris Walstra, drawing on code written by Keith Vander Linden
Other parts of Homework 2.2 done on paper.
Hand calculated values for reference:
P(Cloudy)
<F=0.5, T=0.5>
P(Sprinkler | Cloudy)
<F=0.9, T=0.1>
P(Cloudy | sprinkler and ~ rain)
<F=0.952, T=0.0476>
P(WetGrass | cloudy and sprinkler and rain)
<F=0.01, T=0.99>
P(Cloudy | ~ wetgrass)
<F=0.639, T=0.36>
'''

from probability import BayesNet, enumeration_ask, elimination_ask, gibbs_ask

# Shorthand truth values used as keys in the conditional probability tables
T, F = True, False

# Cloudy / Sprinkler / Rain / WetGrass network, written with the same BayesNet
# constructor syntax as the AIMA burglary example (probability.py, Fig. 14.2)
weather = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.10, F: 0.50}),
    ('Rain', 'Cloudy', {T: 0.80, F: 0.20}),
    ('WetGrass', 'Sprinkler Rain',
     {(T, T): 0.99, (T, F): 0.9, (F, T): 0.9, (F, F): 0.00})
    ])


def _showQuery(title, variable, **evidence):
    # Print the query's title, then the distribution enumeration_ask returns
    # for `variable` given `evidence` on the weather network
    print(title)
    print(enumeration_ask(variable, evidence, weather).show_approx())


_showQuery("P(Cloudy)", 'Cloudy')
_showQuery("P(Sprinkler | Cloudy)", 'Sprinkler', Cloudy=T)
_showQuery("P(Cloudy | sprinkler and ~ rain)", 'Cloudy', Sprinkler=T, Rain=F)
_showQuery("P(WetGrass | cloudy and sprinkler and rain)", 'WetGrass', Cloudy=T, Sprinkler=T, Rain=T)
_showQuery("P(Cloudy | ~ wetgrass)", 'Cloudy', WetGrass=F)


P(Cloudy)
False: 0.5, True: 0.5
P(Sprinkler | Cloudy)
False: 0.9, True: 0.1
P(Cloudy | sprinkler and ~ rain)
False: 0.952, True: 0.0476
P(WetGrass | cloudy and sprinkler and rain)
False: 0.01, True: 0.99
P(Cloudy | ~ wetgrass)
False: 0.639, True: 0.361
