# In [9]:
'''
Bayesian Spam Filter
Based on an algorithm by Paul Graham

Written by: Chris Walstra
'''

# Spam probability given to a word that findProbabilityDict cannot score
# because its weighted occurrence count does not exceed the threshold.
DEFAULT_PROBABILITY = 0.4

def ParseEmails(freqDict, emails, spam):
    """Tally word occurrences from a batch of emails into freqDict.

    freqDict maps each word to a ``(good_count, bad_count)`` tuple. Every
    word occurrence in ``emails`` bumps the bad count when ``spam == 1`` and
    the good count otherwise.

    Args:
        freqDict: Dictionary of existing tallies; it is updated in place.
        emails: Iterable of emails, each email being an iterable of words.
        spam: 1 when the batch is spam; any other value marks it as safe.

    Returns:
        The same freqDict object that was passed in, after updating.
    """
    batch_is_spam = spam == 1
    for message in emails:
        for token in message:
            # Words not seen before start from a zero tally.
            tally = freqDict.get(token, (0, 0))
            good_count, bad_count = tally[0], tally[1]
            if batch_is_spam:
                bad_count += 1
            else:
                good_count += 1
            freqDict[token] = (good_count, bad_count)
    return freqDict

def findProbabilityDict(freqDict, ngood, nbad, threshold=5):
    """Convert per-word (good, bad) counts into per-word spam probabilities.

    For each word the good count is doubled, then the word is scored as
    ``bad_ratio / (good_ratio + bad_ratio)`` where each ratio is the count
    divided by the size of its corpus, capped at 1.0. The score is clamped
    to the range [0.01, 0.99].

    Args:
        freqDict: Mapping of word -> ``(good_count, bad_count)``.
        ngood: Number of safe (non-spam) emails in the training corpus.
        nbad: Number of spam emails in the training corpus.
        threshold: A word is scored only if ``2 * good + bad`` is strictly
            greater than this value; otherwise it gets DEFAULT_PROBABILITY.

    Returns:
        A new dict mapping every word in freqDict to its spam probability.
    """
    probabilityDict = {}
    for word, (g, b) in freqDict.items():
        # Good counts are doubled, following Graham's algorithm, which biases
        # the score slightly against false positives.
        g = 2 * g

        # An empty corpus contributes a ratio of 0 instead of dividing by
        # zero, so training on only spam or only safe mail does not crash.
        good_ratio = min(1.0, g / ngood) if ngood else 0.0
        bad_ratio = min(1.0, b / nbad) if nbad else 0.0
        total_ratio = good_ratio + bad_ratio

        if g + b > threshold and total_ratio > 0:
            probabilityDict[word] = max(0.01, min(0.99, bad_ratio / total_ratio))
        else:
            # Too little evidence (or none usable) to score this word.
            probabilityDict[word] = DEFAULT_PROBABILITY

    return probabilityDict

def isEmailSpam(probDict, email):
    prob = 1
    complement = 1
    for word in email:
        if word in probDict.keys():
            prob = prob * probDict[word]
            complement = complement * (1 - probDict[word])
        else:
            prob = prob * .4
            complement = complement * .6
        
    spamProb = prob/(prob + complement)
    if spamProb > 0.9:
        return True
    else:
        return False

def SpamFilter():
    """Run a small end-to-end demo of the filter.

    Trains on two hard-coded spam emails and two hard-coded safe emails,
    prints the resulting word -> probability dictionary, then classifies a
    sample email and prints the verdict.
    """
    # Each corpus is a list of emails, and each email is a list of words.
    spamEmails = [["I", "am", "spam", "spam", "I", "am"], ["I", "do", "not", "like", "that", "spamiam"]]
    safeEmails = [["do", "i", "like", "green", "eggs", "and", "ham"], ["i", "do"]]
    # Alternative sample to try: ["i", "am", "green", "eggs", "and", "ham"]
    myEmail = ["I", "am", "spam"]

    # Tally the spam corpus first, then fold the safe corpus into the same dict.
    wordCounts = ParseEmails({}, spamEmails, 1)
    wordCounts = ParseEmails(wordCounts, safeEmails, 0)

    # A threshold of 1 is used because the demo corpus is tiny.
    wordScores = findProbabilityDict(wordCounts, len(safeEmails), len(spamEmails), 1)
    print(wordScores)

    if not myEmail:
        return

    verdict = "This email is spam!" if isEmailSpam(wordScores, myEmail) else "This email is not spam!"
    print(verdict)

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    SpamFilter()

# Output:
# {'spamiam': 0.4, 'spam': 0.99, 'eggs': 0.01, 'like': 0.3333333333333333, 'do': 0.3333333333333333, 'ham': 0.01, 'and': 0.01, 'i': 0.01, 'am': 0.99, 'not': 0.4, 'green': 0.01, 'that': 0.4, 'I': 0.99}
# This email is spam!
