In [25]:
'''
Bayesian Spam Filter
Based on an algorithm by Paul Graham

Written by: Chris Walstra
'''

# Spam probability assigned to a word whose weighted occurrence count does
# not exceed the scoring threshold (see findProbabilityDict).
DEFAULT_PROBABILITY = 0.4

def ParseEmails(freqDict, emails, spam):
    """Tally case-insensitive word occurrences into ``freqDict``.

    Every entry maps a lower-cased word to a ``(good, bad)`` tuple: how many
    times the word was seen in non-spam emails and in spam emails.

    Args:
        freqDict: Dictionary to update in place; may already hold counts.
        emails: Iterable of emails, each an iterable of word strings.
        spam: ``1`` when ``emails`` are spam; any other value counts them
            as non-spam.

    Returns:
        The same ``freqDict`` object, after updating.
    """
    for message in emails:
        for token in message:
            key = token.lower()

            # First sighting: start the tuple with a single count on the
            # side this batch belongs to.
            if key not in freqDict:
                freqDict[key] = (0, 1) if spam == 1 else (1, 0)
                continue

            good_count = freqDict[key][0]
            bad_count = freqDict[key][1]
            if spam == 1:
                freqDict[key] = (good_count, bad_count + 1)
            else:
                freqDict[key] = (good_count + 1, bad_count)
    return freqDict

def findProbabilityDict(freqDict, ngood, nbad, threshold=5):
    """Estimate, for each word, the probability that an email containing it is spam.

    For a word with counts ``(good, bad)`` the good count is first doubled,
    so non-spam evidence weighs twice as much as spam evidence. The score is
    then ``bad_ratio / (good_ratio + bad_ratio)``, where each ratio is the
    count divided by the size of its corpus, capped at 1.0. The result is
    clamped to ``[0.01, 0.99]``.

    Args:
        freqDict: Mapping of word -> ``(good, bad)`` occurrence counts.
        ngood: Number of non-spam emails the counts were gathered from.
        nbad: Number of spam emails the counts were gathered from.
        threshold: A word is scored only when ``2 * good + bad`` is strictly
            greater than this value; otherwise it gets DEFAULT_PROBABILITY.

    Returns:
        A new dict mapping every word in ``freqDict`` to its probability.
    """
    def capped_ratio(count, total):
        # An empty corpus offers no evidence; returning 0 here also avoids
        # the ZeroDivisionError a plain count / total would raise.
        if total <= 0:
            return 0.0
        return min(1.0, count / total)

    probabilityDict = {}
    for word, (good, bad) in freqDict.items():
        good = 2 * good  # bias toward treating words as innocent

        if good + bad <= threshold:
            # Too few sightings to trust the ratio.
            probabilityDict[word] = DEFAULT_PROBABILITY
            continue

        good_ratio = capped_ratio(good, ngood)
        bad_ratio = capped_ratio(bad, nbad)
        evidence = good_ratio + bad_ratio
        if evidence == 0:
            # Nothing usable on either side (e.g. both corpora empty).
            probabilityDict[word] = DEFAULT_PROBABILITY
        else:
            probabilityDict[word] = max(0.01, min(0.99, bad_ratio / evidence))

    return probabilityDict

def isEmailSpam(probDict, email, default_probability=0.4, spam_threshold=0.9):
    """Decide whether ``email`` is spam by combining per-word probabilities.

    The combined probability is
    ``prod(p) / (prod(p) + prod(1 - p))`` over the words of the email, and
    the email is spam when that value is strictly greater than
    ``spam_threshold``. The comparison is carried out on summed log-odds,
    which is the same test algebraically but cannot underflow to ``0 / 0``
    on long emails the way the raw products do.

    Args:
        probDict: Mapping of lower-cased word -> spam probability.
        email: Iterable of word strings; words are lower-cased before lookup.
        default_probability: Probability used for words missing from
            ``probDict``. The default mirrors DEFAULT_PROBABILITY.
        spam_threshold: Combined probability above which the email is
            reported as spam.

    Returns:
        True if the email is classified as spam, otherwise False. An empty
        email is never spam. If the email contains both a word with
        probability 0 and a word with probability 1 the evidence is
        contradictory and the result is False.
    """
    import math

    infinity = float("inf")

    def log_odds(p):
        # Probabilities at (or beyond) the ends of [0, 1] are certainties.
        if p <= 0.0:
            return -infinity
        if p >= 1.0:
            return infinity
        return math.log(p) - math.log1p(-p)

    total = 0.0
    for word in email:
        total += log_odds(probDict.get(word.lower(), default_probability))

    # -inf + inf yields NaN, and any comparison with NaN is False.
    return total > log_odds(spam_threshold)

def SpamFilter(testSet):
    """Train on a built-in demo corpus, print the result, and classify a sample.

    The word-probability dictionary learned from the chosen corpus is
    printed. If the corpus comes with a sample email, a line saying whether
    that email is spam is printed as well.

    Args:
        testSet: Name of the demo corpus: "spam", "cars" or "gibberish".

    Raises:
        ValueError: If ``testSet`` is not one of the known corpus names.
    """
    # Each corpus is (spam emails, safe emails, sample email, threshold).
    # Must be a list of emails, where each email is a list of words
    corpora = {
        "spam": (
            [["I", "am", "spam", "spam", "I", "am"],
             ["I", "do", "not", "like", "that", "spamiam"]],
            [["do", "i", "like", "green", "eggs", "and", "ham"],
             ["i", "do"]],
            ["I", "am", "spam"],
            1,
        ),
        "cars": (
            [["Toyota", "is", "super", "fast", "and", "drives", "super", "good"],
             ["Honda", "drives", "like", "cloud"],
             ["Nissan", "is", "bad"]],
            [["You", "should", "buy", "a", "Mazda"],
             ["BMWs", "are", "very", "luxurious", "cars"],
             ["You", "should", "purchase", "a", "Corvette", "and", "drive", "it", "like", "a",
              "race", "car", "driver"]],
            ["You", "should", "buy", "an", "Aston", "Martin", "like", "James", "Bond"],
            1,
        ),
        "gibberish": (
            [["spam"], ["spam"], ["spam"], ["not", "not", "spam"],
             ["ham"], ["ham"], ["ham"], ["ham"]],
            [["not", "spam"], ["not", "spam"],
             ["one", "million", "voices"], ["one", "million", "voices"]],
            None,  # no sample email for this corpus
            3,
        ),
    }

    # Fail early with a clear message instead of an UnboundLocalError later.
    try:
        spamEmails, safeEmails, myEmail, threshold = corpora[testSet]
    except (KeyError, TypeError):
        raise ValueError(
            "unknown testSet {!r}; expected one of {}".format(testSet, sorted(corpora))
        ) from None

    frequencyDict = {}
    frequencyDict = ParseEmails(frequencyDict, spamEmails, 1)
    frequencyDict = ParseEmails(frequencyDict, safeEmails, 0)

    probabilityDict = findProbabilityDict(frequencyDict, len(safeEmails), len(spamEmails), threshold)

    print(probabilityDict)

    if myEmail is not None:
        if isEmailSpam(probabilityDict, myEmail):
            print("This email is spam!")
        else:
            print("This email is not spam!")


def Main():
    """Run the spam-filter demo once for each built-in test set, in order."""
    test_sets = ("spam", "cars", "gibberish")
    for name in test_sets:
        SpamFilter(name)

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    Main()




{'not': 0.4, 'green': 0.01, 'spam': 0.99, 'eggs': 0.01, 'am': 0.99, 'do': 0.3333333333333333, 'and': 0.01, 'that': 0.4, 'ham': 0.01, 'i': 0.5, 'like': 0.3333333333333333, 'spamiam': 0.4}
This email is spam!
{'bmws': 0.01, 'is': 0.99, 'good': 0.4, 'purchase': 0.01, 'luxurious': 0.01, 'a': 0.01, 'corvette': 0.01, 'buy': 0.01, 'should': 0.01, 'race': 0.01, 'and': 0.3333333333333333, 'you': 0.01, 'fast': 0.4, 'nissan': 0.4, 'driver': 0.01, 'very': 0.01, 'drives': 0.99, 'honda': 0.4, 'it': 0.01, 'drive': 0.01, 'mazda': 0.01, 'cars': 0.01, 'are': 0.01, 'super': 0.99, 'car': 0.01, 'toyota': 0.4, 'like': 0.3333333333333333, 'cloud': 0.4, 'bad': 0.4}
This email is not spam!
{'not': 0.2, 'ham': 0.99, 'voices': 0.01, 'one': 0.01, 'million': 0.01, 'spam': 0.3333333333333333}
