# Fix

## 1

In [12]:
# Toy training data, already tokenized: each inner list holds the tokens of one message.
spam_corpus = [
    ["i", "am", "spam", "spam", "i", "am"],
    ["i", "do", "not", "like", "that", "spamiam"],
]
ham_corpus = [
    ["do", "i", "like", "green", "eggs", "and", "ham"],
    ["i", "do"],
]

# Receives corpora of spam and nonspam mails respectively
# Returns a hashtable (dictionary) with probabilities given a corpus
def spam_filter(spam_mail_corpus, nonspam_mail_corpus):
    """Build the token -> spam-probability table from two tokenized corpora.

    Args:
        spam_mail_corpus: List of spam messages, each a list of tokens.
        nonspam_mail_corpus: List of non-spam messages, each a list of tokens.

    Returns:
        Whatever get_token_spam_probability_map returns for the combined
        vocabulary: a dictionary keyed by token.
    """
    # Flatten every corpus into one long token list (duplicates kept on purpose,
    # because the occurrence counts below rely on them).
    ham_words = []
    for mail in nonspam_mail_corpus:
        ham_words.extend(mail)

    spam_words = []
    for mail in spam_mail_corpus:
        spam_words.extend(mail)

    # Vocabulary: every distinct token seen in either corpus.
    vocabulary = list(set(ham_words) | set(spam_words))

    # First and second hashtables: occurrences per token in ham and in spam.
    ham_counts = get_token_occurrences_map(nonspam_mail_corpus, ham_words, vocabulary)
    spam_counts = get_token_occurrences_map(spam_mail_corpus, spam_words, vocabulary)

    # Corpus sizes, i.e. the number of messages of each kind.
    ham_total = len(nonspam_mail_corpus)
    spam_total = len(spam_mail_corpus)

    # Third hashtable: per-token spam probability (algorithm given by Graham).
    return get_token_spam_probability_map(
        ham_counts, spam_counts, ham_total, spam_total, vocabulary
    )


# Helper function to get the first and second hashtables
# Returns a dictionary with each token(word) mapping to occurrences
def get_token_occurrences_map(mail_corpus, message_tokens, tokens):
    """Count how often each vocabulary token occurs in a flat token list.

    Args:
        mail_corpus: Not used by the computation; kept in the signature so
            existing callers keep working.
        message_tokens: Flat iterable of tokens (duplicates included) to count.
        tokens: Vocabulary whose members become the keys of the result.

    Returns:
        A dictionary mapping every token in ``tokens`` to its number of
        occurrences in ``message_tokens`` (0 when the token never occurs).
    """
    from collections import Counter

    # Tally once up front instead of rescanning message_tokens for every
    # vocabulary entry.
    occurrences = Counter(message_tokens)

    # Counter returns 0 for missing keys, matching what list.count() gave.
    return {token: occurrences[token] for token in tokens}

# Helper function to get the third hashtable
# Returns a dictionary mapping each token to the probability that an email containing it is spam
def get_token_spam_probability_map(good_token_occurrences_map, bad_token_occurrences_map, ngood, nbad, tokens):
    """Map each token to the probability that a message containing it is spam.

    Args:
        good_token_occurrences_map: Token -> occurrences in non-spam mail.
        bad_token_occurrences_map: Token -> occurrences in spam mail.
        ngood: Number of non-spam messages.
        nbad: Number of spam messages.
        tokens: Tokens to score; each must be a key of both occurrence maps.

    Returns:
        A dictionary mapping every token to a probability clamped to
        [0.01, 0.99], or to 0 when the token has no usable evidence.
    """
    def capped_ratio(occurrences, message_count):
        # An empty corpus offers no evidence; report 0.0 instead of dividing
        # by zero.
        if not message_count:
            return 0.0
        return min(1.0, occurrences / message_count)

    token_spam_probability_map = {}
    for token in tokens:
        # Double good map to slightly avoid false-positives
        g = float(2 * good_token_occurrences_map[token])
        b = float(bad_token_occurrences_map[token])

        # Occurrences per message, capped at 1.0
        good_ratio = capped_ratio(g, ngood)
        bad_ratio = capped_ratio(b, nbad)
        evidence = good_ratio + bad_ratio

        # Check against min threshold; also require a non-zero denominator
        if g + b > 0.9 and evidence > 0:
            token_spam_probability_map[token] = max(0.01, min(0.99, bad_ratio / evidence))
        else:
            token_spam_probability_map[token] = 0

    return token_spam_probability_map

# Do the actual filtering given a probability map of words
def is_spam(message, probability_map):
    """Return the combined probability that a tokenized message is spam.

    Despite the name, the result is a float, not a boolean. Tokens missing
    from ``probability_map`` are scored with a fixed probability of 0.4.

    Args:
        message: Iterable of tokens making up one message.
        probability_map: Token -> probability that a message containing the
            token is spam.

    Returns:
        ``product / (product + complement_product)`` over the token
        probabilities (algorithm given by Paul Graham). An empty message
        yields 0.5.
    """
    import math

    # Probability of tokens never seen previously
    never_seen_probability = 0.4

    probabilities = [
        probability_map.get(token, never_seen_probability) for token in message
    ]

    # The product of the probabilities and the product of their complements
    product = 1.0
    complement_product = 1.0
    for probability in probabilities:
        product *= probability
        complement_product *= (1.0 - probability)

    denominator = product + complement_product
    if denominator != 0.0:
        return product / denominator

    # Both products are 0.0: either a long message made them underflow, or
    # the message mixes a probability-0 token with a probability-1 token.
    certain_ham = any(p <= 0.0 for p in probabilities)
    certain_spam = any(p >= 1.0 for p in probabilities)
    if certain_ham and certain_spam:
        # Contradictory certainties leave the ratio undefined; stay neutral.
        return 0.5
    if certain_ham:
        return 0.0
    if certain_spam:
        return 1.0

    # Same ratio computed from summed log-odds, which cannot underflow.
    log_odds = math.fsum(math.log(p) - math.log(1.0 - p) for p in probabilities)
    if log_odds >= 0:
        return 1.0 / (1.0 + math.exp(-log_odds))
    # Use the mirrored form for negative log-odds so exp() cannot overflow.
    odds = math.exp(log_odds)
    return odds / (1.0 + odds)

# Train once on the toy corpora; the resulting table is reused by every call below.
prob_map = spam_filter(spam_corpus, ham_corpus)

### ADDED THIS:
# Show the learned per-token probabilities, heading first
for line in ("Word Probabilities: ", prob_map):
    print(line)

### ADDED THIS:
# Wrapper for filter
def run_this(message):
    """Score one tokenized message, or every message of a corpus, with prob_map.

    Args:
        message: Either a single message (a list of tokens) or a corpus
            (a list whose first element is itself a list).

    Returns:
        For a corpus, a dictionary mapping "Message <index>" to the value
        is_spam returns for that message. Otherwise the value is_spam returns
        for the single message.
    """
    # An empty input has no first element to inspect, so it is scored as a
    # single empty message instead of raising IndexError.
    if message and isinstance(message[0], list):
        # Display probability for each message
        return {
            "Message " + str(index): is_spam(text, prob_map)
            for index, text in enumerate(message)
        }

    return is_spam(message, prob_map)


# Score a handful of sample inputs and print the result under a heading.
demo_messages = [
    ("Spam:", ["spam", "spamiam"]),
    ("Ham:", ["green", "eggs"]),
    ("Mix Ham and Spam:", ["spam", "ham"]),
    ("Unseen word:", ["hehe"]),
    ("Unseen word with ham:", ["do", "hehe"]),
    ("Unseen word with spam:", ["spam", "hehe"]),
    ("Spam message 1:", spam_corpus[0]),
    ("Spam message 2:", spam_corpus[1]),
    ("Non-Spam message 1:", ham_corpus[0]),
    ("Non-Spam message 2:", ham_corpus[1]),
    ### ADDED THIS: whole corpora are scored message by message
    ("Spam corpus:", spam_corpus),
    ("Ham corpus:", ham_corpus),
]

for heading, sample in demo_messages:
    print(heading)
    print(run_this(sample))

print("DONE")

# TODO
# Actual spam mail message with data from my spam inbox (including a token filter)
# Actual working system (with aggregate filter additions and filter reruns)

Word Probabilities: 
{'am': 0.99, 'spamiam': 0.99, 'ham': 0.01, 'eggs': 0.01, 'green': 0.01, 'like': 0.3333333333333333, 'not': 0.99, 'and': 0.01, 'that': 0.99, 'spam': 0.99, 'do': 0.3333333333333333, 'i': 0.5}
Spam:
0.9998979800040808
Ham:
0.00010201999591920018
Mix Ham and Spam:
0.4999999999999997
Unseen word:
0.4
Unseen word with ham:
0.25
Unseen word with spam:
0.9850746268656716
Spam message 1:
0.9999999895897965
Spam message 2:
0.999995877576386
Non-Spam message 1:
2.6025508824397714e-09
Non-Spam message 2:
0.3333333333333333
Spam corpus:
{'Message 0': 0.9999999895897965, 'Message 1': 0.999995877576386}
Ham corpus:
{'Message 0': 2.6025508824397714e-09, 'Message 1': 0.3333333333333333}
DONE
