# Coding a spam classifier with naive Bayes

### 1. Imports and pre-processing data

We load the data into a pandas DataFrame (the equivalent Turi Create SFrame code is left commented out), and then preprocess it by adding a column with the list of distinct (non-repeated) words in each email.

In [None]:
import turicreate
import numpy as np

In [None]:
import pandas as pd
# Load the email dataset from a local CSV file into a pandas DataFrame.
emails = pd.read_csv('./emails.csv')

In [None]:
#emails = turicreate.SFrame('./emails.csv')

In [None]:
emails[:10]

In [None]:
def process_email(text):
    """Return the distinct whitespace-separated words of ``text`` as a list.

    Duplicates are removed by passing the words through a set, so the order
    of the returned list is not guaranteed.
    """
    unique_words = set(text.split())
    return list(unique_words)

# Add a 'words' column holding the list of distinct words of each email's text.
emails['words'] = emails['text'].apply(process_email)

In [None]:
#emails['word_count'] = turicreate.text_analytics.count_words(emails['text'])

In [None]:
emails[:10]

In [None]:
# Fraction of all emails whose 'spam' label equals 1, i.e. the overall
# probability that an email is spam.
1.0 * (emails['spam'] == 1).sum() / len(emails)

### 2. Coding Naive Bayes

We start by counting how many spam and ham emails contain a given word.

We check for the words 'money' and 'easy'.

In [None]:
def count_spam_ham(word):
    """Count how many spam and ham emails contain ``word``.

    Scans every row of the global ``emails`` DataFrame and checks ``word``
    against that row's 'words' entry.

    Returns:
        A dict ``{'spam': n_spam, 'ham': n_ham}`` with the number of emails
        of each kind that contain the word.
    """
    tally = {'spam': 0, 'ham': 0}
    for _, row in emails.iterrows():
        if word not in row['words']:
            continue
        label = 'spam' if row['spam'] else 'ham'
        tally[label] += 1
    return tally

# Alternative version for when each email stores its words in a 'word_count' dictionary
'''
def count_spam_ham(word):
    email_count = {'spam': 0, 'ham': 0}
    for email in emails:
        if word in email['word_count']:
            if email['spam']:
                email_count['spam'] += 1
            else:
                email_count['ham'] += 1
    return email_count
'''

In [None]:
print(count_spam_ham('money'))
print(count_spam_ham('easy'))

Now we make a function that takes a list of words. The naive Bayes algorithm goes over all these words and multiplies together, separately, the number of spam emails and the number of ham emails that contain each of them. Finally, it normalizes these two products and returns the probability that an email containing those words is spam.

In [None]:
def prob_spam_bayes(word):
    """Return the probability that an email is spam given that it contains ``word``.

    The probability is the fraction of emails containing the word that are
    spam. If no email contains the word, there is no evidence either way and
    0.5 is returned.
    """
    # count_spam_ham returns a dict; look the counts up by key. Tuple-unpacking
    # the dict would yield its keys ('spam', 'ham') rather than the counts.
    counts = count_spam_ham(word)
    spam = counts['spam']
    ham = counts['ham']
    if spam == 0 and ham == 0:
        return 0.5
    return 1.0*spam/(spam+ham)

In [None]:
def prob_spam_naive_bayes(words):
    """Return the naive Bayes probability that an email with ``words`` is spam.

    For every word, the number of spam and ham emails containing it is looked
    up with ``count_spam_ham``. The spam counts are multiplied together, and
    likewise the ham counts; the result is the spam product divided by the sum
    of both products.

    Args:
        words: iterable of words (strings).

    Returns:
        A float in [0, 1]. 0.5 is returned when both products are zero, and
        also for an empty ``words`` (both empty products equal 1).
    """
    from functools import reduce
    from operator import mul

    email_counts = [count_spam_ham(word) for word in words]
    # Multiply with Python integers, which have arbitrary precision; a
    # fixed-width 64-bit product silently wraps around once enough words
    # are multiplied together.
    spam = reduce(mul, (int(count['spam']) for count in email_counts), 1)
    ham = reduce(mul, (int(count['ham']) for count in email_counts), 1)
    if spam == 0 and ham == 0:
        return 0.5
    # True division of integers stays accurate even when the products are
    # too large to be converted to a float individually.
    return spam / (spam + ham)

# In case the email comes as a string
def prob_spam_naive_bayes_string(email):
    """Return the naive Bayes spam probability of an email given as a string.

    The string is split on whitespace and the resulting words are passed to
    ``prob_spam_naive_bayes``.
    """
    return prob_spam_naive_bayes(email.split())

### Testing with some sample emails
We verify that for non-spammy words, the classifier gives us small probabilities, and for spammy words it gives us large probabilities.

In [None]:
prob_spam_naive_bayes(['money', 'easy'])

In [None]:
prob_spam_naive_bayes(['mom','friend','school'])

In [None]:
prob_spam_naive_bayes(['prince','viagra'])

In [None]:
prob_spam_naive_bayes_string('hi mom how are you please buy apples')

In [None]:
prob_spam_naive_bayes_string('buy cheap viagra get lottery')

In [None]:
prob_spam_naive_bayes_string('enter in the lottery now win three million dollars')

In [None]:
prob_spam_naive_bayes_string('lets meet at the hotel lobby at nine am tomorrow')

In [None]:
prob_spam_naive_bayes_string('hi mom make easy money')

In [None]:
prob_spam_naive_bayes_string('hi mom')

In [None]:
prob_spam_naive_bayes_string('make easy money')

### 3. Training an efficient model

Our plan is to build a dictionary that records, for every word, the pair of counts of its occurrences in spam and ham emails.

In [None]:
# Maps each word to {'spam': count, 'ham': count}.
model = {}

# Training process: for every email, bump the spam or ham counter of each of
# its words. Counters start at 1 the first time a word is seen, so neither
# count is ever zero.
for index, email in emails.iterrows():
    # The label is the same for every word of this email, so pick it once.
    label = 'spam' if email['spam'] else 'ham'
    for word in email['words']:
        # setdefault inserts the initial counters only for unseen words and
        # returns the word's counters in either case.
        counts = model.setdefault(word, {'spam': 1, 'ham': 1})
        counts[label] += 1

In [None]:
model

In [None]:
model['lottery']

In [None]:
model['sale']

In [None]:
def predict_bayes(word):
    """Return the fraction of the trained counts for ``word`` that are spam.

    Reads the spam and ham counts of ``word`` from the global ``model``
    dictionary; raises ``KeyError`` if the word is not in the model.
    """
    entry = model[word]
    spam_count = entry['spam']
    ham_count = entry['ham']
    total = spam_count + ham_count
    return 1.0*spam_count/total

In [None]:
predict_bayes('lottery')

In [None]:
predict_bayes('sale')

In [None]:
def predict_naive_bayes(email):
    """Return the naive Bayes probability that ``email`` (a string) is spam.

    The email is split on whitespace into distinct words. For each word
    present in the global ``model``, its spam and ham counts are multiplied
    into two running products; words not in the model are ignored. The result
    is the spam product divided by the sum of both products.

    Returns:
        A float in [0, 1]. If none of the words are in the model, both
        products are 1 and the result is 0.5.
    """
    from functools import reduce
    from operator import mul

    words = set(email.split())
    spams = []
    hams = []
    for word in words:
        if word in model:
            spams.append(model[word]['spam'])
            hams.append(model[word]['ham'])
    # Multiply with Python integers (arbitrary precision). np.prod works in
    # fixed 64-bit precision and wraps around silently, and np.long is not
    # available as an alias of the builtin int in recent NumPy releases.
    prod_spams = reduce(mul, (int(count) for count in spams), 1)
    prod_hams = reduce(mul, (int(count) for count in hams), 1)
    # True division of integers stays accurate even for very large products.
    return prod_spams / (prod_spams + prod_hams)

In [None]:
predict_naive_bayes('hi mom how are you')

In [None]:
predict_naive_bayes('enter the lottery to win three million dollars')

In [None]:
predict_naive_bayes('meet me at the lobby of the hotel at nine am')

In [None]:
predict_naive_bayes('buy cheap lottery easy money now')