In [1]:
"""
Naive Bayes multi-class classifier inplemented from scratch.
Handles zero frequency corrections/smoothing.
"""

'\nNaive Bayes multi-class classifier inplemented from scratch.\nHandles zero frequency corrections/smoothing.\n'

### 0. Setup

In [3]:
import collections
import numpy
import pandas
import pathlib

### 1. Imports and pre-processing data

We load the data into a Pandas DataFrame, then we preprocess it by adding a column with the (non-repeated) lowercase words in the email.

In [5]:
# Environment variables
dir_path = pathlib.Path.cwd()
name_dataset = "emails.csv"

column_emails = "text"
column_words = "words"
column_label = "spam"
label_spam = 1
label_ham = 0

# Read dataset
emails = pandas.read_csv(dir_path / name_dataset)
emails[:10]

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [6]:
# Helpers (Preprocess) =========================================

def split_string_into_unique_words(string):
    return list(set(string.split()))

def process_series_email(series_text):
    """ Converts text to lower-case then returns list of unique words """
    series_words = series_text.copy() # copies original series
    series_words = series_words.str.lower()
    series_words = series_words.apply(split_string_into_unique_words)

    return series_words

In [8]:
emails[column_words] = process_series_email(emails[column_emails])
emails[:10]

Unnamed: 0,text,spam,words
0,Subject: naturally irresistible your corporate...,1,"[information, content, market, marketing, brea..."
1,Subject: the stock trading gunslinger fanny i...,1,"[attainder, yes, palfrey, no, is, segovia, pep..."
2,Subject: unbelievable new homes made easy im ...,1,"[at, way, all, complete, 3, 72, no, dorcas, is..."
3,Subject: 4 color printing special request add...,1,"[information, &, pdf, rd, is, format, printabl..."
4,"Subject: do not have money , get software cds ...",1,"[get, cds, all, be, grow, it, old, is, marriag..."
5,"Subject: great nnews hello , welcome to medzo...",1,"[shakedown, miilion, countries, op, is, andman..."
6,Subject: here ' s a hot play in motion homela...,1,"["", aiways, sec, into, applications, rf, risk,..."
7,Subject: save your money buy getting this thin...,1,"[any, get, bed, country, can, be, start, it, i..."
8,Subject: undeliverable : home based business f...,1,"[l, recognized, for, -, 21, q, ;, ims, based, ..."
9,Subject: save your money buy getting this thin...,1,"[any, get, bed, country, can, be, start, it, i..."


### 2. Calculating prior probabilities
Let's calculate the probabilities of seeing a ham or spam email from just the labeled data.

In [14]:
# Helpers (Priors) ========================================

def calculate_frequency_average(series):
    """ Calculates probabilites of occurences of each label out of the entire set """
    try:
        series_averages = series.value_counts() / len(series)
        return series_averages.to_dict()
    except ZeroDivisionError as exception:
        raise exception

In [15]:
# Our label column is boolean, with spam being 1 and ham being 0.
num_emails = len(emails)
counts_label = emails[column_label].value_counts()
num_spam = counts_label[label_spam]
print(counts_label)

print("Number of emails:", num_emails)
print("Number of spam emails:", num_spam)

# Calculating the prior probability an email is spam.
dict_priors = calculate_frequency_average(emails[column_label])
print("Probability of spam:", dict_priors[label_spam])

0    4360
1    1368
Name: spam, dtype: int64
Number of emails: 5728
Number of spam emails: 1368
Probability of spam: 0.2388268156424581


### 3. Training a Naive Bayes model
We need to calculate the text's word frequencies in order to train the model. Our plan is to write a dictionary that records every word, and calculate its pair of occurrences in spam and ham. Sometimes, if we train on new text, we may see a word that we haven't seen before. In order for the math to check out (avoid dividing by zero), we may have to add a tiny number, and we'll use the whole text to cook up this number. 

In [17]:
# Helpers (Dictionaries) =========================================

def merge_dicts(dict1, dict2):
    dictionary = dict1.copy()
    dictionary.update(dict2)
    return dictionary

def construct_frequency_dict_from_series(series_text):
    """ Converts text to lower-case then returns list of unique words """
    series_list_words = series_text.str.lower()
    series_list_words = series_list_words.str.split()
    series_list_words = series_list_words.explode()
    series_counts = series_list_words.value_counts()
    
    return series_counts.to_dict()

def construct_frequency_dict_from_strings(list_strings):
    """ Converts text to lower-case then returns list of unique words """
    string = " ".join(list_strings)
    string = string.lower()
    list_words = string.split()

    return dict(collections.Counter(list_words))

# Model =========================================
def calculate_labeled_frequencies(dict_frequencies_text, dataframe_emails, column_label, column_words):
    """ 
    Constructs a frequency dictionary for list of words 
    Uses a processed dataframe with list words column
    
    Handles zero frequency occurences by adding n(w) / total,
    in which n(w) is the number of occurences of the word
    across all text, and total is the total number of words
    in the text.
    
    Parameter
    ----------
    dict_frequencies_text = { word : n(word) }
    
    """
    list_labels = dataframe_emails[column_label].unique()
    total = sum(dict_frequencies_text.values())
    
    # Doing it this way avoids copy errors with nested dictionaries
    model = {}
    for word in dict_frequencies_text.keys():
        model.setdefault(word, {label : 0 for label in list_labels })

    # Split label column into groups so we can count them directly
    group_labels = dataframe_emails.groupby(column_label)
    for label, label_df in group_labels:
        for list_words in label_df[column_words]:
            for word in list_words:

                model[word][label] += 1
                
    # Handles the zero frequency offset
    for word, dict_frequency in model.items():
        offset = max(dict_frequencies_text[word] / total, 1E-8)
        for key in dict_frequency.keys():
            dict_frequency[key] += offset

    return model

In [19]:
# Calculate text word frequencies and train the model
dict_frequencies_whole_text = construct_frequency_dict_from_series(emails[column_emails])
dict_model = calculate_labeled_frequencies(dict_frequencies_whole_text, emails, column_label, column_words)

In [28]:
# Some examples (1 is spam, and 0 is ham)
print(dict_model['lottery'])
print(dict_model['sale'])
print(dict_model['already'])

{1: 8.000010682682143, 0: 1.0682682143736557e-05}
{1: 38.00005661821536, 0: 41.00005661821536}
{1: 64.0002382238118, 0: 317.0002382238118}


### 3. Using the model to make predictions

In [20]:
def predict_bayes(word, label, dict_frequencies):
    """ 
    Doesn't use the naive assumption.
    likelihood = (num labeled emails with word) / sum(num labeled emails with word for all labels)
               = P(A | Event_j) / sum ( P (A | Event_i) )
    """
    label_count = dict_frequencies[word][label]
    all_label_counts = sum(dict_frequencies[word].values())

    try:
        return label_count / all_label_counts
    except ZeroDivisionError:
        return 0

In [31]:
print(predict_bayes('lottery', label_spam, dict_model))
print(predict_bayes('sale', label_spam, dict_model))
print(predict_bayes('already', label_spam, dict_model))

0.9999986646682983
0.4810126854437434
0.1679794178226178


In [32]:
# Helpers (Probabilities)

def calculate_list_product(list_):
    """ Slightly faster to work on arrays than directly with list """
    return numpy.array(list_).prod()

# Naive Bayes Classifier

def setup_naive_bayes(dict_frequencies_new_text, dataframe_emails, column_label, column_words):
    """ 
    Adds new text to the model, so it can be used to make new predictions 
    This is mostly just an accumulation of the previous cells.
    """
    
    dict_frequencies_whole_text = construct_frequency_dict_from_series(dataframe_emails[column_emails])

    dict_model = calculate_labeled_frequencies(
        merge_dicts(dict_frequencies_whole_text, dict_frequencies_new_text), 
        dataframe_emails, 
        column_label, 
        column_words)

    return dict_model

def calculate_naive_bayes(list_words, dict_frequencies, series_labels):
    list_labels = series_labels.unique()
    counts_label = series_labels.value_counts()
    total = len(series_labels)
    
    # Cook up the total number of emails in each label
    dict_naive_bayes = { label : 1 for label in list_labels }
    for word in list_words:
        for label in list_labels:
            probability = dict_frequencies[word][label] / counts_label[label]            
            if probability == 0:
                print(word)
            dict_naive_bayes[label] *= (probability * total)

    # Multiply by the total number of elements for each label 
    for label in list_labels:
        dict_naive_bayes[label] *= counts_label[label]
    
    return dict_naive_bayes

def predict_naive_bayes(email, label_to_predict, dict_frequencies, series_labels):
    """ Uses the naive assumption to predict on a given email """

    # words
    email = email.lower()
    words = set(email.split())

    dict_naive_bayes = calculate_naive_bayes(words, dict_frequencies, series_labels)
    
    numerator = dict_naive_bayes[label_to_predict]
    denominator = sum(dict_naive_bayes.values())

    return numerator/denominator

In [35]:
list_emails = [
    "lottery sale",
    "Hi mom how are you",
    "Hi MOM how aRe yoU afdjsaklfsdhgjasdhfjklsd",
    "meet me at the lobby of the hotel at nine am",
    "enter the lottery to win three million dollars",
    "buy cheap lottery easy money now",
    "buy cheap lottery easy money"
    "Grokking Machine Learning by Luis Serrano",
    "asdfgh"]

dict_frequencies_new_words = construct_frequency_dict_from_strings(list_emails)
dict_model = setup_naive_bayes(dict_frequencies_new_words, emails, column_label, column_words)
cout = "Probability email is spam: "
for email in list_emails:
    print(cout, predict_naive_bayes(email, label_spam, dict_model, emails[column_label]))

Probability email is spam:  0.9999999005387217
Probability email is spam:  0.09520065375112553
Probability email is spam:  0.25112821625045023
Probability email is spam:  3.410728699614587e-11
Probability email is spam:  0.9999999987178892
Probability email is spam:  0.9999999999343989
Probability email is spam:  0.9999999996071451
Probability email is spam:  0.5000000000000001


### 4. Do our results make sense?
The "Grokking Machine Learning by Luis Serrano" classification was surprising. Or was it? Let's check how often a word like "serrano" appears in spam emails.

In [36]:
print(dict_model['serrano'])
print(predict_bayes('serrano', label_spam, dict_model))

{1: 1.0000005876034477, 0: 5.876034475869477e-07}
0.9999994123972429


Hmm, that seeems pretty high. But, if we look closer at the training data, the following email was labaled spam and has "serrano"!

> Subject: important announcement : your application was approved  we tried to contact you last week about refinancing your home at a lower rate .  i would like to inform you know that you have been pre - approved .  here are the results :  * account id : [ 987 - 528 ]  * negotiable amount : $ 153 , 367 to $ 690 , 043  * rate : 3 . 70 % - 5 . 68 %  please fill out this quick form and we will have a broker contact you as soon as possible .  regards ,  shannon **serrano** senior account manager  lyell national lenders , llc .  database deletion :  www . lend - bloxz . com / r . php

Talk about bad luck