# Naive Bayes

A naive Bayes algorithm with a sample application to spam detection.

Naive Bayes classifiers are a class of simple probabilistic classifiers built on Bayes' theorem:

 $$ P(A|B) = \frac{P(B|A)P(A)}{P(B)}.$$
 
The idea is that the probability of class membership is given by the probabilities of the features an item possesses, where the features are (naively) assumed to be *independent* of one another given the class.  Despite their simplicity, naive Bayes models are typically very accurate, and they are highly scalable: fitting takes time linear in the size of the training data.

In [12]:
import re, glob, random, math
import import_ipynb
import ml_tools
from collections import defaultdict, Counter

In [2]:
def tokenize(text):
    """Return the set of distinct lowercase words found in *text*.

    A "word" is a maximal run of the characters a-z and 0-9 after the
    text has been lowercased; everything else acts as a separator.
    Because a set is returned, repeated words are reported only once.
    """
    lowered = text.lower()
    return {word for word in re.findall('[a-z0-9]+', lowered)}

In [3]:
def count_words(training_set):
    """Count, per word, how many spam and non-spam texts contain it.

    training_set consists of pairs (text, is_spam).

    Returns a defaultdict mapping each word to a two-element list
    [spam_count, non_spam_count].  Each text contributes at most one
    count per word because tokenize() yields distinct words.
    """
    # Unseen words start at [0, 0]: index 0 is spam, index 1 is non-spam.
    tallies = defaultdict(lambda: [0, 0])
    for text, is_spam in training_set:
        column = 0 if is_spam else 1
        for token in tokenize(text):
            tallies[token][column] += 1
    return tallies

In [4]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Turn word counts into a list of triplets (w, p(w|spam), p(w|~spam)).

    counts maps each word to a (spam_count, non_spam_count) pair.
    k is the smoothing pseudo-count: each probability is computed as
    (count + k) / (total + 2k), so a word never seen in one class still
    gets a non-zero probability there when k > 0.
    """
    spam_denominator = total_spams + 2.0 * k
    non_spam_denominator = total_non_spams + 2.0 * k

    triplets = []
    for word, (in_spam, in_non_spam) in counts.items():
        p_word_given_spam = (in_spam + k) / spam_denominator
        p_word_given_non_spam = (in_non_spam + k) / non_spam_denominator
        triplets.append((word, p_word_given_spam, p_word_given_non_spam))
    return triplets


In [5]:
def spam_probability(word_probs, message):
    """Return the naive Bayes estimate of P(spam | message).

    word_probs is an iterable of triplets (word, p(word|spam), p(word|~spam)),
    as produced by word_probabilities().  Every word in the vocabulary
    contributes: the probability of seeing it if it occurs in the message,
    or the probability of _not_ seeing it otherwise.  Both classes are
    weighted equally (no class prior is applied).

    Each probability must lie strictly between 0 and 1, otherwise
    math.log raises ValueError.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:

        # for each word in the message,
        # add the log probability of seeing it
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)

        # for each word that's not in the message
        # add the log probability of _not_ seeing it
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # With a = log P(words|spam) and b = log P(words|~spam), the result
    #     e^a / (e^a + e^b)  ==  1 / (1 + e^(b - a)).
    # Exponentiating a and b separately underflows to 0.0 for large
    # vocabularies (giving 0/0), so work with the difference instead.
    log_odds = log_prob_if_spam - log_prob_if_not_spam

    # Always exponentiate a non-positive number so math.exp cannot overflow;
    # underflow to 0.0 is harmless here.
    if log_odds >= 0:
        return 1.0 / (1.0 + math.exp(-log_odds))
    odds = math.exp(log_odds)
    return odds / (1.0 + odds)

In [6]:
class NaiveBayesClassifier:
    """Spam classifier wrapping the count -> probability -> score pipeline.

    k is the smoothing pseudo-count handed to word_probabilities().
    word_probs holds the (word, p(word|spam), p(word|~spam)) triplets
    learned by train(); it is empty until train() has been called.
    """

    def __init__(self, k=0.5):
        self.word_probs = []
        self.k = k

    def train(self, training_set):
        """Learn word probabilities from (message, is_spam) pairs.

        training_set must support len() and be iterable more than once.
        """
        # class totals: spam messages first, the remainder is non-spam
        spam_total = sum(1 for _, is_spam in training_set if is_spam)
        non_spam_total = len(training_set) - spam_total

        # per-word counts, then smoothed per-class probabilities
        self.word_probs = word_probabilities(
            count_words(training_set), spam_total, non_spam_total, self.k
        )

    def classify(self, message):
        """Return the estimated probability that *message* is spam."""
        return spam_probability(self.word_probs, message)

In [13]:
def get_subject_data(path):
    """Collect (subject, is_spam) pairs from the files matching *path*.

    path is a glob pattern.  A file is labelled spam unless the substring
    "ham" appears anywhere in its matched path.  Every line that starts
    with "Subject:" yields one pair, so a file may contribute several.
    Files are decoded as ISO-8859-1.
    """
    # strips the leading "Subject:" together with the whitespace after it
    prefix_pattern = re.compile(r"^Subject:\s+")

    records = []
    for filename in glob.glob(path):
        spam_flag = "ham" not in filename
        with open(filename, 'r', encoding='ISO-8859-1') as handle:
            # extend() consumes the generator while the file is still open
            records.extend(
                (prefix_pattern.sub("", row).strip(), spam_flag)
                for row in handle
                if row.startswith("Subject:")
            )
    return records

def p_spam_given_word(word_prob):
    """Return p(word|spam) / (p(word|spam) + p(word|~spam)) for one triplet.

    word_prob is a (word, p(word|spam), p(word|~spam)) triplet; the word
    itself is ignored.
    """
    _, given_spam, given_non_spam = word_prob
    total = given_spam + given_non_spam
    return given_spam / total

def train_and_test_model(path):
    """Train a classifier on subject lines under *path* and print a report.

    The data is loaded with get_subject_data(), split with
    ml_tools.split_data(data, 0.75) (presumably 75% train / 25% test; the
    helper is defined outside this file), and used to train a
    NaiveBayesClassifier.  Printed, in order:
      * a Counter keyed by (actual is_spam, predicted spam) where a
        message is predicted spam when its score exceeds 0.5,
      * the five highest-scoring non-spam and five lowest-scoring spam
        test messages,
      * the five words with the highest and lowest p_spam_given_word.
    """
    labelled_subjects = get_subject_data(path)
    random.seed(0)      # fixed seed so the split is reproducible
    train_data, test_data = ml_tools.split_data(labelled_subjects, 0.75)

    model = NaiveBayesClassifier()
    model.train(train_data)

    # (subject, actual label, predicted spam probability)
    scored = []
    for subject, is_spam in test_data:
        scored.append((subject, is_spam, model.classify(subject)))

    # confusion counts keyed by (actual, predicted)
    confusion = Counter()
    for _, actual, score in scored:
        confusion[(actual, score > 0.5)] += 1

    print(confusion)

    # ascending by predicted spam probability
    ranked = sorted(scored, key=lambda entry: entry[2])
    ham_rows = [entry for entry in ranked if not entry[1]]
    spam_rows = [entry for entry in ranked if entry[1]]

    print("spammiest_hams", ham_rows[-5:])
    print("hammiest_spams", spam_rows[:5])

    ranked_words = sorted(model.word_probs, key=p_spam_given_word)

    print("spammiest_words", ranked_words[-5:])
    print("hammiest_words", ranked_words[:5])

# Run the whole train/evaluate pipeline on every path matching ./email_data/*/*
# (entries two levels below ./email_data); the report is printed below.
train_and_test_model(r"./email_data/*/*")

Counter({(False, False): 710, (True, True): 85, (True, False): 49, (False, True): 32})
spammiest_hams [('Species at risk of extinction growing', False, 0.9041860801409064), ('Cell phones coming soon', False, 0.9695274846052802), ('Save up to 70% on international calls!', False, 0.9713530671270301), ('Adam dont job for no one, see.', False, 0.9779303466671266), ('2000+ year old Greek computer reinterpreted', False, 0.9787959553367905)]
hammiest_spams [('I was so scared... my very first DP', True, 3.3740549396526264e-05), ('Re: Hi', True, 0.0011506162534143066), ('*****SPAM*****', True, 0.002331869694849285), ('http://www.efi.ie/', True, 0.008502573360982467), ('Outstanding Opportunities for "Premier Producers"', True, 0.009155648659073444)]
spammiest_words [('year', 0.02837837837837838, 0.0002294630564479119), ('zzzz', 0.02837837837837838, 0.0002294630564479119), ('money', 0.033783783783783786, 0.0002294630564479119), ('systemworks', 0.033783783783783786, 0.0002294630564479119), ('adv',