# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [72]:
import io
import os

from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dir_names, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)
    # return DataFrame(rows)

data = DataFrame({'message': [], 'class': []})

data = data.append(dataFrameFromDirectory('emails/spam', 'spam'))
data = data.append(dataFrameFromDirectory('emails/ham', 'ham'))
data.head(10)

Unnamed: 0,message,class
emails/spam\00001.7848dde101aa985090474a91ec93fcf0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",spam
emails/spam\00002.d94f1b97e48ed3b553b3508d116e6a09,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,spam
emails/spam\00003.2ee33bc6eacdb11f38d052c44819ba6c,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,spam
emails/spam\00004.eac8de8d759b7e74154f142194282724,##############################################...,spam
emails/spam\00005.57696a39d7d84318ce497886896bf90d,I thought you might like these:\n\n1) Slim Dow...,spam
emails/spam\00006.5ab5620d3d7c6c0db76234556a16f6c1,A POWERHOUSE GIFTING PROGRAM You Don't Want To...,spam
emails/spam\00007.d8521faf753ff9ee989122f6816f87d7,Help wanted. We are a 14 year old fortune 500...,spam
emails/spam\00008.dfd941deb10f5eed78b1594b131c9266,<html>\n\n<head>\n\n<title>ReliaQuote - Save U...,spam
emails/spam\00009.027bf6e0b0c4ab34db3ce0ea4bf2edab,TIRED OF THE BULL OUT THERE?\n\nWant To Stop L...,spam
emails/spam\00010.445affef4c70feec58f9198cfbc22997,"Dear ricardo1 ,\n\n\n\n<html>\n\n<body>\n\n<ce...",spam


Let's have a look at that DataFrame:

In [15]:
data.head()

Unnamed: 0,message,class
emails/spam\00001.7848dde101aa985090474a91ec93fcf0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",spam
emails/spam\00002.d94f1b97e48ed3b553b3508d116e6a09,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,spam
emails/spam\00003.2ee33bc6eacdb11f38d052c44819ba6c,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,spam
emails/spam\00004.eac8de8d759b7e74154f142194282724,##############################################...,spam
emails/spam\00005.57696a39d7d84318ce497886896bf90d,I thought you might like these:\n\n1) Slim Dow...,spam


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [21]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Let's try it out:

In [22]:
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham'], dtype='<U4')

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [96]:
from sklearn.model_selection import train_test_split

def predict_spam(train_df:DataFrame, bodies:list):
    cv = CountVectorizer()
    clsf = MultinomialNB()
    
    message_counts = cv.fit_transform(train_df['message'].values)
    clsf.fit(message_counts, train_df['class'].values)
    pr = clsf.predict(cv.transform(bodies))
    return  DataFrame({'message': bodies, 'class': pr})

def get_test_results(train_df:DataFrame, test_df:DataFrame):
    prediction = predict_spam(train_df, test_df.message)
    messages, real, predicted = [], [], []
    for index, row in test_df.iterrows():
        p, r = prediction['class'][index], row['class']
        if r != p:
            messages.append(row['message'])
            real.append(r)
            predicted.append(p)
    return DataFrame({'message': messages, 'real': real, 'predicted': predicted})

train, test = train_test_split(data, test_size=0.2)

examples = ["Nothing", "off", "adult"]
p = get_test_results(train, test)

print(f"Truth coefficient is: {len(p)/len(test):.4f}")
p.head(20)

Truth coefficient is: 0.0267


Unnamed: 0,message,real,predicted
0,You have been removed from our list.\n\nYou wi...,spam,ham
1,HELLO...By reading the short Summary just belo...,spam,ham
2,Hi -\n\n\n\n( http://club.4tfox.com )\n\n\n\nY...,spam,ham
3,Greetings! \n\n\n\nYou are receiving this lett...,spam,ham
4,"ilug ,\n\n\n\n From;Mr.Michael Kamah and Fami...",spam,ham
5,"Hello, my name is Kelly, I am an 18 year old s...",spam,ham
6,Legal TV Descarmbler\n\n\n\nWant to watch Spor...,spam,ham
7,HABERDAR.COM - HABER VE MEDYA PORTALI\n\nArtýk...,spam,ham
8,Opportunity is knocking. Why?\n\n\n\nBecause m...,spam,ham
9,_/ _/ _/ _/ _/ _/ _/ _/ _/ _/ _/ _/ _/ _/ _/ _...,spam,ham
