# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [22]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})

#data = data.append(dataFrameFromDirectory('../../MLCourse/emails/spam', 'spam'))
#data = data.append(dataFrameFromDirectory('../../MLCourse/emails/ham', 'ham'))

# assign numeric values, to represent spam and ham
#   let spam==0 and ham==1
data = data.append(dataFrameFromDirectory('../../MLCourse/emails/spam', '0'))
data = data.append(dataFrameFromDirectory('../../MLCourse/emails/ham', '1'))

Let's have a look at that DataFrame:

In [23]:
data.head()

Unnamed: 0,message,class
../../MLCourse/emails/spam/00249.5f45607c1bffe89f60ba1ec9f878039a,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",0
../../MLCourse/emails/spam/00373.ebe8670ac56b04125c25100a36ab0510,ATTENTION: This is a MUST for ALL Computer Use...,0
../../MLCourse/emails/spam/00214.1367039e50dc6b7adb0f2aa8aba83216,This is a multi-part message in MIME format.\n...,0
../../MLCourse/emails/spam/00210.050ffd105bd4e006771ee63cabc59978,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,0
../../MLCourse/emails/spam/00033.9babb58d9298daa2963d4f514193d7d6,This is the bottom line. If you can GIVE AWAY...,0


## Modify Data for Train Test Split

Need to test classifier without predetermined `spam` and `ham` labels.

In [24]:
from sklearn.model_selection import train_test_split


vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

classifier = MultinomialNB()
targets = data['class'].values

# split data up into training and test sets
X_train, X_test, y_train, y_test = train_test_split(counts, targets, test_size=0.33, random_state=42)

classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Let's try it out:

In [36]:
# make test data, to test classifier
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]

# transform test data to form of original data
example_counts = vectorizer.transform(examples)

# apply classifier to transformed data
predictions = classifier.predict(example_counts)

#predictions

# convert predictions, from numerical values, back to terms 'spam and 'ham'
predicted_names = []
for val in predictions:
    if val == str(0):
        predicted_names.append('spam')
    elif val == str(1):
        predicted_names.append('ham')

# check results
predicted_names

['spam', 'ham']

## Now Check Classifier on Test Data

See if our classifier properly categorizes the test data we left out of training.

In [56]:
#X_test, y_test 

# transform test data to form of original data
#example_counts = vectorizer.transform(examples)

# apply classifier to transformed data
predictions = classifier.predict(X_test)

#predictions

# initialize counters, to see how many test data points are categorized wrong or correctly
N_correct, N_wrong = 0, 0

if len(y_test) != len(predictions):
    print('error: array lengths do not match')
else:
    for i in range(len(y_test)):
        if y_test[i] == predictions[i]:
            N_correct += 1
        else:
            N_wrong += 1

print('number of test datum catrgorized correctly: ', N_correct)
print('number of test datum catrgorized incorrectly: ', N_wrong)
print('of all ', (N_wrong+N_correct), ' test data point, ', N_correct, ' were categorized correctly; leading to a ratio of: ', (N_correct/(N_wrong+N_correct)))

number of test datum catrgorized correctly:  949
number of test datum catrgorized incorrectly:  41
of all  990  test data point,  949  were categorized correctly; leading to a ratio of:  0.9585858585858585
