# Naive Bayes

In [32]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

In [18]:
def make_Dictionary(root_dir, max_words=3000):
    """Build a vocabulary of the most frequent words in a directory of mails.

    Every entry of ``root_dir`` is opened as a text file and split on
    whitespace.  A token is counted only if it is purely alphabetic
    (``str.isalpha``) and longer than one character.

    Parameters
    ----------
    root_dir : str
        Directory whose entries are the mail files to scan.
    max_words : int or None, optional
        Maximum number of vocabulary entries to keep (default 3000, the
        value that was previously hard-coded).  ``None`` keeps every word.

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs ordered from most to least frequent, as
        produced by ``Counter.most_common``.
    """
    # os.listdir returns entries in arbitrary order.  Words with equal counts
    # are ranked by first insertion, so the read order decides which tied
    # words make the cut and which index each word gets.  Sorting the paths
    # makes the vocabulary reproducible across machines and runs.
    mail_paths = sorted(os.path.join(root_dir, name) for name in os.listdir(root_dir))

    word_counts = Counter()
    for mail_path in mail_paths:
        with open(mail_path) as mail:
            for line in mail:
                # Filter while counting instead of collecting every token in
                # one big list and deleting the unwanted keys afterwards.
                word_counts.update(
                    word for word in line.split()
                    if word.isalpha() and len(word) > 1
                )

    return word_counts.most_common(max_words)

In [19]:
# Directories holding the training and test mails (relative paths).
TRAIN_DIR = "data/train-mails"
TEST_DIR = "data/test-mails"

# Build the vocabulary from the training mails only; extract_features reads
# this module-level name when it maps words to feature columns.
dictionary = make_Dictionary(TRAIN_DIR)

In [20]:
# Number of (word, count) entries kept in the vocabulary.
len(dictionary)

3000

In [26]:
def extract_features(mail_dir, vocabulary=None, num_features=3000):
    """Turn a directory of mails into a word-count matrix and a label vector.

    Only the third line of each file (line index 2) is tokenised -- presumably
    the message body; confirm against the data set layout.  For every token on
    that line that is in the vocabulary, the matching column is set to the
    number of times the token occurs on the line.

    Parameters
    ----------
    mail_dir : str
        Directory whose entries are the mail files to process.
    vocabulary : sequence of (str, int), optional
        ``(word, count)`` pairs as returned by ``make_Dictionary``; a word's
        position in the sequence is its column index.  Words are expected to
        be unique.  When omitted, the notebook-level ``dictionary`` is used,
        which is what this function always did before.
    num_features : int, optional
        Number of columns in the returned matrix (default 3000, the value
        that was previously hard-coded).

    Returns
    -------
    features_matrix : numpy.ndarray, shape (n_files, num_features)
        Per-file word counts.
    train_labels : numpy.ndarray, shape (n_files,)
        1 for files whose name starts with ``"spmsg"``, otherwise 0.

    Raises
    ------
    IndexError
        If a token on the third line maps to a column >= ``num_features``.
    """
    if vocabulary is None:
        # Backward-compatible default: the vocabulary built in the notebook.
        vocabulary = dictionary

    # Build the word -> column mapping once, instead of scanning the whole
    # vocabulary for every token of every mail.
    word_to_column = {entry[0]: column for column, entry in enumerate(vocabulary)}

    # Sorted so that row order does not depend on os.listdir's arbitrary order.
    files = sorted(os.path.join(mail_dir, name) for name in os.listdir(mail_dir))
    features_matrix = np.zeros((len(files), num_features))
    train_labels = np.zeros(len(files))

    for doc_id, path in enumerate(files):
        with open(path) as mail:
            for line_no, line in enumerate(mail):
                if line_no != 2:
                    continue
                for word, occurrences in Counter(line.split()).items():
                    column = word_to_column.get(word)
                    if column is not None:
                        features_matrix[doc_id, column] = occurrences
                break  # nothing after the third line is used

        # os.path.basename honours the platform's separator; splitting on '/'
        # never isolated the file name on Windows, so every mail was labelled 0.
        if os.path.basename(path).startswith("spmsg"):
            train_labels[doc_id] = 1

    return features_matrix, train_labels

In [27]:
print("reading and processing emails from file.")
# Both calls go through the same extract_features function, so the training
# and test matrices share one column layout.
features_matrix, labels = extract_features(TRAIN_DIR)
test_feature_matrix, test_labels = extract_features(TEST_DIR)

reading and processing emails from file.


* Gaussian: It is used in classification and it assumes that features follow a normal distribution.
* Multinomial: It is used for discrete counts. For example, in a text classification problem we can go one step beyond Bernoulli trials: instead of recording whether a “word occurs in the document”, we “count how often the word occurs in the document”. You can think of it as the “number of times outcome number x_i is observed over the n trials”.
* Bernoulli: The Bernoulli model is useful if your feature vectors are binary (i.e. zeros and ones). One application would be text classification with a ‘bag of words’ model where the 1s & 0s are “word occurs in the document” and “word does not occur in the document” respectively.

In [29]:

# Gaussian naive Bayes: treats every feature as normally distributed.
print("Training model.")
model = GaussianNB().fit(features_matrix, labels)

# Classify the held-out test mails with the fitted model.
predicted_labels = model.predict(test_feature_matrix)

# Header and score are printed on separate lines.
print(
    "FINISHED classifying. accuracy score : ",
    accuracy_score(test_labels, predicted_labels),
    sep="\n",
)

Training model.
FINISHED classifying. accuracy score : 
0.9653846153846154


In [33]:

# Multinomial naive Bayes: intended for discrete count features such as
# per-document word counts.
print("Training model.")
model = MultinomialNB().fit(features_matrix, labels)

# Classify the held-out test mails with the fitted model.
predicted_labels = model.predict(test_feature_matrix)

# Header and score are printed on separate lines.
print(
    "FINISHED classifying. accuracy score : ",
    accuracy_score(test_labels, predicted_labels),
    sep="\n",
)

Training model.
FINISHED classifying. accuracy score : 
0.9615384615384616


In [34]:
# Bernoulli naive Bayes: intended for binary "word present / absent"
# features; the count matrix is passed to it unchanged.
print("Training model.")
model = BernoulliNB().fit(features_matrix, labels)

# Classify the held-out test mails with the fitted model.
predicted_labels = model.predict(test_feature_matrix)

# Header and score are printed on separate lines.
print(
    "FINISHED classifying. accuracy score : ",
    accuracy_score(test_labels, predicted_labels),
    sep="\n",
)

Training model.
FINISHED classifying. accuracy score : 
0.7653846153846153
