## Naive Bayes Classifier for text
Classifying emails into Spam or not Spam using Naive Bayes.

#### Data Preprocessing : 
Prepare the data by opening every file and collecting all of its words into a single list (a bag of words).
We create two word lists: one for ham and one for spam.

In [3]:
import os
def processFile(eachFile):
    """Read a text file and return its whitespace-separated tokens in order.

    Args:
        eachFile: Path of the file to read.

    Returns:
        A list with every token of the file in reading order. Tokens come
        from ``str.split()`` with no arguments, so any run of whitespace
        separates tokens and blank lines contribute nothing.
    """
    wordTagList = []
    # Read-only mode: the file is never written, and "r+" would fail on
    # files the process is not allowed to modify.
    with open(eachFile, "r") as theFile:
        for eachLine in theFile:
            wordTagList.extend(eachLine.split())
    return wordTagList

def processAllFiles(directory):
    """Collect the tokens of every file in *directory* into one flat list.

    Args:
        directory: Path of the directory whose entries are read.

    Returns:
        A single list holding the tokens of each entry, concatenated in the
        order ``os.listdir`` reports the entries.
    """
    return [
        token
        for fileName in os.listdir(directory)
        for token in processFile(directory + "/" + fileName)
    ]

# Bag-of-words token lists, one per class: every token of every file under
# data/spam and data/ham respectively.
spam = processAllFiles('data/spam')
ham = processAllFiles('data/ham')

In [4]:
# Sanity check: one sample token and the total token count for each class.
print(spam[6])
print(len(spam))
print(ham[6])
print(len(ham))

detected
79088
I
82184


#### Implement the Naive Bayes :
Modify Naive Bayes to perform classification instead of returning probabilities.
We assume that every email with a >50% chance of being spam is classified as Spam; otherwise it is classified as Ham.

In [5]:
def createDictonary(dataList):
    """Count how many times each item occurs in *dataList*.

    Args:
        dataList: Iterable of hashable items (here, word tokens).

    Returns:
        A dict mapping each distinct item to its number of occurrences.
    """
    counts = {}
    for token in dataList:
        counts[token] = counts.get(token, 0) + 1
    return counts

# Word -> occurrence-count map for each class.
spam_map = createDictonary(spam)
ham_map = createDictonary(ham)

In [6]:
# Peek at one (word, count) entry. dict.items() is a non-indexable view on
# Python 3, so take the first entry through an iterator instead of [0].
print(next(iter(spam_map.items())))
print(len(spam_map))


# Number of entries in each training directory; used later as the per-class
# document counts when estimating the class priors.
spam_files_count = len(os.listdir('data/spam'))
ham_files_count = len(os.listdir('data/ham'))

# Total number of tokens per class: count(c) in the smoothing formula.
spam_word_count = len(spam)
ham_word_count = len(ham)

print(spam_files_count)
print(ham_files_count)

('raining', 1)
10894
500
500


### Laplace Smoothing on Naive Bayes
Laplace (add-one) smoothing is applied on line 11 of the cell below (the line that computes `p`).

In [7]:
# Laplace-smoothed likelihood: P(w|c) = (count(w, c) + 1) / (count(c) + |V|)
# Merge the two per-class count maps only to obtain the union of their keys;
# only the number of keys is used below.
temp_dict = spam_map.copy()
temp_dict.update(ham_map)
#print(temp_dict)
# |V|: number of distinct words across both classes.
V = len(temp_dict)

def calculateConditionalProbability(mail_map, word_count, vocab_size=None):
    """Compute Laplace-smoothed likelihoods P(word | class) for one class.

    Each probability is ``(count(word, class) + 1) / (word_count + |V|)``.

    Args:
        mail_map: Mapping of word -> number of occurrences in the class.
        word_count: Total number of word occurrences in the class.
        vocab_size: Vocabulary size |V|. When omitted, the module-level
            ``V`` is used, which is what the original implementation did.

    Returns:
        A dict mapping every word of ``mail_map`` to its smoothed
        probability. Words absent from ``mail_map`` get no entry.
    """
    if vocab_size is None:
        vocab_size = V
    # The denominator is the same for every word, so compute it once.
    denominator = float(word_count + vocab_size)
    return {word: (count + 1) / denominator for word, count in mail_map.items()}
        
# NOTE(review): the arguments look swapped relative to the variable names --
# ham_prob_map is built from the spam counts and spam_prob_map from the ham
# counts. The scoring step skips words that are missing from a class map, so
# the downstream predictions currently depend on this pairing. TODO: confirm,
# and fix together with unseen-word smoothing rather than swapping in isolation.
ham_prob_map = calculateConditionalProbability(spam_map,spam_word_count)
spam_prob_map = calculateConditionalProbability(ham_map,ham_word_count)

In [8]:
print(len(ham_prob_map))
print(len(spam_prob_map))

# dict.items() cannot be indexed on Python 3; materialise it as a list first.
print(list(ham_prob_map.items())[10])

10894
14000
('looking', 0.00038577193954261296)


In [9]:
# Pre-process the test data files.
# This call flattens every test file into one token list, so per-file
# boundaries are lost here; a per-file structure (a list of lists) is needed
# for classification.
test = processAllFiles('data/test')
print(test[6])
print(len(test))

multiple
19748


In [10]:
# Scratch check: strip() keeps the whole line as a single list element,
# unlike split(), which would produce one element per word.
strg = "This is a line"
qwe = []
qwe += [strg.strip()]
print(qwe)

['This is a line']


In [11]:
def processTestFile(eachFile):
    """Read one test file and return its whitespace-separated tokens.

    Args:
        eachFile: Path of the test file to read.

    Returns:
        A list of the file's tokens in reading order, split on any run of
        whitespace; blank lines add nothing.
    """
    wordTagList = []
    # The file is only read, so open it read-only; "r+" would require
    # write permission for no reason.
    with open(eachFile, "r") as theFile:
        for eachLine in theFile:
            wordTagList.extend(eachLine.split())
    return wordTagList

def processTestFiles(directory):
    """Tokenise every file in *directory*, keeping each file's tokens separate.

    Args:
        directory: Path of the directory holding the test files.

    Returns:
        A dict mapping each file's id -- the part of its name before the
        first "." -- to that file's token list. Two files sharing the same
        prefix map to the same key, so the later one overwrites the earlier.
    """
    finalWordTagList = {}
    for eachFile in os.listdir(directory):
        fileId = eachFile.split(".")[0]
        # Use the test-file reader defined alongside this function rather
        # than the training-data reader.
        finalWordTagList[fileId] = processTestFile(directory + "/" + eachFile)
    return finalWordTagList


In [12]:
# Rebind `test` to a per-file mapping: file id -> list of tokens.
test = processTestFiles('data/test')
# dict.items() is not indexable on Python 3; take the first entry through an
# iterator instead of [0].
print(next(iter(test.items())))
print(len(test))

('24', ['danger', 'in', 'Zimbabwe', 'This', 'land', 'problem', 'came', 'when', 'Zimbabwean', 'President', 'Mr', 'Robert', 'Mugabe', 'introduce', 'a', 'new', 'land', 'Act', 'Reform', 'wholly', 'affecting', 'the', 'rich', 'white', 'farmers', 'and', 'some', 'few', 'black', 'farmers', 'and', 'this', 'resulted', 'to', 'the', 'killing', 'and', 'mob', 'action', 'by', 'Zimbabwean', 'war', 'veterans', 'and', 'some', 'lunatics', 'in', 'the', 'society', 'In', 'fact', 'a', 'lot', 'of', 'people', 'were', 'killed', 'because', 'of', 'this', 'land', 'reform', 'Act', 'for', 'which', 'my', 'father', 'was', 'one', 'of', 'the', 'victims', 'It', 'is', 'against', 'this', 'background', 'that', 'I', 'and', 'my', 'family', 'fled', 'Zimbabwe', 'for', 'fear', 'of', 'our', 'lives', 'and', 'are', 'currently', 'staying', 'in', 'Holland', 'where', 'we', 'are', 'seeking', 'political', 'asylum', 'and', 'more', 'so', 'have', 'decided', 'to', 'transfer', 'my', 'fathers', 'money', 'to', 'a', 'more', 'reliable', 'foreign'

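### Logarithmic Values to Avoid Numerical Underflow
Multiplying many small probabilities underflows floating point, so we work with logarithms instead: the log prior and the log likelihoods are added to obtain each class's final score.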

In [13]:
# Class priors P(spam) and P(ham), estimated from the number of training
# files in each class directory.
prob_spam = float(spam_files_count)/float(spam_files_count+ham_files_count)
prob_ham = 1 - prob_spam # only two classes, so the two priors sum to 1

import math
# Log-space versions of the priors, for adding to summed log-likelihoods.
prior_spam = math.log(prob_spam)
prior_ham = math.log(prob_ham)
    
# P(class|doc) is proportional to P(class) x P(word1|class) x P(word2|class) x ... x P(word_i|class);
# the product is evaluated as a sum of logarithms.
def calculateTestConditionalProb(prior_class, class_prob_map, unseen_prob=None, test_files=None):
    """Score every test document against one class in log space.

    Args:
        prior_class: Starting score for the class. Log-likelihoods are added
            to it, so it should be the logarithm of the class prior.
        class_prob_map: Mapping of word -> P(word | class).
        unseen_prob: Probability charged for a word that is missing from
            ``class_prob_map``. ``None`` (the default) skips such words,
            which is the original behaviour; skipping adds nothing to the
            score, so a class that knows fewer of a document's words is not
            penalised for them.
        test_files: Mapping of document id -> list of words. Defaults to the
            module-level ``test`` mapping.

    Returns:
        A dict mapping each document id to its score for the class.
    """
    if test_files is None:
        test_files = test
    log_unseen = None if unseen_prob is None else math.log(unseen_prob)
    test_prob_map = {}
    for file_no, words in test_files.items():
        score = prior_class
        for each_word in words:
            word_prob = class_prob_map.get(each_word)
            if word_prob is not None:
                score += math.log(word_prob)
            elif log_unseen is not None:
                score += log_unseen
        test_prob_map[file_no] = score
    return test_prob_map

In [14]:
# Choosing a class: score every test file against both classes and keep the
# class with the higher score.
# The scorer adds log-likelihoods to its first argument, so it must receive
# the log priors (prior_spam / prior_ham), not the raw probabilities.
spam_test_prob_map = calculateTestConditionalProb(prior_spam, spam_prob_map)
ham_test_prob_map = calculateTestConditionalProb(prior_ham, ham_prob_map)

print(spam_test_prob_map.keys())
print(ham_test_prob_map.keys())
predictions_map ={}

for file_no in spam_test_prob_map:
    # Ties are resolved in favour of "Spam".
    if(spam_test_prob_map[file_no] >= ham_test_prob_map[file_no]):
        predictions_map[file_no] = "Spam"
    else:
        predictions_map[file_no] = "Ham"

['24', '25', '26', '27', '20', '21', '22', '23', '28', '29', '4', '8', '59', '58', '55', '54', '57', '56', '51', '50', '53', '52', '88', '89', '82', '83', '80', '81', '86', '87', '84', '85', '3', '7', '100', '39', '38', '33', '32', '31', '30', '37', '36', '35', '34', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '2', '6', '99', '98', '91', '90', '93', '92', '95', '94', '97', '96', '11', '10', '13', '12', '15', '14', '17', '16', '19', '18', '48', '49', '46', '47', '44', '45', '42', '43', '40', '41', '1', '5', '9', '77', '76', '75', '74', '73', '72', '71', '70', '79', '78']
['24', '25', '26', '27', '20', '21', '22', '23', '28', '29', '4', '8', '59', '58', '55', '54', '57', '56', '51', '50', '53', '52', '88', '89', '82', '83', '80', '81', '86', '87', '84', '85', '3', '7', '100', '39', '38', '33', '32', '31', '30', '37', '36', '35', '34', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '2', '6', '99', '98', '91', '90', '93', '92', '95', '94', '97', '96', '11', '10

In [15]:
print(predictions_map)

{'24': 'Spam', '25': 'Spam', '26': 'Ham', '27': 'Ham', '20': 'Spam', '21': 'Spam', '22': 'Ham', '23': 'Spam', '28': 'Spam', '29': 'Ham', '4': 'Ham', '8': 'Ham', '59': 'Ham', '58': 'Spam', '55': 'Spam', '54': 'Ham', '57': 'Ham', '56': 'Ham', '51': 'Ham', '50': 'Spam', '53': 'Ham', '52': 'Ham', '88': 'Ham', '89': 'Ham', '82': 'Spam', '83': 'Ham', '80': 'Ham', '81': 'Spam', '86': 'Ham', '87': 'Ham', '84': 'Spam', '85': 'Spam', '3': 'Ham', '7': 'Ham', '100': 'Ham', '39': 'Ham', '38': 'Spam', '33': 'Ham', '32': 'Spam', '31': 'Spam', '30': 'Ham', '37': 'Spam', '36': 'Spam', '35': 'Spam', '34': 'Ham', '60': 'Ham', '61': 'Ham', '62': 'Ham', '63': 'Spam', '64': 'Ham', '65': 'Spam', '66': 'Spam', '67': 'Ham', '68': 'Spam', '69': 'Spam', '2': 'Ham', '6': 'Ham', '99': 'Ham', '98': 'Ham', '91': 'Ham', '90': 'Ham', '93': 'Ham', '92': 'Ham', '95': 'Ham', '94': 'Ham', '97': 'Ham', '96': 'Ham', '11': 'Ham', '10': 'Spam', '13': 'Spam', '12': 'Ham', '15': 'Spam', '14': 'Spam', '17': 'Spam', '16': 'Ham', 

In [16]:
def print_table(data, cols, wide):
    """Print *data* as a table of fixed-width cells, *cols* cells per row.

    Args:
        data: Sequence of cell values, in row-major order.
        cols: Number of cells in each full row.
        wide: Format width applied to every cell.

    The full rows are printed with one ``print`` call and the leftover cells
    (fewer than *cols*) with a second one, so a line is always emitted for
    the remainder even when it is empty.
    """
    full_rows, leftover = divmod(len(data), cols)
    cell = '{{:{}}}'.format(wide)

    rendered_rows = []
    for row_index in range(full_rows):
        start = row_index * cols
        rendered_rows.append(''.join(cell.format(value) for value in data[start:start + cols]))
    print('\n'.join(rendered_rows))

    tail = data[full_rows * cols:]
    print(''.join(cell.format(value) for value in tail))


# Ground-truth labels: each non-blank line of truth_lables.txt is split on
# whitespace into "<email id> <label>".
truth_lables = {}
# The file is only read, so open it read-only instead of "r+".
with open("truth_lables.txt", "r") as theFile:
    for line in theFile:
        words = line.split()
        if not words:
            continue  # a blank line would otherwise raise IndexError below
        truth_lables[words[0]] = words[1]

# Flat, row-major cell list for print_table: the header first, then one
# (id, prediction, truth) triple per classified email.
data = ['Email ID','Classifier Output','Truth']
import pandas as pd
i=1
for file_no in predictions_map:
    data += [file_no, predictions_map[file_no], truth_lables[file_no]]

print_table(data, 3, 20)

Email ID            Classifier Output   Truth               
24                  Spam                Spam                
25                  Spam                Spam                
26                  Ham                 Ham                 
27                  Ham                 Ham                 
20                  Spam                Spam                
21                  Spam                Spam                
22                  Ham                 Ham                 
23                  Spam                Spam                
28                  Spam                Spam                
29                  Ham                 Ham                 
4                   Ham                 Ham                 
8                   Ham                 Ham                 
59                  Ham                 Ham                 
58                  Spam                Spam                
55                  Spam                Spam                
54                  Ham 

### Confusion Matrix
Thus in binary classification, the count of true negatives is C_{0,0}, false negatives is C_{1,0}, true positives is C_{1,1} and false positives is C_{0,1}.

In [17]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
# Build the predicted and true label lists in the same email order so that
# position k in both lists refers to the same email.
y_pred = [predictions_map[file_no] for file_no in predictions_map]
y_true = [truth_lables[file_no] for file_no in predictions_map]

print(len(y_true))
print(len(y_pred))
confusion_matrix(y_true, y_pred)



100
100


array([[50,  0],
       [ 4, 46]])

### Accuracy

In [18]:
# Accuracy: fraction of test emails whose predicted label equals the truth label.
accuracy = sum(
    predictions_map[file_no] == truth_lables[file_no] for file_no in predictions_map
) / float(len(predictions_map))
print("{0:.4f}".format(accuracy))

0.9600
