In [5]:
#Gaussian Naive Bayes Classifier

import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Tolerance used by Nbayes.predict_own: a feature whose per-class standard
# deviation is <= eps is treated as having zero variance and is ignored.
eps = 1e-9
#for reading and organising data
class getData():
    """Reads the e-mail corpus from disk into parallel text / label lists.

    Expected layout below the directory passed as ``cpath``::

        EmailsData/spam-train      EmailsData/nonspam-train
        EmailsData/spam-test       EmailsData/nonspam-test

    Attributes:
        train_x, test_x: raw e-mail texts (list of str).
        train_y, test_y: labels aligned with the texts, 1 = spam, 0 = non-spam.
    """

    def __init__(self):
        self.train_x = []
        self.train_y = []
        self.test_x = []
        self.test_y = []

    @staticmethod
    def _read_folder(folder):
        """Return the text of every regular file in ``folder``, in sorted filename order."""
        texts = []
        # sorted(): os.listdir returns entries in arbitrary order, which would make
        # the row order of the dataset differ between machines / runs.
        for name in sorted(os.listdir(folder)):
            path = os.path.join(folder, name)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints); open() would fail on them.
            if not os.path.isfile(path):
                continue
            # Explicit encoding so the result does not depend on the platform default;
            # undecodable bytes are dropped instead of aborting the whole load.
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                texts.append(f.read())
        return texts

    def _load(self, cpath, split, texts_out, labels_out):
        """Append the spam (label 1) then non-spam (label 0) e-mails of ``split`` to the given lists."""
        for prefix, label in (("spam", 1), ("nonspam", 0)):
            folder = os.path.join(cpath, "EmailsData", prefix + "-" + split)
            texts = self._read_folder(folder)
            texts_out.extend(texts)
            labels_out.extend([label] * len(texts))

    def getTrainData(self,cpath):
        """Append the training e-mails found under ``cpath`` to train_x / train_y.

        Spam files are appended first, then non-spam files. Calling this more than
        once appends again (the lists are not cleared).

        Raises:
            OSError: if one of the expected folders is missing or unreadable.
        """
        self._load(cpath, "train", self.train_x, self.train_y)

    def getTestData(self,cpath):
        """Append the test e-mails found under ``cpath`` to test_x / test_y.

        Same ordering and error behaviour as getTrainData.
        """
        self._load(cpath, "test", self.test_x, self.test_y)

class Nbayes():
    """Gaussian Naive Bayes spam classifier: a hand-written model next to scikit-learn's.

    Pipeline: TF-IDF vectorisation -> SelectKBest(mutual_info_classif) -> Gaussian NB.
    Labels: 1 = spam, 0 = non-spam.

    Typical use: reduce_train(), reduce_test(), then NBlib() and/or
    train_own() followed by predict_own(). The methods do not modify
    self.train / self.test, so they can be called in any order and repeatedly.
    """

    def __init__(self, k=50):
        """
        Args:
            k: number of best features kept by SelectKBest (default 50).
        """
        self.vec = TfidfVectorizer()
        self.selK = SelectKBest(mutual_info_classif, k = k)
        self.gauss = GaussianNB()
        self.predictions = []
        self.train = []
        self.test = []
        self.train_y = []
        self.test_y = []
        # per-feature Gaussian parameters of the hand-written model
        self.mean_s = []
        self.mean_ns = []
        self.std_s = []
        self.std_ns = []

    @staticmethod
    def _to_dense(matrix):
        """Return ``matrix`` as a plain ndarray, whether it is sparse or already dense."""
        # toarray() gives an ndarray; todense() would give np.matrix, which recent
        # scikit-learn versions refuse as estimator input.
        if hasattr(matrix, "toarray"):
            return matrix.toarray()
        return np.asarray(matrix)

    @staticmethod
    def _log_likelihood(samples, mean, std):
        """Per-row sum of log Gaussian densities, features with std <= eps ignored.

        The constant -0.5*log(2*pi) term is dropped because it is identical for
        both classes. Working in log space avoids the underflow to 0 that the
        product of many small densities suffers from.
        """
        mean = np.asarray(mean, dtype=float)
        std = np.asarray(std, dtype=float)
        usable = std > eps
        # placeholder std of 1 keeps the arithmetic finite for ignored features
        safe_std = np.where(usable, std, 1.0)
        deviation = samples - mean
        per_feature = -(deviation * deviation) / (2.0 * safe_std * safe_std) - np.log(safe_std)
        # an ignored feature contributes log(1) = 0
        return np.where(usable, per_feature, 0.0).sum(axis=1)

    def reduce_train(self,train_x, train_y):
        """Fit the TF-IDF vectoriser and the feature selector on the training texts.

        Stores the reduced feature matrix in self.train and the labels in self.train_y.
        """
        self.train_y = train_y
        self.train = self.vec.fit_transform(train_x)
        #selecting k best features
        self.train = self.selK.fit_transform(self.train,train_y)

    def reduce_test(self, test_x, test_y):
        """Project the test texts with the already fitted vectoriser and selector.

        Must be called after reduce_train. Stores the result in self.test / self.test_y.
        """
        self.test_y = test_y
        test = self.vec.transform(test_x)
        #selecting k best features
        test = self.selK.transform(test)
        self.test = test

    def train_own(self):
        """Estimate the per-feature mean and standard deviation of each class.

        Rows are assigned to a class through self.train_y, so the training data
        does not need a particular order or class size.

        Raises:
            ValueError: if either class has no training sample.
        """
        dense = self._to_dense(self.train)
        labels = np.asarray(self.train_y)
        spam_rows = dense[labels == 1]
        nonspam_rows = dense[labels == 0]
        if len(spam_rows) == 0 or len(nonspam_rows) == 0:
            raise ValueError("train_own needs at least one spam (1) and one non-spam (0) training sample")

        #mean and standard deviation array for spam
        self.mean_s = np.mean(spam_rows, axis = 0)
        self.std_s = np.std(spam_rows, axis = 0)

        #mean and standard deviation array for non-spam
        self.mean_ns = np.mean(nonspam_rows, axis = 0)
        self.std_ns = np.std(nonspam_rows, axis = 0)

    def predict_own(self):
        """Classify self.test with the hand-written model and score the result.

        Requires train_own() to have been called. Class priors are not part of the
        comparison (both classes are treated as equally likely a priori). When both
        classes are equally probable the sample is classified as spam.

        Returns:
            The tuple produced by getScores: (f1, accuracy, tp, fp, tn, fn).
        """
        dense = self._to_dense(self.test)
        log_s = self._log_likelihood(dense, self.mean_s, self.std_s)
        log_ns = self._log_likelihood(dense, self.mean_ns, self.std_ns)
        # ties go to spam (>=)
        self.predictions = np.where(log_s >= log_ns, 1, 0)
        return (self.getScores(self.predictions))

    #Library implementation of Naive bayes
    def NBlib(self):
        """Fit scikit-learn's GaussianNB on self.train and score it on self.test.

        Returns:
            The tuple produced by getScores: (f1, accuracy, tp, fp, tn, fn).
        """
        self.gauss.fit(self._to_dense(self.train),self.train_y)
        self.predictions = self.gauss.predict(self._to_dense(self.test))
        self.predictions = self.predictions.tolist()
        return (self.getScores(self.predictions))

    #calculating true positives, false positives, true negatives, false negatives
    def getScores(self,predict):
        """Compare ``predict`` with self.test_y.

        Args:
            predict: sequence of 0/1 predictions, one per test sample.

        Returns:
            (f1, accuracy, tp, fp, tn, fn); the counts are floats. Precision,
            recall, F1 and accuracy are reported as 0.0 when their denominator
            would be zero.

        Raises:
            ValueError: if ``predict`` and self.test_y differ in length.
        """
        total = len(self.test_y)
        if len(predict) != total:
            raise ValueError("predict has %d entries but there are %d test labels" % (len(predict), total))
        tp = 0.0
        tn = 0.0
        fn = 0.0
        fp = 0.0
        for guess, truth in zip(predict, self.test_y):
            if(guess==1 and truth==1):
                tp+=1.0
            elif(guess==1 and truth==0):
                fp+=1.0
            elif(guess==0 and truth==0):
                tn+=1.0
            elif(guess==0 and truth==1):
                fn+=1.0
        precision = tp/(tp+fp) if (tp+fp) else 0.0
        recall = tp/(tp+fn) if (tp+fn) else 0.0
        # harmonic mean; undefined when either term is 0, reported as 0.0
        f1 = 2/(1/precision + 1/recall) if (precision and recall) else 0.0
        accuracy = (tp+tn)/total if total else 0.0
        return f1,accuracy,tp,fp,tn,fn
        

In [13]:
def main():
    """Run the spam-classification experiment and print the comparison report.

    Loads the e-mail data from ./EmailsData (relative to the current working
    directory), evaluates the scikit-learn and the hand-written Gaussian Naive
    Bayes models on the test split, then prints F1 score and accuracy for both.
    The confusion matrix printed at the end belongs to the hand-written model.
    """
    base_dir = os.getcwd()
    data = getData()
    data.getTrainData(base_dir)
    data.getTestData(base_dir)

    #training of model
    bayes = Nbayes()
    #reducing train data to 700x50 by choosing 50 best features
    bayes.reduce_train(data.train_x,data.train_y)
    bayes.reduce_test(data.test_x,data.test_y)

    #library model: only F1 and accuracy are reported, its confusion counts are not used
    f1_lib,acc_lib,_,_,_,_ = bayes.NBlib()

    bayes.train_own()
    f1_own,acc_own,tp,fp,tn,fn = bayes.predict_own()

    print("F1 score")
    print("Own Implementation: ", f1_own, " Library Implementation: ", f1_lib)
    print("\nAccuracy")
    print("Own Implementation: ", acc_own*100, " Library Implementation: ", acc_lib*100)
    print("\n                             Confusion Matrix")
    print("                      Positive Class   ","   Negative Class   ")
    print("Positive Predicted:    ", tp, "                 ", fp)
    #second row holds the samples predicted as negative (fn, tn)
    print("Negative Predicted:    ", fn, "                 ", tn)
# Run the experiment when this cell is executed.
main()

F1 score
Own Implementation:  0.9230769230769231  Library Implementation:  0.942084942084942

Accuracy
Own Implementation:  92.3076923076923  Library Implementation:  94.23076923076923

                             Confusion Matrix
                      Positive Class       Negative Class   
Positive Predicted:     120.0                   10.0
Positive Predicted:     10.0                   120.0


## Inferences:
1. Both the F1 score and the accuracy are higher for the scikit-learn implementation than for our own implementation.