# Gaussian Naive Bayes Classifier

This classifier can be fit with or without the maximum likelihood estimate of each attribute's variance, which is used to compute the Gaussian likelihood of a datapoint conditioned on a class. The class priors can also be set in advance. There is also an optional threshold parameter.

In [2]:
class naivebayes:
    """Gaussian naive Bayes classifier.

    Every attribute is modelled, per class, as an independent Gaussian whose
    mean and variance are estimated in ``fit``. Predictions combine those
    per-attribute densities with the class priors.

    Attributes:
        var_mean: maps each class label to a list of ``[variance, mean]``
            pairs, one pair per attribute.
        prob_c: maps each class label to its prior probability.
        attribute_probabilities: list with one array per class holding the
            unnormalised joint probability (likelihood times prior) of every
            datapoint passed to the most recent prediction call.
        t: optional threshold stored by ``fit`` (see ``gauss_predict``).

    Note:
        An attribute whose variance is zero (or NaN) within a class yields
        non-finite densities; no variance smoothing is applied.
    """

    def __init__(self):
        self.var_mean = {}
        self.prob_c = {}
        self.attribute_probabilities = []
        self.t = None

    def fit(self, X, y, mle=True, priors=None, t=None):
        """Estimate class priors and per-class attribute statistics.

        Args:
            X: 2-D array-like (numpy array, pandas DataFrame, nested list)
                with one row per datapoint and one column per attribute.
            y: array-like of class labels, one per row of ``X``.
            mle: when truthy use the maximum likelihood variance
                (divide by n); otherwise use the unbiased sample variance
                (divide by n - 1).
            priors: optional sequence of class priors, ordered like the
                sorted unique labels of ``y``. ``None`` or an empty sequence
                means the priors are estimated from label frequencies.
            t: optional threshold used by ``gauss_predict``.

        Raises:
            ValueError: if ``X`` is not 2-D, if ``X`` and ``y`` have different
                lengths, or if ``priors`` does not have one entry per class.
        """
        import numpy as np

        # Converting up front makes DataFrames/Series/lists behave like arrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (datapoints x attributes)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of datapoints")

        labels = np.unique(y)
        self.t = t

        if priors is None or len(priors) == 0:
            class_priors = [float(np.sum(y == c)) / len(y) for c in labels]
        else:
            if len(priors) != len(labels):
                raise ValueError("priors must contain one entry per class")
            class_priors = list(priors)

        # Rebuild both dictionaries so a refit never keeps stale labels.
        self.prob_c = dict(zip(labels, class_priors))
        self.var_mean = {}

        # ddof=0 is the maximum likelihood variance; ddof=1 the unbiased one.
        ddof = 0 if mle else 1
        for label in labels:
            rows = X[y == label]
            variances = np.var(rows, axis=0, ddof=ddof)
            means = np.mean(rows, axis=0)
            self.var_mean[label] = [[v, m] for v, m in zip(variances, means)]

    def _joint_log_likelihood(self, vectors):
        """Return (classes, log of likelihood times prior) for ``vectors``.

        The second item has shape (n_classes, n_datapoints). Working in log
        space avoids the underflow that multiplying many densities causes.
        """
        import numpy as np

        if not self.prob_c:
            raise RuntimeError("fit must be called before predicting")

        # A 1-D input is treated as a single datapoint.
        data = np.atleast_2d(np.asarray(vectors, dtype=float))
        classes = list(self.prob_c)
        rows = []
        for label in classes:
            stats = np.asarray(self.var_mean[label], dtype=float).reshape(-1, 2)
            if data.shape[1] != stats.shape[0]:
                raise ValueError(
                    "expected %d attributes per datapoint, got %d"
                    % (stats.shape[0], data.shape[1])
                )
            var, mean = stats[:, 0], stats[:, 1]
            log_density = -0.5 * (np.log(2.0 * np.pi * var) + (data - mean) ** 2 / var)
            # A prior of exactly 0 is legitimate; log(0) = -inf is intended.
            with np.errstate(divide="ignore"):
                log_prior = np.log(float(self.prob_c[label]))
            rows.append(log_density.sum(axis=1) + log_prior)
        return classes, np.array(rows)

    @staticmethod
    def _posterior(log_joint):
        """Normalise joint log probabilities over classes (axis 0)."""
        import numpy as np

        # Subtracting the per-datapoint maximum keeps exp() from underflowing
        # for every class at once.
        shifted = log_joint - np.max(log_joint, axis=0, keepdims=True)
        weights = np.exp(shifted)
        return weights / weights.sum(axis=0, keepdims=True)

    def gauss_predict(self, vectors):
        """Predict a class label for every datapoint in ``vectors``.

        Without a threshold each datapoint receives the label with the largest
        joint probability; ties go to the first class. With a threshold ``t``,
        a datapoint whose winning class is the first class but whose
        normalised probability is below ``t`` is assigned to the second class.

        Args:
            vectors: 2-D array-like of datapoints (a 1-D input is treated as
                one datapoint).

        Returns:
            A list of labels, one per datapoint.

        Raises:
            RuntimeError: if the classifier has not been fitted.
            ValueError: if the attribute count differs from the fitted data.
        """
        import numpy as np

        classes, log_joint = self._joint_log_likelihood(vectors)
        self.attribute_probabilities = list(np.exp(log_joint))
        winners = np.argmax(log_joint, axis=0)

        if self.t is None:
            return [classes[k] for k in winners]

        posteriors = self._posterior(log_joint)
        labels = []
        for column, k in enumerate(winners):
            below_threshold = posteriors[k, column] < self.t
            if k == 0 and below_threshold and len(classes) > 1:
                labels.append(classes[1])
            else:
                labels.append(classes[k])
        return labels

    def predict_proba(self, vectors):
        """Return normalised class probabilities for every datapoint.

        Args:
            vectors: 2-D array-like of datapoints (a 1-D input is treated as
                one datapoint).

        Returns:
            Array of shape (n_datapoints, n_classes); columns follow the order
            of the keys of ``prob_c`` and each row sums to 1.

        Raises:
            RuntimeError: if the classifier has not been fitted.
            ValueError: if the attribute count differs from the fitted data.
        """
        import numpy as np

        classes, log_joint = self._joint_log_likelihood(vectors)
        self.attribute_probabilities = list(np.exp(log_joint))
        return self._posterior(log_joint).T.copy()
    