First we need to calculate mean and variance for each column and convert it to numPy array for future calculations:

In [36]:
class GaussianNB_Scratch:
  def calc_statistics(self, features, target):
    '''
    calculate mean, variance for each column and convert to numpy array
    ''' 
    self.mean = features.groupby(target).apply(np.mean).to_numpy()
    self.var = features.groupby(target).apply(np.var).to_numpy()
          
    return self.mean, self.var

  def gaussian_density(self, class_idx, x):     
    '''
    calculate probability from gaussian density function (normally distributed)

    '''
    mean = self.mean[class_idx]
    var = self.var[class_idx]
    numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
    denominator = np.sqrt(2 * np.pi * var)
    prob = numerator / denominator
    return prob

  # prior probabilities
  def calc_prior(self, features, target):
    self.prior = (features.groupby(target).apply(lambda x: len(x))/self.rows).to_numpy()
    return self.prior
      
  # posterior probabilities
  def calc_posterior(self, x):
    posteriors = []
    for i in range(self.count):
        prior = np.log(self.prior[i]) 
        conditional = np.sum(np.log(self.gaussian_density(i, x)))
        posterior = prior + conditional
        posteriors.append(posterior)
    return self.classes[np.argmax(posteriors)]

  def fit(self, features, target):
    # define class variables 
    self.classes = np.unique(target)
    self.count = len(self.classes)
    self.feature_nums = features.shape[1]
    self.rows = features.shape[0]
    
    # calculate statistics    
    self.calc_statistics(features, target)
    self.calc_prior(features, target)
        
  def predict(self, features):
    preds = [self.calc_posterior(f) for f in features.to_numpy()]
    return preds


Import Dataset

In [49]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [38]:
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target

In [39]:
X.shape, y.shape

((150, 2), (150,))

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [41]:
nb_classifier = GaussianNB()

In [42]:
nb_classifier.fit(pd.DataFrame(X_train), y_train)

In [50]:
pred = nb_classifier.predict(pd.DataFrame(X_test))

In [51]:
confusion_matrix(y_test, pred)

array([[10,  0,  0],
       [ 0,  7,  2],
       [ 0,  4,  7]])

In [52]:
accuracy_score(y_test, pred)

0.8

In [45]:
from sklearn.naive_bayes import GaussianNB

In [53]:
clf = GaussianNB()
clf.fit(pd.DataFrame(X_train), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [54]:
pred_nb = clf.predict(X_test)

In [55]:
confusion_matrix(y_test, pred_nb)

array([[10,  0,  0],
       [ 0,  7,  2],
       [ 0,  1, 10]])

In [57]:
accuracy_score(y_test, pred_nb)

0.9