In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
# Loading dataset and getting the labels as y and features as X
X, y = load_iris(return_X_y=True)

In [3]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0
)

### Gaussian Naive Bayes Classifier from Scratch with Python

In [4]:
class GaussianNBClassifier:
    
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.num_classes = len(self.classes)
        self.num_samples = X.shape[0]
        self.num_features = X.shape[1]
        
        # for p(y) 
        self.priors = np.zeros(self.num_classes)
        
        #for p(X|y)
        self.means = np.zeros((self.num_classes, self.num_features))
        self.variances = np.zeros((self.num_classes, self.num_features))
        
        for i, c in enumerate(self.classes):
            
            # caculating prior, mean and variance for each value of y
            X_cls = X[y == c]
            self.priors[i] = X_cls.shape[0] / X.shape[0]
            self.means[i] = X_cls.mean(axis=0)
            self.variances = X_cls.var(axis=0)
        
    
    def predict(self, X):
        
        # Calculating posterior for each value of y. Then sum the log of prior and posterior 
        # instead of multipy them for fixing the small value issues.
        posteriors = np.zeros((X.shape[0], self.num_classes))
        
        for i, c in enumerate(self.classes):
            prior = np.log(self.priors[i])
            posterior = np.sum(np.log(self._Gaussian_pdf(X, self.means[i], self.variances[i])), axis=1)
            posteriors[:, i] = prior + posterior

        return self.classes[np.argmax(posteriors, axis=1)]
    
    
    def _Gaussian_pdf(self, X, mean, variance):
        return (1 / np.sqrt(2 * np.pi * variance)) * np.exp(-((X - mean)  ** 2) / (2 * variance))

In [5]:
clf = GaussianNBClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [6]:
accuracy = np.mean(y_pred == y_test)
print('Accuracy = ', accuracy)

Accuracy =  0.9111111111111111


### Gaussian Naive Bayes Classifier with sklearn

In [7]:
SKL_clf = GaussianNB()
SKL_clf.fit(X_train, y_train)
y_pred_SKL = SKL_clf.predict(X_test)

In [8]:
print('accuracy = ', accuracy_score(y_pred_SKL, y_test)) 

accuracy =  1.0
