In [1]:
import numpy as np
import pandas as pd

In [45]:
class NaiveBayesClassifier():
    def fit(self, X, y):
        ''' Calculate prior for each class '''
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)
        target_name = y.columns[0]
        
        self._means = np.zeros((n_classes, n_features-1), dtype=np.float64)
        self._vars = np.zeros((n_classes, n_features-1), dtype=np.float64)
        self._priors = np.zeros((n_classes), dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X.query(f"{target_name} == {c}").drop(f"{target_name}", axis=1)
            self._means[idx, :] = X_c.mean(axis=0)
            self._vars[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

        
        print(self._means)
        print(self._vars)
        print(self._priors)
        
    def predict(self, X):
        ''' Calculate posterior for each class given inputs '''
        y_pred = [self._predict_one(x) for x in X.values] 
        return np.array(y_pred)
        
    def _predict_one(self, x):
        posteriors = [] # the posterior probabilities for each class according to this datapoint (x)

        # calculate the posteriors for all classes on this datapoint
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._calculate_likelyhood_gaussian(idx, x)))
            posterior = posterior + prior
            posteriors.append(posterior)
            print(posterior)

        y_hat = np.argmax(posteriors)
        return self._classes[y_hat] # return the class with the highest posterior prob

    def _calculate_likelyhood_gaussian(self, class_idx, x):
        ''' compute the likelyhoods ( P(x_0|y), P(x_1|y), ...,  P(x_n|y)) using gaussian approximation '''
        mean = self._means[class_idx]
        var = self._vars[class_idx]
        # print(f"mean: {mean} var: {var}")
        numerator = np.exp(-((x[class_idx] - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        print(f"NUM: {numerator} DENOM: {denominator}")
        return numerator / denominator

The classic Iris classification dataset is a good example of the type of dataset that the Gaussian Naive Bayes algorithm works on. It's inputs are all numerical features.

In [46]:
import sklearn.datasets
features = ['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
data = sklearn.datasets.load_iris(as_frame=True)
iris_df = data.frame

In [47]:
from sklearn.model_selection import train_test_split
X, y = iris_df.iloc[:,:-1], iris_df.iloc[:,-1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

In [48]:
nb_model = NaiveBayesClassifier()
nb_model.fit(pd.concat([X_train, y_train], axis=1), y_train)

[[4.95294118 3.39411765 1.41764706 0.25588235]
 [5.99117647 2.80294118 4.30588235 1.37058824]
 [6.61891892 2.95675676 5.58648649 2.00810811]]
[[0.10499109 0.14905526 0.02270945 0.0134492 ]
 [0.30022282 0.10453654 0.23814617 0.03729055]
 [0.44046547 0.11918919 0.34064565 0.07687688]]
[0.32380952 0.32380952 0.35238095]


In [49]:
y_pred = nb_model.predict(X_test)

NUM: [7.01001867e-002 1.79445265e-008 4.43829585e-176 0.00000000e+000] DENOM: [0.81220592 0.9677509  0.37774021 0.29069538]
NUM: [1.47486189e-02 5.03273555e-06 9.81573780e-01 3.62495999e-54] DENOM: [1.37344661 0.81044584 1.22324016 0.48404902]
NUM: [1.20751001e-13 1.36061659e-04 2.26388494e-11 1.86534604e-01] DENOM: [1.66358833 0.86538301 1.46298999 0.69500479]
[np.float64(-inf), np.float64(-140.19718911283812), np.float64(-66.262022434772)]
NUM: [8.78865889e-008 1.26122933e-017 9.82061475e-278 0.00000000e+000] DENOM: [0.81220592 0.9677509  0.37774021 0.29069538]
NUM: [4.30853777e-08 9.99958625e-01 8.55608060e-03 1.26503664e-12] DENOM: [1.37344661 0.81044584 1.22324016 0.48404902]
NUM: [2.33853041e-02 6.45821596e-07 4.03360567e-01 9.61877640e-23] DENOM: [1.66358833 0.86538301 1.46298999 0.69500479]
[np.float64(-inf), np.float64(-49.82784316851981), np.float64(-71.03613652700781)]
NUM: [1.12171409e-005 8.84590175e-015 1.02594837e-247 0.00000000e+000] DENOM: [0.81220592 0.9677509  0.3777

  posterior = np.sum(np.log(self._calculate_likelyhood_gaussian(idx, x)))


In [7]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_pred, y_test))

0.13333333333333333
