Reference :

https://www.kaggle.com/anniepyim/essential-classification-algorithms-explained

In [117]:
import numpy as np

from scipy.stats import norm
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [118]:
# import iris dataset 
iris = datasets.load_iris()
X = iris.data  
Y = iris.target

# stratify dataset accoring to flower categories
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=4, stratify=Y )

# verify
(unique, counts) = np.unique(y_train, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)


[[ 0 40]
 [ 1 40]
 [ 2 40]]


In [121]:
NO_OF_CATEGORIES = len(unique)
NO_OF_FEATURE = X.shape[1]
NO_OF_TEST_DATA = y_test.size

In [124]:
likelihood_pdf_params = np.empty(shape = (3, 4, 2)) # (categories, features, distribution parameters : mean, variance)
prior_probs = np.empty(shape = (3, 1)) # (categories, prior probability)

for c_index in range(0, NO_OF_CATEGORIES) :

  c_data = np.array([x for x in X[np.where(Y==c_index)]]) # (50, 4)
  c_data_cnt = c_data.shape[0] 
  prior_probs[c_index] = c_data_cnt/Y.size

  for f_index in range(0, NO_OF_FEATURE) :
    c_f_data = c_data[:, f_index] # (4,)
    mean = np.mean(c_f_data) # ()
    var = np.var(c_f_data) # ()
    likelihood_pdf_params[c_index][f_index][0] = mean
    likelihood_pdf_params[c_index][f_index][1] = var

print(likelihood_pdf_params)
print(prior_probs)

[[[5.006    0.121764]
  [3.428    0.140816]
  [1.462    0.029556]
  [0.246    0.010884]]

 [[5.936    0.261104]
  [2.77     0.0965  ]
  [4.26     0.2164  ]
  [1.326    0.038324]]

 [[6.588    0.396256]
  [2.974    0.101924]
  [5.552    0.298496]
  [2.026    0.073924]]]
[[0.33333333]
 [0.33333333]
 [0.33333333]]


In [128]:
posterior_probs = np.empty(shape = (NO_OF_TEST_DATA, NO_OF_CATEGORIES)) # (30, 3) = (test data : 20%, categories : 3)

for index in range(0, NO_OF_TEST_DATA) :
  data = x_test[index] # (4,) : Features
  label = y_test[index] # ()

  for c_index in range(0, NO_OF_CATEGORIES) : 
    likelihood_params = likelihood_pdf_params[c_index] # (4, 2) = (featues , dist info = mean, var)
    likelihood_means = likelihood_params[:,0] # (4,) = Features
    likelihood_var = likelihood_params[:,1] # (4,) = Features

    """Naive Bayes"""
    independent_posterior_probs = norm.pdf(x = data, loc = likelihood_means, scale = likelihood_var) # (4, 1)
    p_c = prior_probs[c_index]
    posterior_probs[index][c_index] = np.prod(independent_posterior_probs)*p_c
    
number_of_success = sum(posterior_probs.argmax(axis=1) == y_test)
accuracy = (number_of_success/y_test.size)*100
print(accuracy)


    



96.66666666666667
