In [1]:
# Importing library
import math
import random
import csv

In [3]:
# the categorical class names are changed to numberic data
# eg: yes and no encoded to 1 and 0
def encode_class(mydata):
  classes = []
  for i in range(len(mydata)):
      if mydata[i][-1] not in classes:
          classes.append(mydata[i][-1])
  for i in range(len(classes)):
      for j in range(len(mydata)):
          if mydata[j][-1] == classes[i]:
              mydata[j][-1] = i
  return mydata

In [8]:
# add the data path in your system
filename = '/content/drive/MyDrive/Dataset/NaiveBayesDataset/filedata.csv'
 
# load the file and store it in mydata list
mydata = csv.reader(open(filename, "rt"))
mydata = list(mydata)
mydata = encode_class(mydata)
for i in range(len(mydata)):
    mydata[i] = [float(x) for x in mydata[i]]

In [12]:
mydata[:5]

[[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 0.0],
 [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 1.0],
 [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 0.0],
 [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 1.0],
 [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 0.0]]

In [15]:
# Splitting the data
def splitting(mydata, ratio):
  train_num = int(len(mydata) * ratio)
  train = []
  # initially testset will have all the dataset
  test = list(mydata)
  while len(train) < train_num:
      # index generated randomly from range 0
      # to length of testset
      index = random.randrange(len(test))
      # from testset, pop data rows and put it in train
      train.append(test.pop(index))
  return train, test

In [40]:
# Group the data rows under each class yes or
# no in dictionary eg: dict[yes] and dict[no]
def groupUnderClass(mydata):
  dict = {}
  for i in range(len(mydata)):
      if (mydata[i][-1] not in dict):
          dict[mydata[i][-1]] = []
      dict[mydata[i][-1]].append(mydata[i])
  return dict

In [41]:
# Calculating Mean
def mean(numbers):
  return sum(numbers) / float(len(numbers))

In [42]:
# Calculating Standard Deviation
def std_dev(numbers):
  avg = mean(numbers)
  variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
  return math.sqrt(variance)

In [43]:
def MeanAndStdDev(mydata):
  info = [(mean(attribute), std_dev(attribute)) for attribute in zip(*mydata)]
  # eg: list = [ [a, b, c], [m, n, o], [x, y, z]]
  # here mean of 1st attribute =(a + m + x)/3, mean of 2nd attribute = (b + n + y)/3
  # delete summaries of last class
  del info[-1]
  return info

In [44]:
# find Mean and Standard Deviation under each class
def MeanAndStdDevForClass(mydata):
  info = {}
  dict = groupUnderClass(mydata)
  for classValue, instances in dict.items():
      info[classValue] = MeanAndStdDev(instances)
  return info

In [45]:
# Calculate Gaussian Probability Density Function
def calculateGaussianProbability(x, mean, stdev):
  expo = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
  return (1 / (math.sqrt(2 * math.pi) * stdev)) * expo

In [46]:
# Calculate Class Probabilities
def calculateClassProbabilities(info, test):
  probabilities = {}
  for classValue, classSummaries in info.items():
      probabilities[classValue] = 1
      for i in range(len(classSummaries)):
          mean, std_dev = classSummaries[i]
          x = test[i]
          probabilities[classValue] *= calculateGaussianProbability(x, mean, std_dev)
  return probabilities

In [47]:
# Make prediction - highest probability is the prediction
def predict(info, test):
  probabilities = calculateClassProbabilities(info, test)
  bestLabel, bestProb = None, -1
  for classValue, probability in probabilities.items():
      if bestLabel is None or probability > bestProb:
          bestProb = probability
          bestLabel = classValue
  return bestLabel

In [48]:
# returns predictions for a set of examples
def getPredictions(info, test):
    predictions = []
    for i in range(len(test)):
        result = predict(info, test[i])
        predictions.append(result)
    return predictions

In [49]:
# Accuracy score
def accuracy_rate(test, predictions):
  correct = 0
  for i in range(len(test)):
      if test[i][-1] == predictions[i]:
          correct += 1
  return (correct / float(len(test))) * 100.0

In [50]:
# load the file and store it in mydata list
mydata = csv.reader(open(filename, "rt"))
mydata = list(mydata)
mydata = encode_class(mydata)
for i in range(len(mydata)):
  mydata[i] = [float(x) for x in mydata[i]]

In [51]:
# split ratio = 0.7
# 70% of data is training data and 30% is test data used for testing
ratio = 0.7
train_data, test_data = splitting(mydata, ratio)
print('Total number of examples are: ', len(mydata))
print('Out of these, training examples are: ', len(train_data))
print("Test examples are: ", len(test_data))

Total number of examples are:  768
Out of these, training examples are:  537
Test examples are:  231


In [52]:
# prepare model
info = MeanAndStdDevForClass(train_data)
info

{0.0: [(5.098958333333333, 3.8932884871696976),
  (140.078125, 32.70475797757457),
  (70.30729166666667, 21.60435163064269),
  (22.947916666666668, 18.125721385415556),
  (104.19791666666667, 137.81352601886414),
  (35.37604166666666, 7.103376452394511),
  (0.5315260416666668, 0.35369583369581603),
  (36.583333333333336, 11.003251938796152)],
 1.0: [(3.3420289855072465, 3.082980558982553),
  (109.73913043478261, 25.558103078330927),
  (68.05507246376811, 18.543001063460604),
  (19.61449275362319, 14.696515915249641),
  (62.947826086956525, 86.29036064206197),
  (30.201449275362318, 7.731145345846974),
  (0.42190144927536266, 0.2859804771007582),
  (31.35072463768116, 12.136884288366094)]}

In [53]:
# test model
predictions = getPredictions(info, test_data)
accuracy = accuracy_rate(test_data, predictions)
print("Accuracy of your model is: ", accuracy)

Accuracy of your model is:  71.86147186147186


---

First we need to calculate mean and variance for each column and convert it to numPy array for future calculations:

In [36]:
class GaussianNB_Scratch:
  def calc_statistics(self, features, target):
    '''
    calculate mean, variance for each column and convert to numpy array
    ''' 
    self.mean = features.groupby(target).apply(np.mean).to_numpy()
    self.var = features.groupby(target).apply(np.var).to_numpy()
          
    return self.mean, self.var

  def gaussian_density(self, class_idx, x):     
    '''
    calculate probability from gaussian density function (normally distributed)

    '''
    mean = self.mean[class_idx]
    var = self.var[class_idx]
    numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
    denominator = np.sqrt(2 * np.pi * var)
    prob = numerator / denominator
    return prob

  # prior probabilities
  def calc_prior(self, features, target):
    self.prior = (features.groupby(target).apply(lambda x: len(x))/self.rows).to_numpy()
    return self.prior
      
  # posterior probabilities
  def calc_posterior(self, x):
    posteriors = []
    for i in range(self.count):
        prior = np.log(self.prior[i]) 
        conditional = np.sum(np.log(self.gaussian_density(i, x)))
        posterior = prior + conditional
        posteriors.append(posterior)
    return self.classes[np.argmax(posteriors)]

  def fit(self, features, target):
    # define class variables 
    self.classes = np.unique(target)
    self.count = len(self.classes)
    self.feature_nums = features.shape[1]
    self.rows = features.shape[0]
    
    # calculate statistics    
    self.calc_statistics(features, target)
    self.calc_prior(features, target)
        
  def predict(self, features):
    preds = [self.calc_posterior(f) for f in features.to_numpy()]
    return preds


Import Dataset

In [49]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [38]:
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target

In [39]:
X.shape, y.shape

((150, 2), (150,))

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [41]:
nb_classifier = GaussianNB()

In [42]:
nb_classifier.fit(pd.DataFrame(X_train), y_train)

In [50]:
pred = nb_classifier.predict(pd.DataFrame(X_test))

In [51]:
confusion_matrix(y_test, pred)

array([[10,  0,  0],
       [ 0,  7,  2],
       [ 0,  4,  7]])

In [52]:
accuracy_score(y_test, pred)

0.8

In [45]:
from sklearn.naive_bayes import GaussianNB

In [53]:
clf = GaussianNB()
clf.fit(pd.DataFrame(X_train), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [54]:
pred_nb = clf.predict(X_test)

In [55]:
confusion_matrix(y_test, pred_nb)

array([[10,  0,  0],
       [ 0,  7,  2],
       [ 0,  1, 10]])

In [57]:
accuracy_score(y_test, pred_nb)

0.9