In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
irisDataset = pd.read_csv('/content/drive/MyDrive/MachineLearning/datasets/IRIS.csv')
irisDataset.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


#Naive Bayes#

In [4]:
X = irisDataset
y = irisDataset.iloc[:,-1]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, shuffle=True)
print('N.samples in training set: ', len(X_train))
print('N.samples in test set: ', len(X_test))

N.samples in training set:  120
N.samples in test set:  30


In [6]:
#compute the mean and the variance from the subset divided by class
def get_mean_var(X):
  means = X.groupby(["species"]).mean()
  vars = X.groupby(["species"]).var()
  return means, vars

In [7]:
# Gaussian Probability Density Function
#calculate probability from gaussian density function
#used to compute the probability of a real valued input
def GPDF(x,mean,var):
  sd = np.sqrt(var)
  prob = (np.exp(-1/2 * ((x - mean)/sd)** 2)) / (sd * np.sqrt(2 * np.pi))
  return prob

#prior probability of P(y)
def prior_probability(X):
  prior = (X.groupby("species").count() / X.shape[0]).iloc[:,1]
  return prior

In [8]:
def fit(X):

  preds = []
  means,vars = get_mean_var(X)
  classes = np.unique(X["species"].tolist())
  prior = prior_probability(X)

  for i in X.index:
    cls_likelihood = []
    instance = X.loc[i]
    for cls in classes:
      fts_likelihoods = []
      fts_likelihoods.append(np.log(prior[cls]))
      for col in X.iloc[:,:-1].columns:
        data = instance[col]
        # find the mean of column 'col' that are in class 'cls'
        mean = means[col].loc[cls]
        # find the variance of column 'col' that are in class 'cls'
        var = vars[col].loc[cls]      
        likelihood = GPDF(data, mean, var)
        
        if likelihood != 0:
          likelihood = np.log(likelihood)
        else:
          likelihood = 1/X.shape[0]

        fts_likelihoods.append(likelihood)

      # Calculate posterior
      tot_like = np.sum(fts_likelihoods)
      cls_likelihood.append(tot_like)
    # find largest posterior position
    max_idx = cls_likelihood.index(np.max(cls_likelihood))
    y_pred = classes[max_idx]
    preds.append(y_pred)
  return preds

In [9]:
#Naive bayes accuracy computation 
def accuracy(preds,y):
  y_true = y.values
  tot_correct = np.sum(preds == y_true)
  acc = tot_correct / y_true.shape[0]
  return acc

In [10]:
train_predictions = fit(X_train)
#accuracy on training set
train_acc = accuracy(train_predictions,y_train)
print('Training accuracy: ', train_acc)

Training accuracy:  0.9583333333333334


**Test Phase**

In [11]:
test_predictions = fit(X_test)
#accuracy on training set
test_acc = accuracy(test_predictions,y_test)
print('Test accuracy: ', test_acc)

Test accuracy:  0.9666666666666667


*Comparison with Sklearn model*

In [12]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X,y, test_size=0.2, shuffle=True)
X_train1 = X_train1.drop("species", axis = 1)
X_test1 = X_test1.drop("species", axis = 1)

In [13]:
from sklearn.naive_bayes import GaussianNB

sk_naiveBayes = GaussianNB()
sk_naiveBayes.fit(X_train1, y_train1)
sk_train_preds = sk_naiveBayes.predict(X_train1)
sk_test_preds = sk_naiveBayes.predict(X_test1)

#train accuracy
sk_train_acc = accuracy(sk_train_preds,y_train1)
print('Sk learn Naive Bayes training accuracy: ', sk_train_acc)

#test accuracy
sk_test_acc = accuracy(sk_test_preds,y_test1)
print('Sk learn Naive Bayes test accuracy: ', sk_test_acc)

Sk learn Naive Bayes training accuracy:  0.9666666666666667
Sk learn Naive Bayes test accuracy:  0.9333333333333333
