## Implemented Naive Bayes from Scratch

#### The idea is to build a two-level dictionary of counts, from which we compute the class probabilities and classify each test data point.

In [2]:
#Necessary Imports
import numpy as np
from sklearn import datasets
from sklearn import model_selection
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
#This function constructs a two level dictionary 
#result[class_names][feature_num][feature_value]=no. of items
def fit(X_train,Y_train):
    """Count, per class, how often each value of each feature occurs.

    Parameters
    ----------
    X_train : 2-D array-like of discrete feature values, one row per sample.
    Y_train : 1-D array-like of class labels, one per row of ``X_train``.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training samples.
        For every class label ``c``:
        ``result[c]["total_count"]`` is the number of samples of class ``c`` and
        ``result[c][j][v]`` is the number of class-``c`` samples whose feature
        ``j`` equals ``v``.  Every value of feature ``j`` seen anywhere in the
        training set gets an entry for every class (possibly 0).
    """
    # Accept plain lists as well as arrays: the boolean masks below need
    # element-wise comparison, which a Python list does not provide.
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)

    num_features = X_train.shape[1]
    # The set of values per feature is taken over the whole training set, so
    # it does not depend on the class and is computed once up front.
    feature_values = [set(X_train[:, j]) for j in range(num_features)]

    # Set outside the class loop so the key exists even with no classes.
    result = {"total_data": len(Y_train)}
    for current_class in set(Y_train):
        X_train_current = X_train[Y_train == current_class]
        class_counts = {"total_count": len(X_train_current)}
        for j, all_possible_values in enumerate(feature_values):
            column = X_train_current[:, j]
            class_counts[j] = {
                current_value: (column == current_value).sum()
                for current_value in all_possible_values
            }
        result[current_class] = class_counts
    return result

In [20]:
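#Read this before the next cell: a reference version WITHOUT logs, kept as an inert string (it is not executed).
#It multiplies the raw probabilities together; the next cell adds their logs instead.
#Note that it looks up the count with index [j] where the next cell uses [j-1].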
"""def probability(dictionary,x,current_class):
    output = dictionary[current_class]["total_count"]/dictionary["total_data"]
    num_features = len(dictionary[current_class].keys())-1
    for j in range(1,num_features+1):
        xj = x[j-1]
        count_current_class_with_value_xj = dictionary[current_class][j][xj]+1 #LaPlace Correction
        count_current_class = dictionary[current_class]["total_count"] + len(dictionary[current_class][j-1].keys())
        current_xj_prob = count_current_class_with_value_xj/count_current_class
        output = output*current_xj_prob
    return output"""

'def probability(dictionary,x,current_class):\n    output = dictionary[current_class]["total_count"]/dictionary["total_data"]\n    num_features = len(dictionary[current_class].keys())-1\n    for j in range(1,num_features+1):\n        xj = x[j-1]\n        count_current_class_with_value_xj = dictionary[current_class][j][xj]+1 #LaPlace Correction\n        count_current_class = dictionary[current_class]["total_count"] + len(dictionary[current_class][j-1].keys())\n        current_xj_prob = count_current_class_with_value_xj/count_current_class\n        output = output*current_xj_prob\n    return output'

In [4]:
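#Calculating the score for the current class: log(prior) + sum over features j of
#log((count of current class with value x[j] + 1)/(count of current class + no. of distinct values of feature j))
#Multiplying many probabilities smaller than 1 can underflow to zero, so we add their logs instead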
def probability(dictionary,x,current_class):
    """Return the unnormalised log-probability of ``current_class`` for point ``x``.

    The score is ``log P(class) + sum_j log P(x[j] | class)``, where each
    conditional uses Laplace (add-one) smoothing.  Logs are summed rather
    than multiplying probabilities.

    Parameters
    ----------
    dictionary : dict of counts laid out as
        ``dictionary["total_data"]``, ``dictionary[c]["total_count"]`` and
        ``dictionary[c][j][value]``.
    x : sequence of feature values, indexed by feature number.
    current_class : key of the class to score.

    Returns
    -------
    float
        The log-score; larger means more likely.
    """
    class_counts = dictionary[current_class]
    class_total = class_counts["total_count"]
    # Log prior: probability of choosing this class at all.
    output = np.log(class_total)-np.log(dictionary["total_data"])
    # Every key except "total_count" is a feature index.
    num_features = len(class_counts)-1
    for j in range(num_features):
        value_counts = class_counts[j]
        # Laplace correction.  A value that never occurred in training has no
        # entry at all; treat it as a zero count instead of raising KeyError.
        count_current_class_with_value_xj = value_counts.get(x[j], 0)+1
        count_current_class = class_total + len(value_counts)
        current_xj_prob = np.log(count_current_class_with_value_xj)-np.log(count_current_class)
        output = output+current_xj_prob
    return output

In [5]:
#This function calculates the probability for each class and then returns the class with best probability
def predictSinglePoint(dictionary,x):
    """Return the class whose score from ``probability`` is highest for ``x``.

    The "total_data" bookkeeping key is skipped.  On a tie the class met
    first while iterating the dictionary wins.  If the dictionary holds no
    class at all, -1 is returned.
    """
    winner = -1
    winner_score = None  # None until the first class has been scored
    for label in dictionary:
        if label == "total_data":
            continue
        score = probability(dictionary, x, label)
        if winner_score is None or score > winner_score:
            winner, winner_score = label, score
    return winner

In [6]:
#This function broadly predicts the class for every datapoint x in x_test
def predict(dictionary,X_test):
    """Return a list with the predicted class of every row of ``X_test``.

    Each row is classified on its own by ``predictSinglePoint``; the result
    keeps the row order of ``X_test``.
    """
    return [predictSinglePoint(dictionary, row) for row in X_test]

In [7]:
#Labelling the continuous data to discrete values
def makeLabel(column):
    """Replace each value of ``column`` in place by a bucket label 0, 1, 2 or 3.

    The cut-offs are 0.5, 1.0 and 1.5 times the mean of the column, computed
    before anything is overwritten.  They are tested in that order and the
    first match wins:

        value < 0.5*mean  -> 0
        value < mean      -> 1
        value < 1.5*mean  -> 2
        otherwise         -> 3

    Parameters
    ----------
    column : numpy array (it must provide ``.mean()``); it is modified in place.

    Returns
    -------
    The same ``column`` object, now holding the labels.
    """
    second_limit = column.mean()
    first_limit = 0.5*second_limit
    third_limit = 1.5*second_limit
    # np.select picks the choice of the first condition that holds, which
    # mirrors the if/elif chain; values matching none of them get 3.
    labels = np.select(
        [column < first_limit, column < second_limit, column < third_limit],
        [0, 1, 2],
        default=3,
    )
    # Write through the existing buffer so views such as X[:, i] are updated.
    column[:] = labels
    return column

In [8]:
#Loading the iris dataset: X takes the feature matrix, Y the class labels
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [9]:
#Discretising each feature column in turn; the labelled column is written back into X
for i in range(X.shape[1]):
    X[:, i] = makeLabel(X[:, i])

In [10]:
#Splitting the dataset: 25% of the rows are held out for testing,
#and random_state = 0 makes the split reproducible from run to run
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,Y,test_size = 0.25,random_state = 0)

In [11]:
#Getting the dictionary of counts using the fit function that we made above
dictionary = fit(X_train,Y_train)

In [12]:
#Getting the predicted values of y for the test rows
y_pred = predict(dictionary,X_test)

In [13]:
#Some analysis of the classification: per-class precision/recall/f1-score,
#followed by the confusion matrix (rows = true class, columns = predicted class)
print(classification_report(Y_test,y_pred))
print(confusion_matrix(Y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

    accuracy                           0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


In [14]:
#Importing and loading the dataset again to test with the inbuilt Gaussian Naive Bayes Classifier.
#The earlier cells overwrote the columns of X in place with discrete labels, so the data is
#loaded afresh here (presumably restoring the original continuous measurements - confirm).
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
Y = iris.target

In [15]:
from sklearn import model_selection
#Same test_size and random_state as the earlier split
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,Y,test_size = 0.25,random_state = 0)

In [16]:
#Probability for continuous data:
#Assuming that the values of each feature within a class follow a Gaussian curve, the Gaussian
#Naive Bayes method estimates a mean and variance and uses them to find the probability
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train,Y_train)
y_pred = clf.predict(X_test)
print(classification_report(Y_test,y_pred))
print(confusion_matrix(Y_test,y_pred))
#We get 100% accuracy on this test split!!

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        16
           2       1.00      1.00      1.00         9

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38

[[13  0  0]
 [ 0 16  0]
 [ 0  0  9]]
