# Naive-Bayes-Classifier-Scratch

In [1]:
import numpy as np
from scipy.stats import norm

class NaiveBayesClassifierScratch:
    """Naive Bayes classifier for a mix of discrete and continuous features.

    A column is treated as discrete when the number of distinct values seen
    during ``fit`` is below ``discrete_threshold`` times the number of rows;
    every other column is modelled with one normal distribution per class.

    Attributes populated by ``fit``:
        class_probs: ``{class_label: prior probability}``.
        discrete_probs: ``{f'{cls}_{col}_{val}': P(val | cls)}``.
        continuous_meta: ``{f'{cls}_{col}': {'mean': ..., 'std': ...}}``.
        discrete_cols: list of column indices treated as discrete.
        continuous_cols: list of column indices treated as continuous.

    Note:
        Discrete likelihoods are keyed by the *string form* of the value, so
        the data passed to ``predict`` must format the same way as the
        training data (e.g. both float, or both int).
    """

    # Floor applied to a per-class standard deviation at prediction time.
    # A feature that is constant within a class has std == 0, for which the
    # normal density is undefined (NaN); the floor keeps scores finite.
    _MIN_STD = 1e-9

    def __init__(self, discrete_threshold=0.1):
        """Create an unfitted classifier.

        Args:
            discrete_threshold: A column is discrete when its count of
                distinct values is strictly less than this fraction of the
                number of training rows.
        """
        self.discrete_threshold = discrete_threshold
        self.class_probs = {}
        self.discrete_probs = {}
        self.continuous_meta = {}
        self.discrete_cols = []
        self.continuous_cols = []

    def get_discrete_probs(self, i, j, discrete_col_vals, col_vals):
        """Return the relative frequency of each candidate value in a column.

        Args:
            i: Class label, used only to build the dictionary keys.
            j: Column index, used only to build the dictionary keys.
            discrete_col_vals: Values whose frequency should be reported.
            col_vals: Observed values of column ``j`` for class ``i``.

        Returns:
            ``{f'{i}_{j}_{val}': fraction of col_vals equal to val}``.
        """
        col_vals = np.asarray(col_vals)
        n_rows = len(col_vals)
        return {
            f'{i}_{j}_{val}': np.count_nonzero(col_vals == val) / n_rows
            for val in discrete_col_vals
        }

    def fit(self, X, y):
        """Estimate class priors and per-class feature distributions.

        Any state from a previous call to ``fit`` is discarded first.

        Args:
            X: 2-D array-like of shape (n_rows, n_cols).
            y: 1-D array-like of n_rows class labels.

        Raises:
            ValueError: If ``X`` is not 2-D or ``X`` and ``y`` differ in
                length.
        """
        X = np.array(X)
        y = np.array(y)
        if X.ndim != 2:
            raise ValueError(f'X must be 2-dimensional, got shape {X.shape}')
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f'X and y must have the same number of rows, '
                f'got {X.shape[0]} and {y.shape[0]}'
            )
        n_rows, n_cols = X.shape

        # Start from a clean slate so refitting never keeps stale classes
        # or stale per-feature parameters.
        self.class_probs = {}
        self.discrete_probs = {}
        self.continuous_meta = {}

        # Split columns into discrete / continuous by distinct-value count.
        discrete_cols = {}
        continuous_cols = []
        for col in range(n_cols):
            unique_vals = list(set(X[:, col]))
            if len(unique_vals) < self.discrete_threshold * n_rows:
                discrete_cols[col] = unique_vals
            else:
                continuous_cols.append(col)
        # Plain lists (not a dict_keys view) so the fitted model can be
        # pickled and deep-copied.
        self.discrete_cols = list(discrete_cols)
        self.continuous_cols = continuous_cols

        # np.unique yields the labels in sorted order, which makes the
        # tie-breaking order in predict deterministic.
        for cls in np.unique(y):
            sub_X = X[y == cls]
            self.class_probs[cls] = sub_X.shape[0] / n_rows
            for col in range(n_cols):
                if col in discrete_cols:
                    self.discrete_probs.update(
                        self.get_discrete_probs(
                            cls, col, discrete_cols[col], sub_X[:, col]
                        )
                    )
                else:
                    self.continuous_meta[f'{cls}_{col}'] = {
                        'mean': np.mean(sub_X[:, col]),
                        'std': np.std(sub_X[:, col]),
                    }

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Scores are accumulated as log-probabilities, so the product of many
        small likelihoods cannot underflow to zero. A discrete value that
        was never seen during ``fit`` is given likelihood zero, exactly like
        a value that was seen overall but never within the class. When
        several classes share the best score, the first one in
        ``class_probs`` order wins.

        Args:
            X: 2-D array-like with the same column layout used in ``fit``.

        Returns:
            1-D ``numpy.ndarray`` of predicted labels (empty for empty X).

        Raises:
            ValueError: If the model has not been fitted or ``X`` is not 2-D.
        """
        X = np.array(X)
        if X.shape[0] == 0:
            return np.array([])
        if not self.class_probs:
            raise ValueError('predict called before fit')
        if X.ndim != 2:
            raise ValueError(f'X must be 2-dimensional, got shape {X.shape}')

        n_rows, n_cols = X.shape
        classes = list(self.class_probs)
        scores = np.empty((n_rows, len(classes)))

        # log(0) == -inf is the intended score for an impossible value, so
        # silence the divide-by-zero warning numpy would emit for it.
        with np.errstate(divide='ignore'):
            for k, cls in enumerate(classes):
                total = np.full(n_rows, np.log(self.class_probs[cls]))
                for col in range(n_cols):
                    column = X[:, col]
                    if col in self.discrete_cols:
                        likelihood = np.array(
                            [
                                self.discrete_probs.get(f'{cls}_{col}_{val}', 0.0)
                                for val in column
                            ],
                            dtype=float,
                        )
                        total += np.log(likelihood)
                    else:
                        # Normal distribution, evaluated for all rows at once.
                        meta = self.continuous_meta[f'{cls}_{col}']
                        std = max(float(meta['std']), self._MIN_STD)
                        total += norm.logpdf(
                            column.astype(float), meta['mean'], std
                        )
                scores[:, k] = total

        best = np.argmax(scores, axis=1)
        return np.array([classes[b] for b in best])

In [2]:
# Create an unfitted classifier; its probability tables start out empty.
model = NaiveBayesClassifierScratch()

In [3]:
from sklearn import datasets

# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

In [4]:
# Feature matrix and class labels taken from the loaded dataset.
X, y = iris['data'], iris['target']

In [15]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; random_state pins the shuffle so
# the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [16]:
# Estimate class priors and per-feature parameters from the training split.
model.fit(X_train,y_train)

In [17]:
# Predict one class label per held-out row.
preds = model.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred): ground truth first, predictions
# second. Accuracy is symmetric, so the reported value is unchanged.
accuracy_score(y_test, preds)

0.9777777777777777