In [39]:
import numpy as np

In [40]:
# fit() takes the training data and builds the dictionary of per-class counts that prediction uses

In [41]:
def fit(X_train, Y_train):
    """Build the count table used by the categorical naive Bayes classifier.

    Parameters
    ----------
    X_train : array-like, shape (n_samples, n_features)
        Training features; every column is treated as a categorical feature.
        Lists are accepted and converted to numpy arrays.
    Y_train : array-like, shape (n_samples,)
        Class label of each training row.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training rows. For every
        distinct class ``c`` in ``Y_train``, ``result[c]["total_count"]`` is
        the number of rows of that class and ``result[c][j][v]`` is how many
        of those rows have value ``v`` in feature ``j`` (features are numbered
        from 1). Every value that occurs anywhere in column ``j`` gets an
        entry, with a count of 0 when the class never takes that value.
    """
    # Boolean-mask indexing and column slicing below need real arrays.
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)

    # Stored once up front so the key exists even when there are no classes.
    result = {"total_data": len(Y_train)}
    if len(Y_train) == 0:
        return result

    # The distinct values of each column do not depend on the class, so
    # compute them once instead of once per class.
    num_features = X_train.shape[1]
    feature_values = [set(X_train[:, j]) for j in range(num_features)]

    for current_class in set(Y_train):
        # Rows of X_train that belong to the current class.
        class_rows = X_train[Y_train == current_class]
        class_counts = {"total_count": len(class_rows)}

        for j, values in enumerate(feature_values, start=1):
            column = class_rows[:, j - 1]
            # (column == value) is a boolean array; its sum is the count.
            class_counts[j] = {value: (column == value).sum() for value in values}

        result[current_class] = class_counts

    return result
                

In [51]:
def probability(dictionary, x, current_class):
    """Return the log of (class prior * product of smoothed feature likelihoods).

    Parameters
    ----------
    dictionary : dict
        Count table in the layout produced by ``fit``: ``"total_data"`` at the
        top level and, per class, ``"total_count"`` plus one
        ``{value: count}`` dict for each feature number ``1..n``.
    x : sequence
        Feature values of a single data point; ``x[j - 1]`` is feature ``j``.
    current_class : hashable
        Class whose score is computed.

    Returns
    -------
    float
        Unnormalised log-probability of ``current_class`` given ``x``. Logs are
        summed instead of multiplying raw probabilities to avoid underflow.
    """
    class_counts = dictionary[current_class]
    class_total = class_counts["total_count"]

    # log prior: rows in this class / all rows
    output = np.log(class_total) - np.log(dictionary["total_data"])

    # Every key except "total_count" is a feature number.
    num_features = len(class_counts) - 1
    for j in range(1, num_features + 1):
        value_counts = class_counts[j]
        # Add-one (Laplace) smoothing. A value that never appeared in the
        # training data has no entry at all, so treat its count as 0 rather
        # than raising KeyError.
        smoothed_count = value_counts.get(x[j - 1], 0) + 1
        smoothed_total = class_total + len(value_counts)

        output = output + (np.log(smoothed_count) - np.log(smoothed_total))
    return output


# Multiplying many small probabilities drives the product towards 0 (floating-point underflow),
# so we work with logs instead: the product of probabilities becomes a sum of log-probabilities.

In [52]:
def predictSinglePoint(dictionary, x):
    """Return the class with the highest log-probability score for one point.

    Iterates over every key of ``dictionary`` except ``"total_data"``, scores
    it with ``probability`` and keeps the first class that attains the largest
    score. Returns -1 when the dictionary holds no class at all.
    """
    winner = -1
    winner_score = None  # None until the first class has been scored
    for label in dictionary:
        if label == "total_data":
            # bookkeeping entry, not a class
            continue
        score = probability(dictionary, x, label)
        if winner_score is None or score > winner_score:
            winner, winner_score = label, score
    return winner

In [53]:
def predict(dictionary, X_test):
    """Return a list with the predicted class of every row in ``X_test``.

    Each row is classified independently by ``predictSinglePoint`` using the
    count table ``dictionary``.
    """
    return [predictSinglePoint(dictionary, row) for row in X_test]

In [54]:
def makeLabelled(column):
    """Replace each entry of ``column`` by a bin label 0-3, in place.

    The cut points are 0.5x, 1x and 1.5x the column mean, computed before any
    entry is overwritten. An entry gets the index of the first cut point it is
    strictly below, or 3 when it is below none of them. The same ``column``
    object is modified and returned.
    """
    centre = column.mean()
    cut_points = (0.5 * centre, centre, 1.5 * centre)
    for position, value in enumerate(column):
        # Index of the first threshold the value falls under; 3 if none.
        column[position] = next(
            (label for label, cut in enumerate(cut_points) if value < cut),
            3,
        )
    return column

In [55]:
# Load the iris dataset that ships with scikit-learn.
from sklearn import datasets
iris = datasets.load_iris()
# X and Y are plain references to the arrays held by `iris` (no copy is made),
# so any in-place change to X is also visible through iris.data.
X = iris.data
Y = iris.target

In [56]:
# Convert every feature from continuous values to bin labels.
# makeLabelled overwrites the array it receives, so each raw column of
# iris.data is copied first and X is rebuilt from the labelled copies. The raw
# data stays untouched and re-running this cell always gives the same X
# (labelling in place would re-bin already binned values on a second run).
X = np.column_stack(
    [makeLabelled(iris.data[:, i].copy()) for i in range(iris.data.shape[-1])]
)

In [57]:
# Split the features and labels into training and test sets.
# test_size = 0.25 keeps a quarter of the rows for testing; the fixed
# random_state makes the split the same on every run.
from sklearn import model_selection 
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size = 0.25, random_state = 0)


In [58]:
# Build the per-class count table from the training split.
dictionary = fit(X_train, Y_train)

In [59]:
# Classify every test row with the hand-written naive Bayes model.
Y_pred = predict(dictionary, X_test)

In [60]:
# Compare the hand-written model's predictions with the true test labels.
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Y_test, Y_pred))
print(confusion_matrix(Y_test, Y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       0.94      1.00      0.97        16
          2       1.00      0.89      0.94         9

avg / total       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


In [61]:
# Naive Bayes works really well on iris (it is a small and simple dataset),
# but that may not be the case for every dataset.

In [62]:
# Fit scikit-learn's GaussianNB on the same split for comparison.
# NOTE: X_train / X_test were split from the X that was converted to bin
# labels earlier in the notebook, so GaussianNB is fitted on those labels
# rather than on the raw measurements.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, Y_train)
# Y_pred is reassigned here and no longer holds the hand-written model's predictions.
Y_pred = clf.predict(X_test)
print(classification_report(Y_test, Y_pred))
print(confusion_matrix(Y_test, Y_pred))

             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.76      1.00      0.86        16
          2       1.00      0.67      0.80         9

avg / total       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]


In [1]:
# go through the documentation of GaussianNB