In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection
from sklearn import preprocessing

In [2]:
iris = datasets.load_iris()

In [3]:
x = iris.data
y = iris.target

In [4]:
def makeLabelled(x_col):
    second_limit = x_col.mean()
    first_limit = second_limit*0.5
    third_limit = second_limit*1.5
    for i in range(len(x_col)):
        if x_col[i]<first_limit:
            x_col[i]=0
        elif x_col[i]<second_limit:
            x_col[i]=1
        elif x_col[i]<third_limit:
            x_col[i]=2
        else:
            x_col[i]=3
    return x_col

In [5]:
for i in range(x.shape[1]):
    x[:,i] = makeLabelled(x[:,i])

In [6]:
x_train,x_test, y_train,y_test = model_selection.train_test_split(x,y,random_state=0)

In [7]:
def fit(x_train,y_train):
    result = {}
    result["total_data"] = len(y_train)
    classes = set(y_train)
    for current_class in classes:
        result[current_class]={}
        x_current_rows = x_train[y_train==current_class]
        result[current_class]["total_count"] = x_current_rows.shape[0]
        num_features = x_current_rows.shape[1]
        for current_feature in range(num_features):
            result[current_class][current_feature] = {}
            labels = set(x_train[:,current_feature])
            for current_label in labels:
                result[current_class][current_feature][current_label] = len(x_current_rows[x_current_rows[:,current_feature]==current_label])
    return result

In [8]:
def probability(x_point,current_class,dictionary):
    prob = (dictionary[current_class]["total_count"])/(dictionary["total_data"])
    num_features = len(dictionary[current_class].keys())-1
    for current_feature in range(num_features):
        num_labels = len(dictionary[current_class][current_feature].keys())-1
        xj = x_point[i]
        num = dictionary[current_class][i][xj]+1
        den = dictionary[current_class]["total_count"]+num_labels
        prob =  prob*(num/den)
    return prob

In [9]:
def classify(x_point,dictionary):
    classes = len(dictionary.keys())-1
    best_p = -1000
    best_class = -1
    for current_class in range(classes):
        p = probability(x_point,current_class,dictionary)
        if p>best_p:
            best_p = p
            best_class = current_class
    return best_class

In [10]:
def pred(x_test,dictionary):
    y_pred = []
    for x_point in x_test:
        prediction = classify(x_point,dictionary)
        y_pred.append(prediction)
    return y_pred

In [11]:
dictionary = fit(x_train,y_train)

In [12]:
dictionary

{'total_data': 112,
 0: {'total_count': 37,
  0: {1.0: 37, 2.0: 0},
  1: {1.0: 6, 2.0: 31},
  2: {0.0: 37, 1.0: 0, 2.0: 0, 3.0: 0},
  3: {0.0: 36, 1.0: 1, 2.0: 0, 3.0: 0}},
 1: {'total_count': 34,
  0: {1.0: 19, 2.0: 15},
  1: {1.0: 28, 2.0: 6},
  2: {0.0: 0, 1.0: 6, 2.0: 28, 3.0: 0},
  3: {0.0: 0, 1.0: 8, 2.0: 25, 3.0: 1}},
 2: {'total_count': 41,
  0: {1.0: 4, 2.0: 37},
  1: {1.0: 26, 2.0: 15},
  2: {0.0: 0, 1.0: 0, 2.0: 24, 3.0: 17},
  3: {0.0: 0, 1.0: 0, 2.0: 4, 3.0: 37}}}

In [13]:
y_pred = pred(x_test,dictionary)

In [14]:
from sklearn.metrics import classification_report,confusion_matrix

In [15]:
confusion_matrix(y_test,y_pred)

array([[13,  0,  0],
       [ 0, 16,  0],
       [ 0,  1,  8]])

In [16]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

   micro avg       0.97      0.97      0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

