In [1]:
import numpy as np

In [2]:
def fit(X_train, Y_train, verbose=True):
    """Count feature-value occurrences per class for a categorical naive Bayes model.

    Args:
        X_train: 2-D array-like of shape (n_samples, n_features) holding
            discrete feature values.
        Y_train: 1-D array-like of n_samples class labels.
        verbose: When True (the default, matching the historical behaviour)
            the resulting dictionary is printed before it is returned.

    Returns:
        A nested dictionary:
            result["total_data"]               -> number of training samples
            result[cls]["total_count"]         -> number of samples of class ``cls``
            result[cls][j][value]              -> number of samples of class ``cls``
                                                  whose j-th feature equals ``value``
        Feature indices ``j`` are 1-based.
    """
    # Accept plain lists as well as ndarrays; boolean masking below needs arrays.
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)

    num_features = X_train.shape[1]
    # The candidate values of a feature are taken from the WHOLE training set,
    # not just the rows of one class, so that a value a class never exhibits
    # is still recorded with an explicit count of 0.
    values_per_feature = {
        j: set(X_train[:, j - 1]) for j in range(1, num_features + 1)
    }

    # Set once up front so the key exists even when there are no classes.
    result = {"total_data": len(Y_train)}
    for current_class in set(Y_train):
        class_rows = (Y_train == current_class)
        X_train_current = X_train[class_rows]
        class_stats = {"total_count": len(X_train_current)}
        for j, possible_values in values_per_feature.items():
            feature_column = X_train_current[:, j - 1]
            class_stats[j] = {
                value: (feature_column == value).sum()
                for value in possible_values
            }
        result[current_class] = class_stats

    if verbose:
        print(result)
    return result

In [3]:
def probability(dictionary, x, current_class):
    """Return the unnormalised log-probability of ``current_class`` given ``x``.

    The score is the log prior of the class plus, for every feature, the log
    of its add-one (Laplace) smoothed likelihood:

        log(count(class) / total_data)
        + sum_j log((count(class, x_j) + 1) / (count(class) + n_values_j))

    Args:
        dictionary: Count structure with a "total_data" entry and, per class,
            a "total_count" entry plus one ``{value: count}`` mapping for each
            1-based feature index.
        x: Sequence of feature values for a single sample.
        current_class: Class label to score; must be a key of ``dictionary``.

    Returns:
        The log-probability score as a float.
    """
    class_stats = dictionary[current_class]
    class_total = class_stats["total_count"]
    output = np.log(class_total) - np.log(dictionary["total_data"])

    # Every key other than "total_count" is a 1-based feature index.
    num_feature = len(class_stats) - 1
    for j in range(1, num_feature + 1):
        value_counts = class_stats[j]
        # A value that never occurred in training has no entry at all; treat
        # it as a zero count so smoothing still yields a finite probability
        # instead of raising KeyError.
        smoothed_count = value_counts.get(x[j - 1], 0) + 1
        smoothed_total = class_total + len(value_counts)
        output += np.log(smoothed_count) - np.log(smoothed_total)
    return output
        

In [4]:
def predictSinglePoint(dictionary, x):
    """Pick the class with the highest log-probability score for one sample.

    Args:
        dictionary: Count structure; the 'total_data' key is skipped and every
            other key is treated as a class label.
        x: Feature values of the sample to classify.

    Returns:
        The best-scoring class label (the first one encountered wins ties),
        or -1 when the dictionary contains no class labels.
    """
    winner = -1
    winner_score = None  # None until the first real class has been scored
    for label in dictionary:
        if label == 'total_data':
            continue
        score = probability(dictionary, x, label)
        if winner_score is None or score > winner_score:
            winner, winner_score = label, score
    return winner
    

In [5]:
def predict(dictionary, X_test):
    """Classify every row of ``X_test`` and return the labels as a list.

    Args:
        dictionary: Count structure passed through to predictSinglePoint.
        X_test: Iterable of samples, one sequence of feature values per sample.

    Returns:
        A list with one predicted class label per sample, in input order.
    """
    return [predictSinglePoint(dictionary, sample) for sample in X_test]

In [6]:
def makelabelled(column):
    """Bucket a numeric column into labels 0-3 around its mean, in place.

    With ``m = column.mean()`` the labels are assigned by the first matching
    rule:
        value < 0.5 * m  -> 0
        value < m        -> 1
        value < 1.5 * m  -> 2
        otherwise        -> 3

    Args:
        column: 1-D NumPy array of numeric values. It is overwritten with the
            labels, so passing a view of a larger array updates that array.

    Returns:
        The same ``column`` object, now holding the labels.
    """
    # Thresholds are fixed from the original values before anything is written.
    second = column.mean()
    first = 0.5 * second
    third = 1.5 * second

    # np.select takes the first condition that holds, mirroring an if/elif
    # chain, and evaluates every condition on the untouched input values.
    labels = np.select(
        [column < first, column < second, column < third],
        [0, 1, 2],
        default=3,
    )
    column[:] = labels
    return column
    
    

In [7]:
from sklearn import datasets
# Load the iris dataset and pull out the feature matrix and the class labels.
iris = datasets.load_iris()
X, Y = iris.data, iris.target


In [8]:
# Replace every feature column of X with the bucket labels from makelabelled.
# NOTE: X is overwritten, so running this cell a second time would bucket the
# labels themselves; reload the data first if the cell has to be re-run.
for feature_idx in range(X.shape[-1]):
    X[:, feature_idx] = makelabelled(X[:, feature_idx])
    

In [9]:
from sklearn import model_selection 
# Hold out 25% of the samples for evaluation; random_state fixes the split.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.25, random_state=0
)

In [10]:
# Build the per-class value counts from the training split.
dictionary = fit(x_train, y_train)

{0: {'total_count': 37, 1: {1.0: 37, 2.0: 0}, 2: {1.0: 6, 2.0: 31}, 3: {0.0: 37, 1.0: 0, 2.0: 0, 3.0: 0}, 4: {0.0: 36, 1.0: 1, 2.0: 0, 3.0: 0}}, 'total_data': 112, 1: {'total_count': 34, 1: {1.0: 19, 2.0: 15}, 2: {1.0: 28, 2.0: 6}, 3: {0.0: 0, 1.0: 6, 2.0: 28, 3.0: 0}, 4: {0.0: 0, 1.0: 8, 2.0: 25, 3.0: 1}}, 2: {'total_count': 41, 1: {1.0: 4, 2.0: 37}, 2: {1.0: 26, 2.0: 15}, 3: {0.0: 0, 1.0: 0, 2.0: 24, 3.0: 17}, 4: {0.0: 0, 1.0: 0, 2.0: 4, 3.0: 37}}}


In [11]:
# Classify the held-out samples with the hand-written model.
y_pred = predict(dictionary, x_test)

In [12]:
from sklearn.metrics import classification_report,confusion_matrix
# Per-class metrics followed by the confusion matrix for the predictions above.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

    accuracy                           0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


In [16]:
from sklearn.naive_bayes import GaussianNB
# Comparison run: fit scikit-learn's GaussianNB on the same split.
# Note that clf and y_pred are rebound here, replacing the earlier predictions.
clf = GaussianNB()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.76      1.00      0.86        16
           2       1.00      0.67      0.80         9

    accuracy                           0.87        38
   macro avg       0.92      0.84      0.86        38
weighted avg       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]


In [17]:
from sklearn.naive_bayes import MultinomialNB
# Comparison run: fit scikit-learn's MultinomialNB on the same split.
# NOTE(review): MultinomialNB models features as counts, whereas the columns
# here hold bucket labels 0-3; CategoricalNB is probably the closer analogue
# of the hand-written classifier - confirm which comparison is intended.
clf = MultinomialNB()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.00      0.00      0.00        16
           2       0.36      1.00      0.53         9

    accuracy                           0.53        38
   macro avg       0.45      0.62      0.48        38
weighted avg       0.43      0.53      0.44        38

[[11  2  0]
 [ 0  0 16]
 [ 0  0  9]]
