In [7]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize

import warnings
warnings.filterwarnings("ignore")

In [2]:
#read dataset
def load_dataset(train_set_file_name, test_set_file_name):
    """Load a comma-separated train/test pair and split features from labels.

    Each row is expected to hold the feature values followed by the class
    label in the last column.

    Args:
        train_set_file_name: Path of the training file (read with np.loadtxt).
        test_set_file_name: Path of the test file, laid out like the training file.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y, max_class)`` where
        ``train_x`` / ``test_x`` are 2-D feature arrays, ``train_y`` /
        ``test_y`` are label columns of shape ``(n_rows, 1)``, and
        ``max_class`` is the largest label found in the training file, as int.
    """
    # ndmin=2 keeps a one-row file as a (1, n_cols) matrix; without it
    # np.loadtxt returns a 1-D array and the column slicing below fails.
    train_set = np.loadtxt(train_set_file_name, delimiter=',', ndmin=2)
    test_set = np.loadtxt(test_set_file_name, delimiter=',', ndmin=2)

    # Number of feature columns: every column except the trailing label.
    dims = train_set.shape[1] - 1
    # Largest label value seen in the training data.
    max_class = int(np.amax(train_set[:, dims]))

    train_x = train_set[:, :dims]
    train_y = train_set[:, -1:]
    # The test features are cut at the training file's feature count.
    test_x = test_set[:, :dims]
    test_y = test_set[:, -1:]

    return train_x, train_y, test_x, test_y, max_class

In [15]:
def naive_bayes(train_set_file_name, test_set_file_name):
    """Train a Gaussian naive Bayes classifier and report its test performance.

    Loads the train/test pair with ``load_dataset``, normalizes the features,
    fits ``GaussianNB`` on the training rows, predicts the test rows, then
    prints the accuracy and a labelled confusion matrix.

    Args:
        train_set_file_name: Path of the training file.
        test_set_file_name: Path of the test file.

    Returns:
        A tuple ``(predict, acc, confusion_arr)``: the predicted test labels,
        the accuracy as a fraction, and the confusion matrix whose rows are
        predicted labels and whose columns are actual labels.
    """
    train_x, train_y, test_x, test_y, max_class = load_dataset(train_set_file_name, test_set_file_name)

    # load_dataset hands back labels as (n, 1) columns; the estimator and the
    # metrics expect flat vectors (sklearn otherwise emits a
    # DataConversionWarning, which this notebook's global filter hides).
    train_labels = np.ravel(train_y)
    test_labels = np.ravel(test_y)

    # normalize() with its defaults rescales each sample (row) to unit L2
    # norm; it is stateless, so train and test are transformed independently.
    train_x_normalized = normalize(train_x)
    test_x_normalized = normalize(test_x)

    nb = GaussianNB()
    nb.fit(train_x_normalized, train_labels)
    predict = nb.predict(test_x_normalized)

    acc = accuracy_score(test_labels, predict)

    # Fix the label set explicitly so the matrix always has one row/column per
    # class id 0..highest_label. Without `labels=` the matrix only covers the
    # classes that occur in the test labels/predictions, which can disagree
    # with the index built below.
    highest_label = max_class
    if test_labels.size:
        highest_label = max(highest_label, int(test_labels.max()))
    # Same dtype as the loaded labels so label matching is exact.
    class_ids = np.arange(highest_label + 1, dtype=test_labels.dtype)

    # Arguments are deliberately (predicted, actual): rows are predictions and
    # columns are actual values, as the printed legend states.
    confusion_arr = confusion_matrix(predict, test_labels, labels=class_ids)
    print("Accuracy: " + str(acc*100) + "%")

    print("Confusion matrix (col header is actual val, row header is predicted val): ")
    label_list = [str(i) for i in range(highest_label + 1)]
    df = pd.DataFrame(confusion_arr, index=label_list, columns=label_list)
    print(df)

    return predict, acc, confusion_arr

In [16]:
# Results keyed by dataset name; each value is the
# (predictions, accuracy, confusion matrix) tuple returned by naive_bayes.
predict = dict(iris=naive_bayes('dataset/iris.trn', 'dataset/iris.tst'))

# Currently disabled datasets (each run was preceded by a "==============" separator):
#   "fp"     -> 'dataset/fp.trn',  'dataset/fp.tst'
#   "letter" -> 'dataset/let.trn', 'dataset/let.tst'
#   "optics" -> 'dataset/opt.trn', 'dataset/opt.tst'

print("==============")
predict.update(leukemia=naive_bayes('dataset/ALLAML.trn', 'dataset/ALLAML.tst'))

Accuracy: 96.0%
Confusion matrix (col header is actual val, row header is predicted val): 
    0   1   2
0  17   0   0
1   0  14   1
2   0   1  17
Accuracy: 91.17647058823529%
Confusion matrix (col header is actual val, row header is predicted val): 
    0   1
0  13   2
1   1  18
