In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from sklearn.tree import DecisionTreeClassifier

import warnings
# Suppress every warning for the rest of the session. This is global: it also
# hides warnings emitted by the libraries used in the cells below.
warnings.filterwarnings("ignore")

In [2]:
#read dataset
def load_dataset(train_set_file_name, test_set_file_name):
    """Load a comma-separated train/test file pair and split features from labels.

    Both files are parsed with ``np.loadtxt``. Every column except the last is
    treated as a feature; the last column is the class label. The number of
    feature columns is taken from the training file and applied to both sets.

    Parameters
    ----------
    train_set_file_name : str
        Path of the comma-separated training file.
    test_set_file_name : str
        Path of the comma-separated test file.

    Returns
    -------
    train_x, train_y, test_x, test_y : numpy.ndarray
        ``*_x`` are 2-D feature matrices of shape (rows, dims); ``*_y`` are
        label columns of shape (rows, 1).
    """
    # ndmin=2 keeps a one-row file as a (1, cols) matrix. Without it loadtxt
    # returns a 1-D array and the column slicing below raises IndexError.
    train_set = np.loadtxt(train_set_file_name, delimiter=',', ndmin=2)
    test_set = np.loadtxt(test_set_file_name, delimiter=',', ndmin=2)

    # number of columns - 1 = number of features; the last column is the label
    dims = train_set.shape[1] - 1

    train_x = train_set[:, :dims]
    train_y = train_set[:, -1:]
    test_x = test_set[:, :dims]
    test_y = test_set[:, -1:]

    return train_x, train_y, test_x, test_y

In [3]:
def decision_tree(train_set_file_name, test_set_file_name):
    """Fit a decision tree on one dataset and report its test-set performance.

    The train/test files are read with ``load_dataset``. A
    ``DecisionTreeClassifier(random_state=0)`` is fitted on the raw
    (un-normalized) training features and evaluated on the test features.
    The accuracy and a labelled confusion matrix are printed.

    Parameters
    ----------
    train_set_file_name : str
        Path of the training file passed to ``load_dataset``.
    test_set_file_name : str
        Path of the test file passed to ``load_dataset``.

    Returns
    -------
    predict : numpy.ndarray
        Predicted label for every test row.
    acc : float
        Value returned by ``accuracy_score`` for the test set.
    confusion_arr : numpy.ndarray
        Confusion matrix with predicted labels on the rows and actual labels
        on the columns.
    """
    train_x, train_y, test_x, test_y = load_dataset(train_set_file_name, test_set_file_name)

    # load_dataset returns labels as (rows, 1) columns; flatten them so the
    # estimator and the metrics all receive plain 1-D label vectors.
    train_labels = np.ravel(train_y)
    test_labels = np.ravel(test_y)

    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(train_x, train_labels)

    predict = clf.predict(test_x)

    acc = accuracy_score(test_labels, predict)

    # Fix the label set explicitly. Otherwise confusion_matrix infers it from
    # the labels that happen to occur in predict/test_labels, which can have a
    # different size than the headers when a trained class never shows up in
    # the test data (or the test data holds a class unseen during training).
    class_list = np.union1d(clf.classes_, test_labels)

    # Predictions are deliberately passed first so that rows are predicted
    # values and columns are actual values, matching the printed legend.
    confusion_arr = confusion_matrix(predict, test_labels, labels=class_list)
    print("Accuracy: " + str(acc*100) + "%")

    print("Confusion matrix (col header is actual val, row header is predicted val): ")
    label_list = [str(int(i)) for i in class_list]
    df = pd.DataFrame(confusion_arr, index=label_list, columns=label_list)
    print(df)

    return predict, acc, confusion_arr

In [4]:
# (result key, training file, test file) for every benchmark, in run order.
dataset_runs = (
    ("iris", 'dataset/iris.trn', 'dataset/iris.tst'),
    ("fp", 'dataset/fp.trn', 'dataset/fp.tst'),
    ("letter", 'dataset/let.trn', 'dataset/let.tst'),
    ("optics", 'dataset/opt.trn', 'dataset/opt.tst'),
    ("leukemia", 'dataset/ALLAML.trn', 'dataset/ALLAML.tst'),
)

# Each entry holds the tuple returned by decision_tree for that dataset.
predict = {}
for run_index, (dataset_key, train_file, test_file) in enumerate(dataset_runs):
    # Separator goes between consecutive reports only, not before the first.
    if run_index:
        print("==============")
    predict[dataset_key] = decision_tree(train_file, test_file)

Accuracy: 92.0%
Confusion matrix (col header is actual val, row header is predicted val): 
    0   1   2
0  17   0   0
1   0  15   4
2   0   0  14
Accuracy: 77.5%
Confusion matrix (col header is actual val, row header is predicted val): 
     1  2  3  4  5   6  7  8  9  10  11  12  13  14  15
1   28  1  0  0  0   0  0  0  0   0   0   0   0   0   0
2    0  3  2  0  0   0  0  0  0   0   0   0   0   0   0
3    0  0  8  0  0   0  0  0  0   0   0   0   0   0   0
4    0  0  0  7  0   0  0  0  0   0   0   0   0   0   0
5    0  0  0  0  6   0  0  1  0   0   0   0   0   0   0
6    0  0  0  0  0  11  0  0  1   0   0   0   0   0   0
7    0  0  0  0  1   0  7  0  0   0   1   0   0   0   4
8    0  0  0  0  0   0  0  7  0   0   0   0   0   0   0
9    0  0  0  0  0   0  0  0  6   0   0   0   0   0   0
10   1  0  0  0  0   1  0  0  0   3   1   0   2   0   0
11   0  0  0  0  1   0  0  0  0   0   7   0   1   0   1
12   0  0  0  0  0   1  1  0  0   0   0   7   1   1   0
13   0  0  1  0  1   1  1  0  0   