In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

## Read Data

In [2]:
def _load_split(path, sep, res_pos, has_header):
    """Read one delimited file and split it into (features, response).

    Parameters
    ----------
    path : str
        File to read with ``pd.read_csv``.
    sep : str
        Field separator.
    res_pos : int
        Zero-based position of the response column.
    has_header : bool
        True when the first row of the file holds column names.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The frame without the response column, and the response column.

    Raises
    ------
    IndexError
        If ``res_pos`` does not address an existing column.
    """
    data = pd.read_csv(path, sep=sep, header=0 if has_header else None)
    if not 0 <= res_pos < data.shape[1]:
        raise IndexError(
            f'response column position {res_pos + 1} is outside 1..{data.shape[1]} for {path!r}'
        )
    # Without a header the column labels are 0..p-1, so looking the label up by
    # position works identically for both the header and the no-header case.
    res_col = data.columns[res_pos]
    return data.drop(res_col, axis=1), data[res_col]


def read_data():
    """Interactively ask for the data files and load the train/test sets.

    Prompts (in this order) for the train file, the test file, the coding
    format (1 = space separated, anything else = comma separated), the 1-based
    position of the response column, whether the files have a header row, and
    finally - after both files were read - the output file name.

    The header answer is accepted case-insensitively and with surrounding
    whitespace ('y', 'Y', ' y '); every other answer means "no header".

    Returns
    -------
    tuple
        ``(tr_feature, tr_response, tst_feature, tst_response, out_name)``.

    Raises
    ------
    ValueError
        If the coding format or the response position is not an integer.
    IndexError
        If the response position is outside the columns of a file.
    """
    trdata_name = input('Enter the name of train data file [(ex) pid.dat]: ')
    tstdata_name = input('Enter the name of test data file [(ex) pidtest.dat]: ')
    coding_fm = int(input("Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): "))
    separator_fm = ' ' if coding_fm == 1 else ','
    # The user enters a 1-based position; pandas positions are 0-based.
    res_pos = int(input('Enter the column position of the response variable : [from 1 to p]:')) - 1
    header = input('Does the data have column header? (y/n):')
    has_header = header.strip().lower() == 'y'

    tr_feature, tr_response = _load_split(trdata_name, separator_fm, res_pos, has_header)
    tst_feature, tst_response = _load_split(tstdata_name, separator_fm, res_pos, has_header)

    out_name = input('Enter the output file name to export [(ex) result.txt]:')
    return tr_feature, tr_response, tst_feature, tst_response, out_name

## Decision Tree

In [3]:
class DecisionTree:
    """Single-split decision tree (decision stump) selected by Gini impurity.

    ``fit`` scans every feature column, finds the threshold whose two-way
    partition has the lowest weighted Gini impurity, and stores it.
    ``predict`` assigns to each row the majority training label of the side
    of the threshold the row falls on.

    Attributes set by ``fit``
    -------------------------
    best_x_idx : int
        Positional index of the selected feature column.
    best_idx : int
        Position in the sorted selected column where the right child starts.
    best_gini : float
        Weighted Gini impurity of the selected split.
    best_split : float
        Threshold; midpoint of the two feature values around the split.
    classes : numpy.ndarray
        Sorted unique training labels.
    y, cnt
        Majority label and per-class counts of the root node.
    left_y, left_cnt, right_y, right_cnt
        Majority label and per-class counts of the two children. The counts
        are aligned with ``classes`` (a class absent from a child counts 0).
    """

    def __init__(self):
        # Fitted state; everything stays None until fit() succeeds.
        self.best_x_idx = None
        self.best_idx = None
        self.best_gini = None
        self.best_split = None
        self.classes = None
        self.y = self.cnt = None
        self.left_y = self.left_cnt = None
        self.right_y = self.right_cnt = None

    @staticmethod
    def _column(x, i):
        """Return the i-th column of ``x`` by POSITION as a numpy array."""
        if hasattr(x, "iloc"):
            # DataFrame: column labels need not be 0..p-1 (header names, or a
            # response column dropped from the middle), so never index by label.
            return x.iloc[:, i].to_numpy()
        return np.asarray(x)[:, i]

    @staticmethod
    def _class_counts(y, classes):
        """Count the occurrences of every label of ``classes`` in array ``y``."""
        return np.array([np.count_nonzero(y == label) for label in classes])

    def get_gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``."""
        p = np.unique(y, return_counts=True)[1] / len(y)
        return 1 - np.sum(p ** 2)

    def get_split(self, sort_x, sort_y):
        """Find the best split position of one feature.

        Parameters
        ----------
        sort_x : array-like
            Feature values in ascending order.
        sort_y : array-like
            Labels in the same order as ``sort_x``.

        Returns
        -------
        (fit_idx, fit_gini, fit_y)
            ``fit_idx`` is the position where the right child starts,
            ``fit_gini`` the weighted Gini impurity of that partition and
            ``fit_y`` is ``((left_label, left_counts), (right_label,
            right_counts))`` as produced by ``get_node``. When the feature
            has no two distinct values the result is ``(None, numpy.inf, None)``.
        """
        sort_y = np.asarray(sort_y)
        total = len(sort_y)
        fit_idx, fit_gini, fit_y = None, np.inf, None

        for idx in range(1, total):
            # A split is only possible between two different feature values.
            if sort_x[idx] == sort_x[idx - 1]:
                continue

            y_left, y_right = sort_y[:idx], sort_y[idx:]
            gini = (
                len(y_left) / total * self.get_gini(y_left)
                + len(y_right) / total * self.get_gini(y_right)
            )

            # Strict comparison keeps the first of equally good positions.
            if gini < fit_gini:
                fit_idx = idx
                fit_gini = gini
                fit_y = (self.get_node(y_left), self.get_node(y_right))

        return fit_idx, fit_gini, fit_y

    def get_node(self, y):
        """Return ``(majority_label, counts)`` for the labels ``y``.

        ``counts`` holds one entry per label that is present in ``y``, in
        sorted label order.
        """
        labels, cnts = np.unique(y, return_counts=True)
        return labels[cnts.argmax()], cnts

    def fit(self, x, y):
        """Select the single best split over all feature columns.

        Parameters
        ----------
        x : pandas.DataFrame or 2-D array-like
            Feature matrix, one row per observation.
        y : array-like
            Labels, one per row of ``x``.

        Raises
        ------
        ValueError
            If ``x`` and ``y`` differ in length, or no feature has at least
            two distinct values (so no split exists).
        """
        # Work on a plain array: indexing a Series with the argsort result
        # would be label-based and break for any non-default index.
        y_arr = np.asarray(y)
        if x.shape[0] != len(y_arr):
            raise ValueError("x and y must contain the same number of rows")

        classes = np.unique(y_arr)
        best = None

        for i in range(x.shape[1]):
            x_col = self._column(x, i)
            order = x_col.argsort()
            sort_x = x_col[order]
            sort_y = y_arr[order]

            fit_idx, fit_score, fit_y = self.get_split(sort_x, sort_y)
            if fit_idx is None:
                # Constant column: nothing to split on.
                continue

            # Strict comparison keeps the first of equally good columns.
            if best is None or fit_score < best[1]:
                split_pt = (sort_x[fit_idx - 1] + sort_x[fit_idx]) / 2
                best = (i, fit_score, fit_idx, split_pt, fit_y, sort_y)

        if best is None:
            raise ValueError("no feature with at least two distinct values to split on")

        (self.best_x_idx, self.best_gini, self.best_idx,
         self.best_split, fit_y, sort_y) = best
        (self.left_y, _), (self.right_y, _) = fit_y

        self.classes = classes
        # Count against the full class list so that cnt[k] always refers to
        # classes[k], even when a child node is pure.
        self.left_cnt = self._class_counts(sort_y[:self.best_idx], classes)
        self.right_cnt = self._class_counts(sort_y[self.best_idx:], classes)
        self.y, self.cnt = self.get_node(y_arr)

    def predict(self, x):
        """Predict a label for every row of ``x``.

        Rows whose selected feature is ``<= best_split`` get ``left_y``, all
        other rows (including rows where the feature is NaN) get ``right_y``.
        The result has the dtype of the labels.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if getattr(self, "best_x_idx", None) is None:
            raise RuntimeError("fit must be called before predict")

        x_col = self._column(x, self.best_x_idx)
        return np.where(x_col <= self.best_split, self.left_y, self.right_y)

    def accuracy(self, y, pred_y):
        """Return the fraction of entries of ``pred_y`` that equal ``y``."""
        return np.mean(np.asarray(pred_y) == np.asarray(y))

## Output

In [4]:
# Interactively prompt for the input files and their format, then unpack the
# train/test feature frames, their response columns and the output file name.
tr_x, tr_y, tst_x, tst_y, out_name = read_data()

Enter the name of train data file [(ex) pid.dat]: pid.dat
Enter the name of test data file [(ex) pidtest.dat]: pidtest.dat
Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): 2
Enter the column position of the response variable : [from 1 to p]:8
Does the data have column header? (y/n):n
Enter the output file name to export [(ex) result.txt]:result.txt


In [5]:
# Create the (still unfitted) decision tree model.
dt = DecisionTree()

In [6]:
# Fit the tree on the training features and training responses.
dt.fit(tr_x, tr_y)

In [7]:
# Predict the test set once and reuse the result for both the confusion matrix
# and the accuracy instead of running the model twice.
tst_pred = dt.predict(tst_x)

# Report: root node (majority label and the first two class counts), the two
# child nodes with their split rule, then the test-set evaluation.
text = f'''Tree Structure
    Node 1: {dt.y} ({dt.cnt[0]}, {dt.cnt[1]})
    Node 2: x{dt.best_x_idx+1} <= {dt.best_split}, {dt.left_y} ({dt.left_cnt[0]}, {dt.left_cnt[1]})
    Node 3: x{dt.best_x_idx+1} > {dt.best_split}, {dt.right_y} ({dt.right_cnt[0]}, {dt.right_cnt[1]})
    
Confusion Matrix (Test)
-----------------------
{confusion_matrix(tst_y, tst_pred)}

Model Summary (Test)
--------------------
Overall accuracy = {dt.accuracy(tst_y, tst_pred):.3f}
'''

In [8]:
# Write the report to the chosen output file; the with-block closes the handle
# even when the write fails.
with open(out_name, "w") as file:
    file.write(text)