In [1]:
import numpy as np
import pandas as pd
import itertools

In [2]:
def read_data():
    """Interactively load the train and test tables and split off the response.

    Prompts (in this order) for the train file, the test file, the field
    separator style, the 1-based position of the response column and whether
    the files carry a header row.  Both files are then read with pandas, and
    finally the output file name is requested.

    Returns:
        tuple: ``(tr_feature, tr_response, tst_feature, tst_response, out_name)``
        where the feature items are DataFrames without the response column,
        the response items are the removed column, and ``out_name`` is the
        string typed at the last prompt.
    """
    train_path = input('Enter the name of train data file [(ex) boston_tr.csv]: ')
    test_path = input('Enter the name of test data file [(ex) boston_tst.csv]: ')
    format_choice = int(input("Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): "))
    # Choice 1 means space-separated; anything else falls back to commas.
    delimiter = ' ' if format_choice == 1 else ","
    # The user counts columns from 1; pandas counts from 0.
    res_pos = int(input('Enter the column position of the response variable : [from 1 to p]:')) - 1
    has_header = input('Does the data have column header? (y/n):') == 'y'

    def load_and_split(path):
        """Read one file and return ``(features, response)``."""
        if has_header:
            frame = pd.read_csv(path, sep=delimiter)
            # With a header the response is addressed by its column label.
            target = frame.columns[res_pos]
        else:
            frame = pd.read_csv(path, sep=delimiter, header=None)
            # Without a header pandas labels columns 0..p-1, so the
            # position itself is the label.
            target = res_pos
        response = frame[target]
        features = frame.drop(target, axis = 1)
        return features, response

    tr_feature, tr_response = load_and_split(train_path)
    tst_feature, tst_response = load_and_split(test_path)

    out_name = input('Enter the output file name to export [(ex) result.txt]:')
    return tr_feature, tr_response, tst_feature, tst_response, out_name

In [3]:
class CartReg:
    """Single-split regression tree (a CART "stump").

    ``fit`` scans every column of the feature frame, finds the binary split
    that minimises the size-weighted variance of the response in the two
    children, and keeps only the best split over all columns.  ``predict``
    assigns each row the mean training response of the child it falls into.

    Attributes populated by ``fit``:
        best_col_name: label of the column chosen for the split.
        best_x_pt: numeric threshold (``None`` when the split is categorical).
        best_left, best_right: lists of categories sent to each child
            (``None`` when the split is numeric).
        best_y_left, best_y_right: training responses in each child, as
            NumPy arrays.
        num_best_y_left / num_best_y_right or
        cat_best_y_left / cat_best_y_right: aliases of the two arrays above,
            kept for code that used the older attribute names.
    """

    def __init__(self):
        self.best_col_name = None
        self.best_x_pt = None
        self.best_left = None
        self.best_right = None
        self.best_y_left = None
        self.best_y_right = None
        self._split_is_categorical = False
        self._root_mean = None

    @staticmethod
    def _weighted_variance(left_y, right_y, n_total):
        """Size-weighted average of the two child variances."""
        return (len(left_y) * np.var(left_y) + len(right_y) * np.var(right_y)) / n_total

    @classmethod
    def _best_numeric_split(cls, column, y_arr):
        """Best threshold split of one numeric column, or ``None`` if none exists."""
        values = np.asarray(column, dtype=float)
        order = values.argsort(kind="stable")
        sorted_x = values[order]
        sorted_y = y_arr[order]
        n_total = len(sorted_y)

        best = None
        # i is the size of the left child; every cut 1..n-1 is a candidate.
        for i in range(1, n_total):
            # Equal neighbours cannot be separated by a threshold, so a cut
            # between them would not match what predict() does later.
            if sorted_x[i - 1] == sorted_x[i]:
                continue
            left_y = sorted_y[:i]
            right_y = sorted_y[i:]
            score = cls._weighted_variance(left_y, right_y, n_total)
            if best is None or score < best["score"]:
                best = {
                    "score": score,
                    "threshold": (sorted_x[i - 1] + sorted_x[i]) / 2,
                    "left": None,
                    "right": None,
                    "y_left": left_y,
                    "y_right": right_y,
                }
        return best

    @classmethod
    def _best_categorical_split(cls, column, y_arr):
        """Best subset split of one string column, or ``None`` if none exists.

        Every non-empty proper subset of the observed categories is tried, so
        the cost grows exponentially with the number of distinct categories.
        """
        categories = column.unique()
        n_total = len(y_arr)

        best = None
        for size in range(1, len(categories)):
            for comb in itertools.combinations(categories, size):
                left = list(comb)
                right = [cat for cat in categories if cat not in comb]
                in_left = column.isin(left).to_numpy()
                left_y = y_arr[in_left]
                right_y = y_arr[~in_left]
                score = cls._weighted_variance(left_y, right_y, n_total)
                # One running best across all subset sizes keeps the stored
                # subsets and the stored score in agreement.
                if best is None or score < best["score"]:
                    best = {
                        "score": score,
                        "threshold": None,
                        "left": left,
                        "right": right,
                        "y_left": left_y,
                        "y_right": right_y,
                    }
        return best

    def fit(self, x, y):
        """Find and store the best single split of ``x`` for predicting ``y``.

        Args:
            x: pandas DataFrame of features.  A column whose first value is a
                ``str`` is treated as categorical, any other column as numeric.
            y: response values aligned row-by-row with ``x``.

        Raises:
            ValueError: if ``x`` and ``y`` differ in length, are empty, or no
                column admits a split (for example every column is constant).
        """
        y_arr = np.asarray(y, dtype=float)
        if x.shape[0] != len(y_arr):
            raise ValueError("x and y must have the same number of rows")
        if len(y_arr) == 0:
            raise ValueError("cannot fit on empty data")

        winner = None
        winner_idx = None
        for col_idx in range(x.shape[1]):
            column = x.iloc[:, col_idx]
            if isinstance(column.iloc[0], str):
                candidate = self._best_categorical_split(column, y_arr)
            else:
                candidate = self._best_numeric_split(column, y_arr)
            if candidate is None:
                continue
            # Strict "<" keeps the earliest column on ties.
            if winner is None or candidate["score"] < winner["score"]:
                winner = candidate
                winner_idx = col_idx

        if winner is None:
            raise ValueError("no column admits a split")

        # Everything stored below belongs to the winning column only.
        self.best_col_name = x.columns[winner_idx]
        self.best_x_pt = winner["threshold"]
        self.best_left = winner["left"]
        self.best_right = winner["right"]
        self.best_y_left = winner["y_left"]
        self.best_y_right = winner["y_right"]
        self._split_is_categorical = winner["left"] is not None
        self._root_mean = float(np.mean(y_arr))

        if self._split_is_categorical:
            self.cat_best_y_left = self.best_y_left
            self.cat_best_y_right = self.best_y_right
        else:
            self.num_best_y_left = self.best_y_left
            self.num_best_y_right = self.best_y_right

    def predict(self, x):
        """Predict the response for each row of ``x`` with the fitted split.

        Rows that match neither child (an unseen category or a missing value)
        receive the mean of the whole training response.

        Args:
            x: pandas DataFrame containing the column chosen by ``fit``.

        Returns:
            numpy.ndarray: one prediction per row of ``x``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.best_col_name is None:
            raise RuntimeError("fit must be called before predict")

        column = x[self.best_col_name]
        pred = np.full(x.shape[0], self._root_mean, dtype=float)

        if self._split_is_categorical:
            left = column.isin(self.best_left).to_numpy()
            right = column.isin(self.best_right).to_numpy()
        else:
            left = (column < self.best_x_pt).to_numpy()
            right = (column >= self.best_x_pt).to_numpy()

        pred[left] = np.mean(self.best_y_left)
        pred[right] = np.mean(self.best_y_right)
        return pred

    def rmse(self, y, pred):
        """Root mean squared error between ``y`` and ``pred``."""
        mse = np.mean((y - pred) ** 2)
        return np.sqrt(mse)

    def mae(self, y, pred):
        """Mean absolute error between ``y`` and ``pred``."""
        return np.mean(np.abs(y - pred))

    def mape(self, y, pred):
        """Mean absolute percentage error, returned as a fraction (not x100).

        Divides by ``y``, so zeros in ``y`` produce inf/nan in the result.
        """
        return np.mean(np.abs((y - pred) / y))

In [4]:
# Interactive step: read_data() prompts for file names, separator, response
# column position and header flag, then returns the train/test features and
# responses plus the report file name.
tr_x, tr_y, tst_x, tst_y, out_name = read_data()

Enter the name of train data file [(ex) boston_tr.csv]: boston_tr.csv
Enter the name of test data file [(ex) boston_tst.csv]: boston_tst.csv
Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): 2
Enter the column position of the response variable : [from 1 to p]:13
Does the data have column header? (y/n):y
Enter the output file name to export [(ex) result.txt]:result.txt


In [5]:
# Create the one-split regression tree model.
dt_reg = CartReg()

In [6]:
# Search the training features for the best single split of the response.
dt_reg.fit(tr_x, tr_y)

In [7]:
# Predict the test-set responses with the fitted split.
pred = dt_reg.predict(tst_x)

In [8]:
# Assemble the report line by line: the root node, the two children of the
# chosen split, then the test-set error metrics.  The whitespace-only entry
# and the trailing empty entry reproduce the blank separator line and the
# final newline of the report.
text = '\n'.join([
    'Tree Structure',
    f'    Node 1 : n = {len(tr_y)}, mean = {np.mean(tr_y):.3f}',
    f'        Node 2 : {dt_reg.best_col_name} < {dt_reg.best_x_pt}, n = {len(dt_reg.best_y_left)}, mean = {np.mean(dt_reg.best_y_left):.3f}',
    f'        Node 3 : {dt_reg.best_col_name} >= {dt_reg.best_x_pt}, n = {len(dt_reg.best_y_right)}, mean = {np.mean(dt_reg.best_y_right):.3f}',
    '        ',
    'Prediction Performance (Test)',
    '-----------------------------',
    f'    MAE = {dt_reg.mae(tst_y, pred):.3f}',
    f'    MAPE = {dt_reg.mape(tst_y, pred):.3f}',
    f'    RMSE = {dt_reg.rmse(tst_y, pred):.3f}',
    '',
])

In [9]:
# Display the report string; as a bare expression the notebook shows its
# repr, so newlines appear as "\n".
text

'Tree Structure\n    Node 1 : n = 343, mean = 21.743\n        Node 2 : lstat < 10.14, n = 150, mean = 27.657\n        Node 3 : lstat >= 10.14, n = 193, mean = 17.148\n        \nPrediction Performance (Test)\n-----------------------------\n    MAE = 5.032\n    MAPE = 0.291\n    RMSE = 6.299\n'

In [10]:
# Write the report to the file named at the last prompt.  The context manager
# closes the handle even if write() raises, which the manual
# open/write/close sequence did not guarantee.
with open(out_name, "w") as file:
    file.write(text)