In [9]:
import pandas as pd 
import numpy as np

In [10]:
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.metrics import confusion_matrix

In [11]:
def _load_split(path, sep, res_pos, has_header):
    """Read one delimited file and split it into (features, response).

    Parameters
    ----------
    path : str
        File to read with ``pd.read_csv``.
    sep : str
        Field separator passed to ``pd.read_csv``.
    res_pos : int
        Zero-based position of the response column.
    has_header : bool
        True when the first row of the file holds column names.

    Returns
    -------
    (DataFrame, Series)
        All columns except the response, and the response column itself.

    Raises
    ------
    ValueError
        If ``res_pos`` does not address a column of the file.
    """
    frame = pd.read_csv(path, sep=sep, header='infer' if has_header else None)
    n_cols = frame.shape[1]
    if not 0 <= res_pos < n_cols:
        raise ValueError(
            f'response column position must be between 1 and {n_cols}, '
            f'got {res_pos + 1} for file {path!r}'
        )
    # Resolve the label by position so the same code serves both named
    # columns and the integer labels pandas assigns when header=None.
    res_col = frame.columns[res_pos]
    return frame.drop(res_col, axis=1), frame[res_col]


def read_data():
    """Interactively ask for the data files and load train/test splits.

    Prompts (in this order) for the train file, the test file, the coding
    format (1 selects a single-space separator, any other integer selects a
    comma), the 1-based position of the response column, whether the files
    have a header row, and finally the output file name.

    The header answer is normalised (surrounding whitespace removed,
    case-insensitive); 'y' or 'yes' means the files have a header row and
    anything else means they do not.

    Returns
    -------
    tuple
        ``(tr_feature, tr_response, tst_feature, tst_response, out_name)``.

    Raises
    ------
    ValueError
        If a numeric prompt is not answered with an integer, or the response
        position is outside the columns of a file.
    """
    trdata_name = input('Enter the name of train data file [(ex) pid.dat]: ')
    tstdata_name = input('Enter the name of test data file [(ex) pidtest.dat]: ')
    coding_fm = int(input("Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): "))
    separator_fm = ' ' if coding_fm == 1 else ','
    # The prompt is 1-based; pandas positions are 0-based.
    res_pos = int(input('Enter the column position of the response variable : [from 1 to p]:')) - 1
    header = input('Does the data have column header? (y/n):')
    has_header = header.strip().lower() in ('y', 'yes')

    tr_feature, tr_response = _load_split(trdata_name, separator_fm, res_pos, has_header)
    tst_feature, tst_response = _load_split(tstdata_name, separator_fm, res_pos, has_header)

    out_name = input('Enter the output file name to export [(ex) result.txt]:')
    return tr_feature, tr_response, tst_feature, tst_response, out_name

In [12]:
def train_GB(tr_X, tr_y, B=101, lr=0.1, max_depth=3, random_state=19960314):
    """Fit a boosted ensemble: one classification tree, then residual regression trees.

    The first tree is a ``DecisionTreeClassifier``; column 1 of its
    ``predict_proba`` output seeds the running score (unscaled). Every later
    tree is a ``DecisionTreeRegressor`` fitted to ``tr_y - running score``,
    and its prediction is added to the running score scaled by ``lr``.

    Parameters
    ----------
    tr_X : array-like
        Training features.
    tr_y : array-like
        Training response. Assumed to be binary and coded 0/1 so that
        subtracting the running score yields a residual; the function itself
        does not check this.
    B : int, default 101
        Total number of trees (including the initial classifier). ``B < 1``
        yields an empty dict.
    lr : float, default 0.1
        Shrinkage applied to every regression tree's contribution. The same
        value must be passed to ``test_GB``; it is not stored in the result.
    max_depth : int, default 3
        Depth limit given to every tree.
    random_state : int, default 19960314
        Seed given to every tree.

    Returns
    -------
    dict
        ``{'Tree1': classifier, 'Tree2': regressor, ...}`` in fitting order.
    """
    tree_dict = {}
    running_score = None
    for i in range(1, B + 1):
        if i == 1:
            tree = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
            tree.fit(tr_X, tr_y)
            # Copy so the in-place accumulation below never writes into the
            # probability matrix returned by the classifier.
            running_score = tree.predict_proba(tr_X)[:, 1].copy()
        else:
            tree = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
            tree.fit(tr_X, tr_y - running_score)
            running_score += lr * tree.predict(tr_X)
        tree_dict[f'Tree{i}'] = tree
    return tree_dict

In [13]:
def test_GB(trained_tree_dict, X_test, threshold=0.5, lr=0.1):
    y_pred_cumsum = 0
    for tree_idx, tree in enumerate(trained_tree_dict.values()):
        if tree_idx == 0:
            pred = tree.predict_proba(X_test)[:,1]
        else:
            pred = lr * tree.predict(X_test)
        
        y_pred_cumsum += pred
    
    y_pred_cumsum[y_pred_cumsum>threshold] = 1
    y_pred_cumsum[y_pred_cumsum<=threshold] = 0
    return y_pred_cumsum

In [14]:
# Prompt for the file names/format via read_data() and unpack the train/test
# features and responses plus the name of the report file to write.
tr_x, tr_y, tst_x, tst_y, out_name = read_data()


In [15]:
# Recode the response in both splits: 2 becomes 1 and 1 becomes 0.
# NOTE: this cell rebinds tr_y / tst_y from themselves, so running it a second
# time without re-running the loading cell maps the new 1s to 0 as well.
label_recode = {2: 1, 1: 0}
tr_y, tst_y = (labels.replace(label_recode) for labels in (tr_y, tst_y))

In [16]:
# Boosting settings, named once so training and scoring share the same
# learning rate.
n_trees, learning_rate, cutoff = 101, 0.1, 0.5

tree_dict = train_GB(tr_x, tr_y, B=n_trees, lr=learning_rate)
y_test_pred_cumsum = test_GB(tree_dict, tst_x, threshold=cutoff, lr=learning_rate)

In [17]:
# Evaluate the test predictions first, then drop the results into the report
# template.
conf_mat = confusion_matrix(tst_y, y_test_pred_cumsum)
accuracy = np.mean(y_test_pred_cumsum == tst_y)

text = f'''Confusion Matrix (Gradient Boosting)
------------------------------------
{conf_mat}

Model Summary (Gradient Boosting)
---------------------------------
overall accuracy = {accuracy:.3f}
'''

In [18]:
# Write the report; the context manager closes the file even if write() raises.
with open(out_name, "w") as file:
    file.write(text)