In [61]:
import numpy as np
import pandas as pd

from scipy.optimize import minimize_scalar

import time

In [66]:
class DecisionTree:
    def __init__(self, max_depth=2, min_size=None, func='gini'):
        self.max_depth = max_depth
        self.min_size = min_size
        self.root = None
        self.func = func
        
    def fit(self, dataset, label):
        new_dataset = dataset.copy()
        new_dataset['label'] = label
        
        self.dataset = new_dataset.as_matrix()
        self.label = list(set(label))
        
        if self.min_size is None:
            self.min_size = len(self.dataset)/10
            
        self.root = self._split_tree(self.dataset)
        self._split(self.root, 1)
    
    def predict(self, dataset):
        if self.root is None:
            raise "Decison Tree belum di fit"
            
        rows = dataset.as_matrix()
        
        return np.asarray([self._predict(self.root, row) for row in rows])
            
                
    def _predict(self, node, row):
        if row[node['index']] < node['value']:
            if isinstance(node['left'], dict):
                return self._predict(node['left'], row)
            else:
                return node['left']
        else:
            if isinstance(node['right'], dict):
                return self._predict(node['right'], row)
            else:
                return node['right']

    def evaluate(self, test_data):
        pass
    
    def _calculate_gini_index(self, groups):
        instances = sum(len(group) for group in groups)
        gini = 0.0
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            score = 0.0
            for class_val in self.label:
                p = [row[-1] for row in group].count(class_val) / size
                score += p * p
            gini += (1.0 - score) * (size / instances)
        return gini
    
    def _calculate_sse_func(self, groups):
        sse = 0.0
        for group in groups:
            if len(group) == 0:
                continue
            y_mean = sum(row[-1] for row in group)/len(group)
            sse += sum((row[-1] - y_mean)**2 for row in group)
        return sse
    
    def _calculate_cost(self, groups):
        if self.func == 'gini':
            return self._calculate_gini_index(groups)
        elif self.func == 'sse':
            return self._calculate_sse_func(groups)
        raise 'Unknown function'
    
    def _split_tree(self, dataset):
        b_index, b_value, b_score, b_groups = 999, 999, None, None
        for index in range(len(dataset[0])-1):

            col_data = set([data[index] for data in dataset])
                
            for col in col_data:
                groups = self._test_split(index, col, dataset)
                cost = self._calculate_cost(groups)
                if b_score is None or cost < b_score:
                    b_index, b_value, b_score, b_groups = index, col, cost, groups
        return {'index':b_index, 'value':b_value, 'groups':b_groups}
    
    def _test_split(self, index, value, dataset):
        left, right = list(), list()
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    def _split(self, node, depth):
        left, right = node['groups']
        del(node['groups'])
        # check for a no split
        if not left or not right:
            node['left'] = node['right'] = self._to_terminal(left + right)
            return
        # check for max depth
        if depth >= self.max_depth:
            node['left'], node['right'] = self._to_terminal(left), self._to_terminal(right)
            return
        # process left child
        if len(left) <= self.min_size:
            node['left'] = self._to_terminal(left)
        else:
            node['left'] = self._split_tree(left)
            self._split(node['left'], depth+1)
        # process right child
        if len(right) <= self.min_size:
            node['right'] = self._to_terminal(right)
        else:
            node['right'] = self._split_tree(right)
            self._split(node['right'], depth+1)
    
    def _to_terminal(self, group):
        if self.func == 'gini':
            return self._to_terminal_gini(group)
        elif self.func == 'sse':
            return self._to_terminal_regression(group)
    
    def _to_terminal_gini(self, group):
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)
    
    def _to_terminal_regression(self, group):
        return sum(row[-1] for row in group)/len(group)

In [67]:
class GradientBoosting:
    def loss_fun(self, loss_function):
        if loss_function == 'mse':
            return lambda y,yi : (y-yi)
        raise "Unknown loss function"
        
    def __init__(self, iteration, learning_rate=0.1, loss_function='mse',
                max_depth_tree=2, min_size=None):
        self.iteration = iteration
        self.learning_rate = learning_rate
        self.loss_function = self.loss_fun(loss_function)
        self.max_depth_tree = max_depth_tree
        self.min_size = min_size
        self.models = []
        self.gammas = []
        
    def compute_residual(self, pred, label):
        return pred - label
        
    def fit(self, dataset, label):
        label = np.asarray(label)
        
        yi = np.asarray([0.5 for i in range(len(label))])
        y_prev = 0
        
        for i in range(self.iteration):
            print("Iteration  "+ str(i+1))
            begin = time.time()
            model = DecisionTree(self.max_depth_tree, self.min_size, 'sse')
            residual = self.compute_residual(yi, label)
            print("residual :" + str(residual))
            print("Sum of residual : " + str(-sum(residual)))
            model.fit(dataset, residual)
            
            y_predict = model.predict(dataset)
            print("h(x) = " + str(y_p))
                                    
            yi = yi - self.learning_rate * y_predict
            
            self.models.append(model)
            print("Time : " + str(time.time() - begin) + " s")
            print("accuracy : " + str(self.compute_accuracy(dataset, label, yi)))
            print()
            
    def predict(self, data):
        return (0.5 + sum(self.learning_rate * self.models[i].predict(data) for i in range(len(self.models))))
    
    def compute_accuracy(self, x, y, pred):
        f_bin = np.vectorize(lambda x : 0 if x < 0.5 else 1)
        pred_bin = f_bin(pred)
        return sum(pred_bin == y)/len(y)

In [68]:
df_clean = pd.read_csv("dataset/clean.csv")
df_clean = df_clean[:10000]
y = df_clean["label"]
df_clean

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,gender_Female,gender_Male,acetohexamide_Steady,tolbutamide_Steady,troglitazone_Steady,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,change_Ch,label
0,0,1,41.0,0.0,1,0,0,0,250.83,276.00,...,1,0,0,0,0,0,0,0,0,0
1,1,3,59.0,0.0,18,0,0,0,276.00,250.01,...,1,0,0,0,0,0,0,0,1,1
2,2,2,11.0,1.0,13,2,0,1,648.00,250.00,...,1,0,0,0,0,0,0,0,0,1
3,3,2,44.0,0.0,16,0,0,0,8.00,250.43,...,0,1,0,0,0,0,0,0,1,1
4,4,1,51.0,1.0,8,0,0,0,197.00,157.00,...,0,1,0,0,0,0,0,0,1,1
5,5,3,31.0,0.0,16,0,0,0,414.00,411.00,...,0,1,0,0,0,0,0,0,0,1
6,6,4,70.0,2.0,21,0,0,0,414.00,411.00,...,0,1,0,0,0,0,0,0,1,1
7,7,5,73.0,3.0,12,0,0,0,428.00,492.00,...,0,1,0,0,0,0,0,0,0,1
8,8,9,68.0,2.0,28,0,0,0,398.00,427.00,...,1,0,0,0,0,0,0,0,1,1
9,9,7,33.0,0.0,18,0,0,0,434.00,198.00,...,1,0,0,0,0,0,0,0,1,1


In [69]:
model = GradientBoosting(10, 0.2, max_depth_tree=2)
model.fit(df_clean, y)

Iteration  1
residual :[ 0.5 -0.5 -0.5 ...  0.5  0.5 -0.5]
Sum of residual : 2412.0
Time : 19.575669288635254 s
accuracy : 0.7412

Iteration  2
residual :[ 0.66840278 -0.25       -0.33159722 ...  0.53495261  0.53495261
 -0.46504739]
Sum of residual : 1206.0000000000261
Time : 19.825121641159058 s
accuracy : 0.7412

Iteration  3
residual :[ 0.70984692 -0.125      -0.32533346 ...  0.57639674  0.54121637
 -0.45878363]
Sum of residual : 602.9999999999907
Time : 20.408394813537598 s
accuracy : 0.7412

Iteration  4
residual :[ 0.80587419 -0.0625     -0.31766863 ...  0.58406158  0.5488812
 -0.4511188 ]
Sum of residual : 301.49999999996885
Time : 19.804664850234985 s
accuracy : 0.7434

Iteration  5
residual :[ 0.81283814 -0.03125    -0.31070468 ...  0.59102553  0.55584515
 -0.44415485]
Sum of residual : 150.749999999996
Time : 18.91430163383484 s
accuracy : 0.7438

Iteration  6
residual :[ 0.82754381 -0.01654433 -0.29599901 ...  0.57474786  0.53956748
 -0.46043252]
Sum of residual : 75.3750000

In [None]:
for dt in model.models:
    print(dt.root)

In [None]:
a = model.predict(df_clean)

In [32]:
a

array([-0.48612492, -0.65132156, -0.29645797, ..., -0.12056182,
       -0.02779925, -0.10829659])