In [1]:
import numpy as np
import pandas as pd

from scipy.optimize import minimize_scalar

import time

In [2]:
class DecisionTree:
    """Binary decision tree grown by exhaustive search over observed feature values.

    The tree is stored as nested dicts. Every internal node has the keys
    ``'index'`` (feature column), ``'value'`` (threshold), ``'left'`` and
    ``'right'``; a child is either another node dict or a terminal prediction.
    Rows with ``row[index] < value`` go left, all others go right.

    Parameters
    ----------
    max_depth : int
        Maximum depth of the tree (the root split is depth 1).
    min_size : number or None
        A group with at most this many rows is turned into a leaf. When
        ``None``, ``fit`` replaces it with one tenth of the training set size.
    func : {'gini', 'sse'}
        Split criterion: Gini index (majority-vote leaves) or sum of squared
        errors (mean-valued leaves).
    """

    def __init__(self, max_depth=2, min_size=None, func='gini'):
        self.max_depth = max_depth
        self.min_size = min_size
        self.root = None
        self.func = func

    def fit(self, dataset, label):
        """Grow the tree on ``dataset`` (a DataFrame of features) and ``label``.

        The label is appended as the last column of a copy of ``dataset`` so
        that every row carries its target at position ``-1``.
        """
        new_dataset = dataset.copy()
        new_dataset['label'] = label

        # DataFrame.as_matrix() was removed in pandas 1.0; np.asarray gives the
        # same 2-D array and works on every pandas version.
        self.dataset = np.asarray(new_dataset)
        self.label = list(set(label))

        # NOTE: the default is stored back on the instance, so a later fit()
        # on a different dataset reuses the value computed here.
        if self.min_size is None:
            self.min_size = len(self.dataset)/10

        self.root = self._split_tree(self.dataset)
        self._split(self.root, 1)

    def predict(self, dataset):
        """Return a NumPy array with one prediction per row of ``dataset``.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            # Raising a bare string is a TypeError in Python 3; raise a real
            # exception. (Message is Indonesian for "tree has not been fit".)
            raise RuntimeError("Decison Tree belum di fit")

        rows = np.asarray(dataset)

        return np.asarray([self._predict(self.root, row) for row in rows])

    def _predict(self, node, row):
        """Walk from ``node`` down to a terminal value for a single ``row``."""
        branch = 'left' if row[node['index']] < node['value'] else 'right'
        child = node[branch]
        if isinstance(child, dict):
            return self._predict(child, row)
        return child

    def evaluate(self, test_data):
        """Placeholder; not implemented (returns ``None``)."""
        pass

    def _calculate_gini_index(self, groups):
        """Size-weighted Gini index of ``groups`` over the classes in ``self.label``."""
        instances = sum(len(group) for group in groups)
        gini = 0.0
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            # Extract the labels once per group instead of once per class.
            outcomes = [row[-1] for row in group]
            score = 0.0
            for class_val in self.label:
                p = outcomes.count(class_val) / size
                score += p * p
            gini += (1.0 - score) * (size / instances)
        return gini

    def _calculate_sse_func(self, groups):
        """Sum over groups of squared deviations of the target from the group mean."""
        sse = 0.0
        for group in groups:
            if len(group) == 0:
                continue
            y_mean = sum(row[-1] for row in group)/len(group)
            sse += sum((row[-1] - y_mean)**2 for row in group)
        return sse

    def _calculate_cost(self, groups):
        """Dispatch to the split criterion selected by ``self.func``.

        Raises
        ------
        ValueError
            If ``self.func`` is neither ``'gini'`` nor ``'sse'``.
        """
        if self.func == 'gini':
            return self._calculate_gini_index(groups)
        elif self.func == 'sse':
            return self._calculate_sse_func(groups)
        raise ValueError('Unknown function')

    def _split_tree(self, dataset):
        """Find the lowest-cost split of ``dataset``.

        Every distinct value of every feature column is tried as a threshold.
        Returns a node dict with ``'index'``, ``'value'`` and the resulting
        ``'groups'`` (left, right); ``_split`` consumes and removes ``'groups'``.
        """
        # 999 is only a placeholder; it is overwritten by the first candidate.
        b_index, b_value, b_score, b_groups = 999, 999, None, None
        for index in range(len(dataset[0])-1):

            col_data = set([data[index] for data in dataset])

            for col in col_data:
                groups = self._test_split(index, col, dataset)
                cost = self._calculate_cost(groups)
                if b_score is None or cost < b_score:
                    b_index, b_value, b_score, b_groups = index, col, cost, groups
        return {'index':b_index, 'value':b_value, 'groups':b_groups}

    def _test_split(self, index, value, dataset):
        """Partition rows into (left, right) lists by ``row[index] < value``."""
        left, right = list(), list()
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    def _split(self, node, depth):
        """Recursively attach children to ``node`` in place.

        Recursion stops when one side of the split is empty, when ``depth``
        reaches ``self.max_depth``, or when a group has at most
        ``self.min_size`` rows.
        """
        left, right = node['groups']
        del(node['groups'])
        # check for a no split
        if not left or not right:
            node['left'] = node['right'] = self._to_terminal(left + right)
            return
        # check for max depth
        if depth >= self.max_depth:
            node['left'], node['right'] = self._to_terminal(left), self._to_terminal(right)
            return
        # process left child
        if len(left) <= self.min_size:
            node['left'] = self._to_terminal(left)
        else:
            node['left'] = self._split_tree(left)
            self._split(node['left'], depth+1)
        # process right child
        if len(right) <= self.min_size:
            node['right'] = self._to_terminal(right)
        else:
            node['right'] = self._split_tree(right)
            self._split(node['right'], depth+1)

    def _to_terminal(self, group):
        """Build the leaf prediction for ``group`` according to ``self.func``.

        Raises
        ------
        ValueError
            If ``self.func`` is neither ``'gini'`` nor ``'sse'`` (previously
            this silently produced ``None`` leaves).
        """
        if self.func == 'gini':
            return self._to_terminal_gini(group)
        elif self.func == 'sse':
            return self._to_terminal_regression(group)
        raise ValueError('Unknown function')

    def _to_terminal_gini(self, group):
        """Most frequent label in ``group``."""
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)

    def _to_terminal_regression(self, group):
        """Mean target value of ``group``."""
        return sum(row[-1] for row in group)/len(group)

In [3]:
class GradientBoosting:
    """Gradient boosting over ``DecisionTree`` regressors (``func='sse'``).

    The ensemble starts from the constant prediction ``INITIAL_PREDICTION``.
    At each iteration a tree is fitted to ``prediction - label`` and the
    running prediction is moved by ``-learning_rate * tree_output``.

    Parameters
    ----------
    iteration : int
        Number of boosting rounds (trees) to fit.
    learning_rate : float
        Shrinkage applied to each tree's output.
    loss_function : str
        Name of the loss; only ``'mse'`` is recognised.
    max_depth_tree : int
        ``max_depth`` passed to every tree.
    min_size : number or None
        ``min_size`` passed to every tree.
    """

    # Constant starting prediction shared by fit() and predict().
    INITIAL_PREDICTION = 0.5

    def loss_fun(self, loss_function):
        """Return the callable registered for ``loss_function``.

        Raises
        ------
        ValueError
            If the name is not ``'mse'``.
        """
        if loss_function == 'mse':
            return lambda y,yi : (y-yi)
        # Raising a bare string is a TypeError in Python 3; raise a real exception.
        raise ValueError("Unknown loss function")

    def __init__(self, iteration, learning_rate=0.1, loss_function='mse',
                max_depth_tree=2, min_size=None):
        self.iteration = iteration
        self.learning_rate = learning_rate
        # Stored for reference; fit() uses compute_residual() rather than this callable.
        self.loss_function = self.loss_fun(loss_function)
        self.max_depth_tree = max_depth_tree
        self.min_size = min_size
        self.models = []
        self.gammas = []

    def compute_residual(self, pred, label):
        """Return ``pred - label``, the target each boosting-round tree is fitted to."""
        return pred - label

    def fit(self, dataset, label):
        """Fit ``self.iteration`` trees, printing progress for every round.

        Any trees from a previous ``fit`` call are discarded first, because
        ``predict`` always restarts from ``INITIAL_PREDICTION`` and would
        otherwise add up trees from unrelated runs.
        """
        label = np.asarray(label)
        self.models = []

        yi = np.full(len(label), self.INITIAL_PREDICTION)

        for i in range(self.iteration):
            print("Iteration  "+ str(i+1))
            begin = time.time()
            model = DecisionTree(self.max_depth_tree, self.min_size, 'sse')
            residual = self.compute_residual(yi, label)
            print("residual :" + str(residual))
            print("Sum of residual : " + str(-sum(residual)))
            model.fit(dataset, residual)

            y_predict = model.predict(dataset)
            print("h(x) = " + str(y_predict))

            # The tree approximates (prediction - label), so step against it.
            yi = yi - self.learning_rate * y_predict
            print("Fm(x) = " + str(yi))

            self.models.append(model)
            print("Time : " + str(time.time() - begin) + " s")
            print("accuracy : " + str(self.compute_accuracy(dataset, label, yi)))
            print()

    def predict(self, data):
        """Return the raw ensemble score for ``data``.

        The score is ``INITIAL_PREDICTION`` minus the learning-rate-scaled sum
        of all fitted trees' outputs. With no fitted trees the result is the
        scalar ``INITIAL_PREDICTION``.
        """
        correction = sum(self.learning_rate * tree.predict(data) for tree in self.models)
        return (self.INITIAL_PREDICTION - correction)

    def compute_accuracy(self, x, y, pred):
        """Fraction of entries where ``pred`` thresholded at 0.5 equals ``y``.

        Scores below 0.5 map to class 0, everything else to class 1.
        ``x`` is accepted for interface compatibility but not used.
        """
        pred_bin = np.where(np.asarray(pred) < 0.5, 0, 1)
        return np.sum(pred_bin == y)/len(y)

In [6]:
# Load the cleaned dataset, keep its first 10,000 rows, and move the target
# column out of the feature frame into `y`.
df_clean = pd.read_csv("dataset/clean.csv").iloc[:10000]
y = df_clean.pop("label")

NameError: name 'df_clean0' is not defined

In [5]:
# 10 boosting rounds of depth-4 trees; groups of 500 rows or fewer become leaves.
model = GradientBoosting(
    iteration=10,
    learning_rate=0.1,
    max_depth_tree=4,
    min_size=500,
)
model.fit(df_clean, y)

Iteration  1
residual :[ 0.5 -0.5 -0.5 ... -0.5 -0.5 -0.5]
Sum of residual : 25812.0
h(x) = [-0.10119048 -0.5        -0.06680614 ... -0.5        -0.5
 -0.5       ]
Fm(x) = [0.51011905 0.55       0.50668061 ... 0.55       0.55       0.55      ]
Time : 461.963330745697 s
accuracy : 0.7686734948788408

Iteration  2
residual :[ 0.51011905 -0.45       -0.49331939 ... -0.45       -0.45
 -0.45      ]
Sum of residual : 23230.80000001687
h(x) = [-0.09107143 -0.45       -0.06012552 ... -0.45       -0.45
 -0.45      ]
Fm(x) = [0.51922619 0.595      0.51269317 ... 0.595      0.595      0.595     ]
Time : 560.0277526378632 s
accuracy : 0.7686734948788408

Iteration  3
residual :[ 0.51922619 -0.405      -0.48730683 ... -0.405      -0.405
 -0.405     ]
Sum of residual : 20907.720000005203
h(x) = [-0.08196429 -0.405       0.01443911 ... -0.405      -0.405
 -0.405     ]
Fm(x) = [0.52742262 0.6355     0.51124925 ... 0.6355     0.6355     0.6355    ]
Time : 547.7412214279175 s
accuracy : 0.76948538596052

KeyboardInterrupt: 

In [12]:
# Balance the two classes by down-sampling the label-1 rows to the number of
# label-0 rows, then split the target column off into `y`.
# NOTE(review): assumes label 1 has at least as many rows as label 0;
# sampling without replacement fails otherwise - confirm against the data.
BALANCE_SEED = 42  # fixed seed so the down-sampled subset is identical on every run

df_clean = pd.read_csv("dataset/clean.csv")
df_clean_label_1 = df_clean.loc[df_clean['label'] == 1]
df_clean_label_0 = df_clean.loc[df_clean['label'] == 0]
df_clean_label_1_sample = df_clean_label_1.sample(len(df_clean_label_0), random_state=BALANCE_SEED)
df_equal = pd.concat([df_clean_label_1_sample, df_clean_label_0])
y = df_equal.pop("label")
# Show a preview only instead of rendering the whole frame.
df_equal.head(10)

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,race_Other,gender_Female,gender_Male,acetohexamide_Steady,tolbutamide_Steady,troglitazone_Steady,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,change_Ch
61797,7,9,40.000000,0.00000,11,0,2,0,428.00,427.00,...,0,0,1,0,0,0,0,0,0,1
541,4,2,79.000000,0.00000,12,0,0,0,428.00,682.00,...,0,0,1,0,0,0,0,0,0,1
62694,6,3,48.000000,2.00000,9,1,0,0,250.80,731.00,...,0,0,1,0,0,0,0,0,0,1
3393,6,4,68.000000,1.00000,4,0,0,0,428.00,511.00,...,0,0,1,0,0,0,0,0,0,1
39445,2,8,48.000000,2.00000,8,0,0,3,250.11,276.00,...,0,1,0,0,0,0,0,0,0,1
65792,8,3,64.000000,0.00000,27,0,0,2,599.00,41.00,...,0,1,0,0,0,0,0,0,0,0
45444,5,7,56.000000,0.00000,10,1,0,0,276.00,250.42,...,0,0,1,0,0,0,0,0,0,1
40034,8,2,38.000000,0.00000,10,0,0,0,198.00,162.00,...,0,0,1,0,0,0,0,0,0,1
5616,8,4,71.000000,0.00000,26,0,0,0,427.00,428.00,...,0,0,1,0,0,0,0,0,0,0
77141,5,3,52.000000,0.00000,14,0,0,3,276.00,585.00,...,0,1,0,0,0,0,0,0,0,0


In [14]:
# 10 boosting rounds of depth-2 trees on the class-balanced data.
model = GradientBoosting(
    iteration=10,
    learning_rate=0.1,
    max_depth_tree=2,
    min_size=500,
)
model.fit(df_equal, y)

Iteration  1
residual :[-0.5 -0.5 -0.5 ...  0.5  0.5  0.5]
Sum of residual : -0.0
h(x) = [-0.5       -0.5       -0.5       ...  0.2153672  0.2153672  0.2153672]
Fm(x) = [0.55       0.55       0.55       ... 0.47846328 0.47846328 0.47846328]
Time : 123.79826378822327 s
accuracy : 0.7995410367170627

Iteration  2
residual :[-0.45       -0.45       -0.45       ...  0.47846328  0.47846328
  0.47846328]
Sum of residual : -4.832042543867487e-10
h(x) = [-0.45       -0.45       -0.45       ...  0.17718933  0.17718933
  0.21307201]
Fm(x) = [0.595      0.595      0.595      ... 0.46074435 0.46074435 0.45715608]
Time : 124.62408804893494 s
accuracy : 0.7994060475161987

Iteration  3
residual :[-0.405      -0.405      -0.405      ...  0.46074435  0.46074435
  0.45715608]
Sum of residual : 7.861861717195495e-10
h(x) = [-0.405      -0.405      -0.405      ...  0.17459312  0.17459312
  0.17459312]
Fm(x) = [0.6355     0.6355     0.6355     ... 0.44328504 0.44328504 0.43969677]
Time : 123.3280384540557