In [179]:
import numpy as np
import pandas as pd

import time

In [180]:
class DecisionTree:
    """Binary classification tree grown greedily with the Gini index.

    The fitted tree is stored as nested dicts. An internal node has the keys
    ``'index'`` (feature column position), ``'value'`` (threshold), ``'left'``
    and ``'right'``; a child is either another node dict or a class label
    (terminal). Rows with ``row[index] < value`` go left, all others go right.

    Parameters
    ----------
    max_depth : int
        Depth at which both children of a node are forced to be terminals.
    min_size : number or None
        A child group with at most this many rows becomes a terminal. When
        ``None``, ``fit`` sets it to one tenth of the training row count.
    verbose : bool
        When true (the default), print the column index being scanned while
        searching for a split.
    """

    def __init__(self, max_depth=2, min_size=None, verbose=True):
        self.max_depth = max_depth
        self.min_size = min_size
        self.verbose = verbose
        self.root = None

    def fit(self, dataset, label):
        """Grow the tree from a feature DataFrame and its labels.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Feature columns; it is copied, not modified.
        label : sequence
            One class label per row of ``dataset``.

        Raises
        ------
        ValueError
            If there are no rows or no feature columns to split on.
        """
        new_dataset = dataset.copy()
        new_dataset['label'] = label

        # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
        self.dataset = new_dataset.values
        self.label = list(set(label))

        if len(self.dataset) == 0 or self.dataset.shape[1] < 2:
            raise ValueError("DecisionTree.fit needs at least one row and one feature column")

        # The derived default is stored on the instance, so a later fit() on
        # the same object reuses it instead of recomputing it.
        if self.min_size is None:
            self.min_size = len(self.dataset)/10

        self.root = self._split_tree(self.dataset)
        self._split(self.root, 1)

    def predict(self, dataset):
        """Return a list with one predicted label per row of ``dataset``.

        ``dataset`` may be a DataFrame or any 2-D array-like whose columns are
        in the same order as the features passed to ``fit``.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            # Raising a plain string is itself a TypeError in Python 3.
            raise RuntimeError("Decison Tree belum di fit")

        rows = np.asarray(dataset)

        return [self._predict(self.root, row) for row in rows]

    def _predict(self, node, row):
        """Walk from ``node`` down to a terminal and return its label."""
        if row[node['index']] < node['value']:
            if isinstance(node['left'], dict):
                return self._predict(node['left'], row)
            else:
                return node['left']
        else:
            if isinstance(node['right'], dict):
                return self._predict(node['right'], row)
            else:
                return node['right']

    def evaluate(self, test_data):
        """Placeholder: not implemented, currently returns ``None``."""
        pass

    def _calculate_gini_index(self, groups):
        """Return the size-weighted Gini impurity of ``groups``.

        Each group is a list of rows whose last element is the class label.
        Empty groups contribute nothing.
        """
        instances = sum(len(group) for group in groups)
        gini = 0.0
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            # Extract the labels once per group rather than once per class.
            outcomes = [row[-1] for row in group]
            score = 0.0
            for class_val in self.label:
                p = outcomes.count(class_val) / size
                score += p * p
            gini += (1.0 - score) * (size / instances)
        return gini

    def _split_tree(self, dataset):
        """Find the (column, threshold) pair with the lowest Gini index.

        Returns a node dict with ``'index'``, ``'value'`` and ``'groups'``
        (the ``(left, right)`` row lists produced by that split).
        """
        b_index, b_value, b_score, b_groups = None, None, float('inf'), None
        for index in range(len(dataset[0])-1):
            if self.verbose:
                print("Index : " + str(index))

            col_data = [row[index] for row in dataset]
            # Distinct values in first-appearance order. Repeated values give
            # the same split and can never beat the first one under the strict
            # `<` below, so skipping them leaves the result unchanged.
            distinct = list(dict.fromkeys(col_data))

            if len(distinct) == 2:
                # Binary column: the only useful threshold is the larger value
                # (the smaller one would leave the left group empty).
                candidates = [max(distinct)]
            else:
                candidates = distinct

            for value in candidates:
                groups = self._test_split(index, value, dataset)
                gini = self._calculate_gini_index(groups)
                if gini < b_score:
                    b_index, b_value, b_score, b_groups = index, value, gini, groups
        return {'index':b_index, 'value':b_value, 'groups':b_groups}

    def _test_split(self, index, value, dataset):
        """Partition ``dataset`` into rows below ``value`` and the rest."""
        left, right = list(), list()
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    def _split(self, node, depth):
        """Recursively turn ``node['groups']`` into child nodes or terminals."""
        left, right = node['groups']
        del(node['groups'])
        # check for a no split
        if not left or not right:
            node['left'] = node['right'] = self._to_terminal(left + right)
            return
        # check for max depth
        if depth >= self.max_depth:
            node['left'], node['right'] = self._to_terminal(left), self._to_terminal(right)
            return
        # process left child
        if len(left) <= self.min_size:
            node['left'] = self._to_terminal(left)
        else:
            node['left'] = self._split_tree(left)
            self._split(node['left'], depth+1)
        # process right child
        if len(right) <= self.min_size:
            node['right'] = self._to_terminal(right)
        else:
            node['right'] = self._split_tree(right)
            self._split(node['right'], depth+1)

    def _to_terminal(self, group):
        """Return the most frequent label among the rows of ``group``."""
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)

In [181]:
class GradientBoosting:
    """Boosted ensemble of ``DecisionTree`` models.

    Each iteration fits a new tree to the current targets, then shifts the
    targets by ``learning_rate`` times the value returned by the loss function.
    Prediction is the sign of the learning-rate-weighted sum of the trees'
    outputs.

    Parameters
    ----------
    iteration : int
        Number of trees to fit.
    learning_rate : float
        Weight applied to each target update and to each tree's prediction.
    loss_function : str
        Name of the loss; only ``'mse'`` is recognised.
    max_depth_tree : int
        ``max_depth`` passed to every tree.
    min_size : number or None
        ``min_size`` passed to every tree.
    """

    def loss_fun(self, loss_function):
        """Return the callable ``(y, yi) -> update`` for the named loss.

        Raises
        ------
        ValueError
            If ``loss_function`` is not a recognised name.
        """
        if loss_function == 'mse':
            # NOTE(review): the built-in sum() collapses the element-wise
            # difference to a single scalar, so every target is shifted by the
            # same amount in fit(). A per-sample term may have been intended;
            # confirm before changing.
            return lambda y,yi : 2 * sum(y-yi)
        # Raising a plain string is itself a TypeError in Python 3.
        raise ValueError("Unknown loss function")

    def __init__(self, iteration, learning_rate=0.01, loss_function='mse',
                max_depth_tree=2, min_size=None):
        self.iteration = iteration
        self.learning_rate = learning_rate
        self.loss_function = self.loss_fun(loss_function)
        self.max_depth_tree = max_depth_tree
        self.min_size = min_size
        self.models = []

    def fit(self, dataset, label):
        """Fit ``iteration`` trees in sequence and store them in ``self.models``.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Feature columns, passed unchanged to every tree.
        label : array-like
            One target per row of ``dataset``.
        """
        label = np.asarray(label)

        yi = label

        # Start from a clean ensemble so that refitting does not stack the new
        # trees on top of those from a previous fit() call.
        self.models = []

        for i in range(self.iteration):
            begin = time.time()
            model = DecisionTree(self.max_depth_tree, self.min_size)
            model.fit(dataset, yi)

            y_predict = np.asarray(model.predict(dataset))

            residual = self.loss_function(label, y_predict)

            yi = yi - self.learning_rate * residual

            self.models.append(model)
            print("Iteration " + str(i+1) + ": " + str(time.time() - begin) + " s")

    def predict(self, data):
        """Return a list with the sign of the ensemble score for each row.

        ``data`` is handed as-is to each tree's ``predict``, so it must be
        whatever those trees accept (a DataFrame of the training features).

        Raises
        ------
        RuntimeError
            If no model has been fitted yet.
        """
        if not self.models:
            raise RuntimeError("GradientBoosting has not been fitted")

        # One predict() call per tree for the whole frame, then sum the
        # weighted outputs element-wise.
        total = sum(self.learning_rate * np.asarray(model.predict(data))
                    for model in self.models)
        return list(np.sign(total))

    def _predict(self, row):
        """Return the sign of the ensemble score for one feature row."""
        # The trees predict on frames, so wrap the row in a one-row DataFrame
        # and take the single prediction back out.
        frame = pd.DataFrame([list(row)])
        return np.sign(sum(self.learning_rate * model.predict(frame)[0]
                           for model in self.models))

In [182]:
# Load the cleaned dataset and keep only its first 10,000 rows.
df_clean = pd.read_csv("dataset/clean.csv")[:10000]
# Take the target column out of the feature frame, recoding 0 -> -1 and 1 -> 1.
y = df_clean.pop("label").map({0: -1, 1: 1})
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks
# like a saved row index that is passed to the model as a feature; confirm
# whether it should be dropped.
df_clean

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,race_Other,gender_Female,gender_Male,acetohexamide_Steady,tolbutamide_Steady,troglitazone_Steady,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,change_Ch
0,0,1,41.0,0.0,1,0,0,0,250.83,276.00,...,0,1,0,0,0,0,0,0,0,0
1,1,3,59.0,0.0,18,0,0,0,276.00,250.01,...,0,1,0,0,0,0,0,0,0,1
2,2,2,11.0,1.0,13,2,0,1,648.00,250.00,...,0,1,0,0,0,0,0,0,0,0
3,3,2,44.0,0.0,16,0,0,0,8.00,250.43,...,0,0,1,0,0,0,0,0,0,1
4,4,1,51.0,1.0,8,0,0,0,197.00,157.00,...,0,0,1,0,0,0,0,0,0,1
5,5,3,31.0,0.0,16,0,0,0,414.00,411.00,...,0,0,1,0,0,0,0,0,0,0
6,6,4,70.0,2.0,21,0,0,0,414.00,411.00,...,0,0,1,0,0,0,0,0,0,1
7,7,5,73.0,3.0,12,0,0,0,428.00,492.00,...,0,0,1,0,0,0,0,0,0,0
8,8,9,68.0,2.0,28,0,0,0,398.00,427.00,...,0,1,0,0,0,0,0,0,0,1
9,9,7,33.0,0.0,18,0,0,0,434.00,198.00,...,0,1,0,0,0,0,0,0,0,1


In [None]:
# Fit a single tree with a maximum depth of 5 on the feature frame and targets.
model = DecisionTree(max_depth=5)
model.fit(df_clean, y)

Index : 0
Index : 1
Index : 2
Index : 3
Index : 4
