In [2]:
import torch
import pandas as pd
from sklearn import datasets

In [3]:

class Node():
    def __init__(self, feature=None, threshold=None, data_left=None, data_right=None, gain=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.data_left = data_left
        self.data_right = data_right
        self.gain = gain
        self.value = value


class XGBoostTree():
    def __init__(self, lambda_ = 1, gamma = 0 , min_sample=2, max_depth=5, impurity_function=None):
        self.min_sample = min_sample
        self.max_depth = max_depth
        self.impurity_function = impurity_function
        self.lambda_ = lambda_
        self.gamma = gamma
        self.root = None
    
    def sigmoid(self, x):
        return 1 / (1 + torch.exp(-x))

    def _compute_negative_gradient(self, y, p):
        return p - y
    
    def _compute_hessian(self, p):
        return p * (1 - p)
    
    def gain(self, sum_gradient, sum_hessian):
        return 0.5 * (sum_gradient ** 2 / (sum_hessian + self.lambda_))
    
    def _compute_gain(self, y, y_base_pred_left, y_base_pred_right, y_left, y_right):
        p_left = self.sigmoid(y_base_pred_left)
        sum_gradient_left = self._compute_negative_gradient(y_left, p_left).sum()
        sum_hessian_left = self._compute_hessian(p_left).sum()

        p_right = self.sigmoid(y_base_pred_right)
        sum_gradient_right = self._compute_negative_gradient(y_right, p_right).sum()
        sum_hessian_right = self._compute_hessian(p_right).sum()

        gain = (self.gain(sum_gradient_left, sum_hessian_left) + self.gain(sum_gradient_right, sum_hessian_right) 
                - self.gain((sum_gradient_left + sum_gradient_right), (sum_hessian_left + sum_hessian_right)) ) + self.gamma
        return gain



    def _best_split(self, X, y, y_base_pred):
        best_gain = -1
        best_split = {}

        for feature in range(X.shape[1]):
            X_crr = X[:, feature]

            for threshold in torch.unique(X_crr):
                df = torch.concat((X, y.reshape(1, -1).T, y_base_pred.reshape(1, -1).T), dim=1)
                df_left = df[df[:, feature] <= threshold]
                df_right = df[df[:, feature] > threshold]
                if len(df_left) > 0 and len(df_right) > 0:
                    y = df[:, -2]
                    gain = self._compute_gain(y, df_left[:, -1], df_right[:, -1], df_left[:, -2], df_right[:, -2])
                    if gain > best_gain:
                        best_gain = gain
                        best_split = {"feature": feature,
                                      "threshold": threshold,
                                      "data_left": df_left,
                                      "data_right": df_right,
                                      "gain": gain}
        return best_split

    def _build(self, X, y, y_base_pred, depth=0):
        if X.shape[0] >= self.min_sample and depth <= self.max_depth:
            best = self._best_split(X, y, y_base_pred)
            try:
                if best['gain'] > 0:
                    left = self._build(
                        best["data_left"][:, :-2], best["data_left"][:, -2], best['data_left'][:, -1], depth + 1)
                    right = self._build(
                        best["data_right"][:, :-2], best["data_right"][:, -2], best['data_right'][:, -1], depth + 1)
                    return Node(feature=best["feature"],
                                threshold=best["threshold"],
                                data_left=left,
                                data_right=right,
                                gain=best["gain"])
            except:
                pass
        # compute leaf value
        grad = self._compute_negative_gradient(y, self.sigmoid(y_base_pred)).sum()
        hess = self._compute_hessian(self.sigmoid(y_base_pred)).sum()
        leaf_value = - grad / (hess + self.lambda_)
        return Node(

            value=leaf_value
        )

    def fit(self, X, y, y_base_pred):
        X = torch.tensor(X)
        y = torch.tensor(y)
        self.root = self._build(X, y, y_base_pred)

    def _predict(self, X, tree):
        # return leaf value if we are at a leaf node
        if tree.value is not None:
            return tree.value
        # traverse the tree
        feature = tree.feature
        threshold = tree.threshold
        if X[feature] < threshold:
            # go left
            return self._predict(X, tree.data_left)
        # go right
        return self._predict(X, tree.data_right)

    def predict(self, X):
        return torch.tensor([self._predict(x, self.root) for x in X])

    def print_tree(self, current_node, list_feature, nameattr='feature', left_child='data_left', right_child='data_right', indent='', last='updown'):

        if hasattr(current_node, str(nameattr)):
            def name(node): return list_feature[getattr(node, nameattr)] + ", " + str(round(node.threshold.item(), 2)) if getattr(node, nameattr) is not None else getattr(node, "value")
        else:
            def name(node): return str(node)

        up = getattr(current_node, left_child)
        down = getattr(current_node, right_child)

        if up is not None:
            next_last = 'up'
            next_indent = '{0}{1}{2}'.format(
                indent, ' ' if 'up' in last else '|', ' ' * len(str(name(current_node))))
            self.print_tree(up, list_feature, nameattr, left_child,
                    right_child, next_indent, next_last)

        if last == 'up':
            start_shape = '┌'
        elif last == 'down':
            start_shape = '└'
        elif last == 'updown':
            start_shape = ' '
        else:
            start_shape = '├'

        if up is not None and down is not None:
            end_shape = '┤'
        elif up:
            end_shape = '┘'
        elif down:
            end_shape = '┐'
        else:
            end_shape = ''

        print('{0}{1}{2}{3}'.format(
            indent, start_shape, name(current_node), end_shape))

        if down is not None:
            next_last = 'down'
            next_indent = '{0}{1}{2}'.format(
                indent, ' ' if 'down' in last else '|', ' ' * len(str(name(current_node))))
            self.print_tree(down, list_feature, nameattr, left_child,
                    right_child, next_indent, next_last)


In [4]:
class XGBoost():
    def __init__(self):
        self.trees = []
    
    def fit(self, X, y, learning_rate, n_estimators=100, max_depth=5, min_sample=2, gamma=0, lambda_=1):
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_sample = min_sample
        self.gamma = gamma
        self.lambda_ = lambda_

        y_base_pred = torch.ones(y.shape[0]) 
        for i in range(self.n_estimators):
            booster = XGBoostTree(max_depth=self.max_depth, min_sample=self.min_sample, gamma=self.gamma, lambda_=self.lambda_)
            booster.fit(X, y, y_base_pred)
            self.trees.append(booster)
            y_base_pred = y_base_pred + learning_rate * booster.predict(X)
    
    def predict_prob(self, X):
        pred = torch.zeros(len(X))
        for tree in self.trees:
            pred += self.learning_rate * tree.predict(X)
        return torch.sigmoid(pred)

    def predict(self, X):
        pred = torch.zeros(len(X))
        for tree in self.trees:
            pred += self.learning_rate * tree.predict(X)
        return torch.round(torch.sigmoid(pred))
        

# Titanic Dataset

In [5]:
path_csv = "data/titanic_modified_dataset.csv"
titanic_data_df = pd.read_csv(path_csv, index_col="PassengerId")
titanic_data_df

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,0,22.0,1,0,7.2500,0,0,0
2,1,1,38.0,1,0,71.2833,1,1,1
3,3,1,26.0,0,0,7.9250,0,2,1
4,1,1,35.0,1,0,53.1000,0,1,1
5,3,0,35.0,0,0,8.0500,0,0,0
...,...,...,...,...,...,...,...,...,...
887,2,0,27.0,0,0,13.0000,0,5,0
888,1,1,19.0,0,0,30.0000,0,2,1
889,3,1,28.0,1,2,23.4500,0,2,0
890,1,0,26.0,0,0,30.0000,1,0,1


In [6]:
# convert to tensor array
titanic_data_arr = torch.tensor(titanic_data_df.values, dtype=torch.float32)

# devide features to X, label to y 
X, y = titanic_data_arr[:, :-1], titanic_data_arr[:, -1]
print(X.shape)
print(y.shape)

torch.Size([891, 8])
torch.Size([891])


In [39]:
#shuffle data
idx = torch.randperm(X.shape[0])
X, y = X[idx], y[idx]

# split data to train and test
TRAIN_SIZE = 0.7
VAL_SIZE = 0.2
#TEST_SIZE = 0.1
TRAIN_SAMPLES = int(TRAIN_SIZE * titanic_data_arr.shape[0])
VAL_SAMPLES = TRAIN_SAMPLES + int(VAL_SIZE * titanic_data_arr.shape[0])

X_train, y_train = X[:TRAIN_SAMPLES], y[:TRAIN_SAMPLES]
X_val, y_val = X[TRAIN_SAMPLES:VAL_SAMPLES], y[TRAIN_SAMPLES:VAL_SAMPLES]
X_test, y_test = X[VAL_SAMPLES:], y[VAL_SAMPLES:]

print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

torch.Size([623, 8]) torch.Size([178, 8]) torch.Size([90, 8])
torch.Size([623]) torch.Size([178]) torch.Size([90])


In [70]:
tree = XGBoost()
tree.fit(X_train, y_train, learning_rate=0.001, max_depth=3, n_estimators=10, min_sample=2, gamma=0, lambda_=1)
y_pred = tree.predict(X)
print(sum(y_pred == y) / len(y))

  X = torch.tensor(X)
  y = torch.tensor(y)


tensor(0.6992)


In [62]:
for i in range(2):
    tree.trees[i].print_tree(tree.trees[i].root, list_feature=titanic_data_df.columns[:-1])

                                ┌-3.0792551040649414
                      ┌Age, 74.0┤
                      |         └0.22475241124629974
           ┌Fare, 52.0┤
           |          |                     ┌-0.6360369324684143
           |          |          ┌Fare, 55.9┤
           |          |          |          └0.3714160621166229
           |          └Fare, 57.0┤
           |                     |         ┌0.22475241124629974
           |                     └Age, 17.0┤
           |                               └-2.019266128540039
 Title, 0.0┤
           |                                   ┌1.0936044454574585
           |                      ┌Fare, 146.52┤
           |                      |            └0.36533838510513306
           |           ┌Title, 3.0┤
           |           |          |           ┌-2.0146281719207764
           |           |          └Fare, 27.72┤
           |           |                      └0.15182217955589294
           └Pclass, 2.0┤
              

In [63]:
y_val_pred = tree.predict(X_val)
print(sum(y_val == y_val_pred) / len(y_val_pred)) 

tensor(0.7191)


In [64]:
y_test_pred = tree.predict(X_test)
print(sum(y_test == y_test_pred) / len(y_test_pred))

tensor(0.7111)
