In [75]:
import torch
import pandas as pd
import numpy as np
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

np.random.seed(42)
torch.random.manual_seed(42)

<torch._C.Generator at 0x7f65404aa490>

# Decision Tree model

In [64]:

class Node():
    def __init__(self, feature=None, threshold=None, data_left=None, data_right=None, gain=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.data_left = data_left
        self.data_right = data_right
        self.gain = gain
        self.value = value


class DecisionTree():
    def __init__(self, min_sample=2, max_depth=5, impurity_function=None):
        self.min_sample = min_sample
        self.max_depth = max_depth
        self.impurity_function = impurity_function
        self.root = None

    def _entropy(self, y):
        _, counts = torch.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return -(probs * torch.log2(probs)).sum()

    def _information_gain(self, y, y_left, y_right):
        p = y_left.shape[0] / y.shape[0]
        return self._entropy(y) - p * self._entropy(y_left) - (1 - p) * self._entropy(y_right)

    def _gini(self, y):
        _, counts = torch.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return 1 - (probs ** 2).sum()

    def _gini_gain(self, y, y_left, y_right):
        p = y_left.shape[0] / y.shape[0]
        return self._gini(y) - p * self._gini(y_left) - (1 - p) * self._gini(y_right)

    def _calc_gain(self, y, y_left, y_right):
        if self.impurity_function == "gini":
            return self._gini_gain(y, y_left, y_right)
        elif self.impurity_function == "entropy":
            return self._information_gain(y, y_left, y_right)

    def _best_split(self, X, y):
        best_gain = -1
        best_split = {}

        for feature in range(X.shape[1]):
            X_crr = X[:, feature]

            for threshold in torch.unique(X_crr):
                df = torch.concat((X, y.reshape(1, -1).T), dim=1)
                df_left = df[df[:, feature] <= threshold]
                df_right = df[df[:, feature] > threshold]
                if len(df_left) > 0 and len(df_right) > 0:
                    y = df[:, -1]
                    gain = self._calc_gain(y, df_left[:, -1], df_right[:, -1])
                    if gain > best_gain:
                        best_gain = gain
                        best_split = {"feature": feature,
                                      "threshold": threshold,
                                      "data_left": df_left,
                                      "data_right": df_right,
                                      "gain": gain}
        return best_split

    def _build(self, X, y, depth=0):
        if X.shape[0] >= self.min_sample and depth <= self.max_depth:
            best = self._best_split(X, y)
            try:
                if best['gain'] > 0:
                    left = self._build(
                        best["data_left"][:, :-1], best["data_left"][:, -1], depth + 1)
                    right = self._build(
                        best["data_right"][:, :-1], best["data_right"][:, -1], depth + 1)
                    return Node(feature=best["feature"],
                                threshold=best["threshold"],
                                data_left=left,
                                data_right=right,
                                gain=best["gain"])
            except:
                pass
                #print("Can't split data at some branch, please check your min sample and max depth to have a better result")
        return Node(
            value=Counter(y).most_common(1)[0][0]
        )

    def fit(self, X, y):
        X = torch.tensor(X)
        y = torch.tensor(y)
        self.root = self._build(X, y)

    def _predict(self, X, tree):
        # return leaf value if we are at a leaf node
        if tree.value is not None:
            return tree.value
        # traverse the tree
        feature = tree.feature
        threshold = tree.threshold
        if X[feature] < threshold:
            # go left
            return self._predict(X, tree.data_left)
        # go right
        return self._predict(X, tree.data_right)

    def predict(self, X):
        return torch.tensor([self._predict(x, self.root) for x in X])

    def print_tree(self, current_node, list_feature, nameattr='feature', left_child='data_left', right_child='data_right', indent='', last='updown'):

        if hasattr(current_node, str(nameattr)):
            def name(node): return list_feature[getattr(node, nameattr)] + ", " + str(round(node.threshold.item(), 2)) if getattr(node, nameattr) is not None else getattr(node, "value")
        else:
            def name(node): return str(node)

        up = getattr(current_node, left_child)
        down = getattr(current_node, right_child)

        if up is not None:
            next_last = 'up'
            next_indent = '{0}{1}{2}'.format(
                indent, ' ' if 'up' in last else '|', ' ' * len(str(name(current_node))))
            self.print_tree(up, list_feature, nameattr, left_child,
                    right_child, next_indent, next_last)

        if last == 'up':
            start_shape = '┌'
        elif last == 'down':
            start_shape = '└'
        elif last == 'updown':
            start_shape = ' '
        else:
            start_shape = '├'

        if up is not None and down is not None:
            end_shape = '┤'
        elif up:
            end_shape = '┘'
        elif down:
            end_shape = '┐'
        else:
            end_shape = ''

        print('{0}{1}{2}{3}'.format(
            indent, start_shape, name(current_node), end_shape))

        if down is not None:
            next_last = 'down'
            next_indent = '{0}{1}{2}'.format(
                indent, ' ' if 'down' in last else '|', ' ' * len(str(name(current_node))))
            self.print_tree(down, list_feature, nameattr, left_child,
                    right_child, next_indent, next_last)


# Random Forest Model

In [76]:
class RandomForest():
    def __init__(self, n_trees=10, min_sample=2, max_depth=5, impurity_function="entropy"):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_sample = min_sample
        self.impurity_function = impurity_function
        # Store all decisions trees
        self.forest = []
    def _sample(self, X, y):
        '''
        Helper function used for boostrap sampling.
        
        :param X: np.array, features
        :param y: np.array, target
        :return: tuple (sample of features, sample of target)
        '''
        n_samples = X.shape[0]
        idxs = torch.randint(0, n_samples, size=(n_samples,))
        return X[idxs], y[idxs]

    def fit(self, X, y):
        '''
        Trains a Random Forest classifier.
        
        :param X: np.array, features
        :param y: np.array, target
        :return: None
        '''
        
        # reset forest
        if len(self.forest) > 0:
            self.forest = []

        for _ in range(self.n_trees):
            try:
                X_sample, y_sample = self._sample(X, y)
                tree = DecisionTree(min_sample=self.min_sample, max_depth=self.max_depth, impurity_function=self.impurity_function)
                tree.fit(X_sample, y_sample)
                self.forest.append(tree)
            except Exception as e:
                continue
    
    def predict(self, X):
        '''
        Predicts the target for a given set of features.
        
        :param X: np.array, features
        :return: np.array, predicted target
        '''
        y_pred = []
        for tree in self.forest:
            y_pred.append(tree.predict(X))
        y_pred = torch.stack(y_pred)
        return torch.mode(y_pred, dim=0)[0]

# Titanic data

In [77]:
path_csv = "data/titanic_modified_dataset.csv"
titanic_data_df = pd.read_csv(path_csv, index_col="PassengerId")
titanic_data_df

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,0,22.0,1,0,7.2500,0,0,0
2,1,1,38.0,1,0,71.2833,1,1,1
3,3,1,26.0,0,0,7.9250,0,2,1
4,1,1,35.0,1,0,53.1000,0,1,1
5,3,0,35.0,0,0,8.0500,0,0,0
...,...,...,...,...,...,...,...,...,...
887,2,0,27.0,0,0,13.0000,0,5,0
888,1,1,19.0,0,0,30.0000,0,2,1
889,3,1,28.0,1,2,23.4500,0,2,0
890,1,0,26.0,0,0,30.0000,1,0,1


In [78]:
# convert to numpy array
titanic_data_arr = titanic_data_df.to_numpy().astype(np.float32)

# shuffle data
idx = np.arange(titanic_data_arr.shape[0])
np.random.shuffle(idx)
titanic_data_arr = titanic_data_arr[idx]

In [79]:
# devide features to X, label to y 
X, y = titanic_data_arr[:, :-1], titanic_data_arr[:, -1]
print(X.shape)
print(y.shape)

(891, 8)
(891,)


In [80]:
TRAIN_SIZE = 0.7
VAL_SIZE = 0.2
#TEST_SIZE = 0.1
TRAIN_SAMPLES = int(TRAIN_SIZE * titanic_data_arr.shape[0])
VAL_SAMPLES = TRAIN_SAMPLES + int(VAL_SIZE * titanic_data_arr.shape[0])

X_train, y_train = X[:TRAIN_SAMPLES], y[:TRAIN_SAMPLES]
x_val, y_val = X[TRAIN_SAMPLES:VAL_SAMPLES], y[TRAIN_SAMPLES:VAL_SAMPLES]
X_test, y_test = X[VAL_SAMPLES:], y[VAL_SAMPLES:]

print(X_train.shape, x_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

(623, 8) (178, 8) (90, 8)
(623,) (178,) (90,)


## Impurity function - Entropy

In [96]:
RandomForest = RandomForest(n_trees=25, min_sample=2, max_depth=10, impurity_function="entropy")
RandomForest.fit(X_train, y_train)
y_pred = RandomForest.predict(X_test)
print(y_pred)
print("Accuracy: ", accuracy_score(y_test, y_pred))

tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
        0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0.,
        0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1.])
Accuracy:  0.7222222222222222


In [97]:
RandomForest.forest[0].print_tree(RandomForest.forest[0].root, list_feature=titanic_data_df.columns[:-1])

                                                                                                         ┌0.0
                                                                                             ┌Pclass, 2.0┤
                                                                                             |           |         ┌0.0
                                                                                             |           └Age, 18.0┤
                                                                                             |                     └1.0
                                                                                  ┌Fare, 6.97┤
                                                                                  |          |                               ┌0.0
                                                                                  |          |                    ┌Fare, 7.12┤
                                                                          

## Impurity function - Gini

In [103]:
RandomForest = RandomForest(n_trees=25, min_sample=2, max_depth=10, impurity_function="gini")
RandomForest.fit(X_train, y_train)
y_pred = RandomForest.predict(X_test)
print(y_pred)
print("Accuracy: ", accuracy_score(y_test, y_pred))

tensor([0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1., 0.,
        0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0.,
        0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1.])
Accuracy:  0.7111111111111111


In [102]:
RandomForest.forest[19].print_tree(RandomForest.forest[0].root, list_feature=titanic_data_df.columns[:-1])

                                                                ┌0.0
                                                      ┌Age, 19.0┤
                                                      |         └1.0
                                            ┌Age, 27.0┤
                                            |         └0.0
                                 ┌Fare, 6.97┤
                                 |          |         ┌0.0
                                 |          └Age, 27.0┤
                                 |                    |                                           ┌0.0
                                 |                    |                                ┌Fare, 7.22┤
                                 |                    |                                |          |         ┌1.0
                                 |                    |                                |          └Age, 28.0┤
                                 |                    |                                |      