In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

from DecisionTreeFunctions import *

In [3]:
# Load the scikit-learn diabetes dataset and assemble one frame that holds the
# feature columns followed by the regression target as the last column.
diabetes = load_diabetes()
df = pd.DataFrame(
    np.c_[diabetes.data, diabetes.target],
    columns=[*diabetes.feature_names, "target"],
)

In [4]:
# Features: every column except the trailing "target" column, as float32.
X = df.drop(columns="target").to_numpy(dtype="float32")
# Target: the "target" column as an independent 1-D array.
y = df["target"].to_numpy(copy=True)

In [5]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and every result derived from it, is identical on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Model Training: Random Forest

In [6]:
# Small forest of three trees. With bootstrap=False scikit-learn builds every
# tree on the whole training set; random_state is fixed so the fitted trees
# (and the leaf weights derived from them) are reproducible across runs.
rf = RandomForestRegressor(n_estimators=3, bootstrap=False, random_state=42)

In [7]:
# Fit the forest on the training split only; the test split stays unseen.
rf.fit(X_train, y_train)

In [11]:
# Fitted per-tree estimators of the forest; iterated below to inspect each
# tree's leaves individually.
rf_estimators = rf.estimators_

In [12]:
# Per-tree results of calc_obs_distribution, one list entry per tree and in the
# same order as rf_estimators.
leaf_nodes_trees = []
y_values_leaves_trees = []
X_values_leaves_trees = []
X_ids_leaves_trees = []
# Nested as: tree -> leaf of that tree -> one weight per entry of y_train.
weights_leaves_trees = []

# The loop variable is deliberately not called `tree`: that name is bound to
# the `sklearn.tree` module by the imports at the top of the notebook, and
# rebinding it here would silently break any later use of the module.
for estimator in rf_estimators:
    (
        leaf_nodes,
        y_values_leaves,
        X_values_leaves,
        X_ids_leaves,
        weights_leaves,
    ) = calc_obs_distribution(estimator, X_train, y_train)
    leaf_nodes_trees.append(leaf_nodes)
    y_values_leaves_trees.append(y_values_leaves)
    X_values_leaves_trees.append(X_values_leaves)
    X_ids_leaves_trees.append(X_ids_leaves)
    weights_leaves_trees.append(weights_leaves)

In [13]:
# Rebuild the forest's prediction for every test row from the per-leaf training
# weights collected above. `weights` holds one weight vector per test row and
# `mean_preds` the corresponding predictions.
# NOTE(review): presumably resolved through the star import and equivalent to
# the calc_weights_rf shown later in this notebook -- confirm.
weights, mean_preds = calc_weights_rf(
    rf, X_test, y_train, leaf_nodes_trees, weights_leaves_trees
)

In [14]:
# Reference predictions straight from scikit-learn, used to validate mean_preds.
true_preds = rf.predict(X_test)

In [15]:
# Element-wise check that the weight-based predictions reproduce rf.predict.
# A purely absolute tolerance is used instead of round-then-compare, which
# reports a mismatch whenever two almost identical values fall on opposite
# sides of a rounding boundary.
np.isclose(mean_preds, true_preds, rtol=0.0, atol=1e-7)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [None]:
def calc_weights_rf(rf, X_test, y_train, leaf_nodes_trees, weights_leaves_trees):
    """
    Express each forest prediction as a weighted sum of the training targets.

    For every test row, the weight vector of the leaf it falls into is looked
    up in each tree, the vectors are added up and divided by the number of
    trees; the prediction is the dot product of that average with y_train.

    Input:
        param rf: Fitted forest; only ``rf.estimators_`` is read and each
                  estimator must provide ``apply``
        param X_test: Samples to predict (``len(X_test)`` rows)
        param y_train: Training targets the weight vectors refer to
        param leaf_nodes_trees: Per tree, the array of leaf ids
        param weights_leaves_trees: Per tree and per leaf (same order as the
                                    leaf ids), the weight vector over y_train

    Output:
        weights_all: List with one averaged weight vector per row of X_test
        mean_preds: List with one prediction per row of X_test

    Raises:
        IndexError: if a test row lands in a leaf id that is missing from the
                    corresponding entry of leaf_nodes_trees
    """
    # Leaf id reached by every test row, computed once per tree
    # (outer index: tree, inner index: test row).
    leaf_ids_per_tree = [estimator.apply(X_test) for estimator in rf.estimators_]
    n_trees = len(leaf_ids_per_tree)

    weights_all, mean_preds = [], []
    for row in range(len(X_test)):
        accumulated = np.zeros(y_train.shape)
        for tree_pos, leaf_ids in enumerate(leaf_ids_per_tree):
            reached_leaf = leaf_ids[row]
            # Position of the reached leaf within this tree's list of leaves.
            matches = np.where(leaf_nodes_trees[tree_pos] == reached_leaf)[0]
            leaf_pos = matches[0]
            accumulated = accumulated + weights_leaves_trees[tree_pos][leaf_pos]

        # Average over the trees, mirroring how the forest averages its trees.
        averaged = accumulated / n_trees
        weights_all.append(averaged)
        mean_preds.append(np.dot(averaged, y_train))

    return weights_all, mean_preds

In [140]:
import sklearn.ensemble._forest as forest_utils

In [141]:
n_samples = len(y_train)  # number of training samples

# Number of rows per bootstrap draw, obtained from scikit-learn's private helper
# using the forest's max_samples setting.
# NOTE(review): forest_utils is the underscore-prefixed module
# sklearn.ensemble._forest; its helpers are private API and may change between
# releases.
n_samples_bootstrap = forest_utils._get_n_samples_bootstrap(n_samples, rf.max_samples)

# One index array per tree, in the same order as rf.estimators_.
unsampled_indices_trees = []
sampled_indices_trees = []

# NOTE(review): rf is created with bootstrap=False earlier in this notebook, so
# the indices regenerated here presumably do not describe the rows each tree
# was actually fitted on -- confirm before using them for any weighting.
for estimator in rf.estimators_:
    # Row indices reported by the helper as not drawn for this tree, seeded
    # with the tree's own random_state.
    unsampled_indices = forest_utils._generate_unsampled_indices(
        estimator.random_state, n_samples, n_samples_bootstrap
    )
    unsampled_indices_trees.append(unsampled_indices)

    # Row indices reported by the helper as drawn for this tree (same seed).
    sampled_indices = forest_utils._generate_sample_indices(
        estimator.random_state, n_samples, n_samples_bootstrap
    )
    sampled_indices_trees.append(sampled_indices)

In [None]:
def calc_obs_distribution(tree, X_train, y_train, bootstrap=True, max_samples=None):
    """
    Describe how the training observations are spread over the leaves of a tree.

    Input:
        param tree: Fitted tree model; only ``tree.apply`` is called
        param X_train: Training inputs, one row per observation
        param y_train: Training targets aligned with X_train
        param bootstrap: Accepted but not used by this implementation
        param max_samples: Accepted but not used by this implementation
    Output:
        leaf_nodes: Sorted array with the ids of all leaves reached by X_train
        y_values_leaves: Per leaf (same order as leaf_nodes), flat array of the
                         targets that fall into it
        X_values_leaves: Per leaf, the rows of X_train that fall into it
        X_ids_leaves: Per leaf, boolean mask over the training observations
                      marking the members of that leaf
        weights_leaves: Per leaf, float array over the training observations
                        holding 1 / (number of members) for members and 0
                        elsewhere

    """
    # Leaf id reached by every training observation.
    leaf_of_observation = tree.apply(X_train)

    # np.unique returns the distinct leaf ids in ascending order.
    leaf_nodes = np.unique(leaf_of_observation)

    # Membership mask per leaf; all other outputs are derived from these masks.
    X_ids_leaves = [leaf_of_observation == leaf_id for leaf_id in leaf_nodes]

    y_values_leaves = [y_train[members].flatten() for members in X_ids_leaves]
    X_values_leaves = [X_train[members] for members in X_ids_leaves]

    # Every member of a leaf gets the same share, so each leaf's weights sum to 1.
    weights_leaves = []
    for members in X_ids_leaves:
        share = 1 / np.sum(members)
        leaf_weights = members.astype(float)
        leaf_weights[members] = share
        weights_leaves.append(leaf_weights)

    return leaf_nodes, y_values_leaves, X_values_leaves, X_ids_leaves, weights_leaves
