In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

from DecisionTreeFunctions import *

In [3]:
# Load the diabetes dataset and place the feature matrix and the target vector
# side by side in one DataFrame; the target ends up as the last column.
diabetes = load_diabetes()
df = pd.DataFrame(
    np.column_stack((diabetes["data"], diabetes["target"])),
    columns=[*diabetes["feature_names"], "target"],
)

In [4]:
# Features: every column except "target" (the last column of df), cast to float32.
X = df.drop(columns="target").to_numpy(dtype="float32")
# Target: the "target" column as an independent flat 1-D array.
y = df["target"].to_numpy().flatten()

In [5]:
# Hold out 20% of the rows for testing. The split is seeded so that
# "Restart Kernel -> Run All" reproduces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Model Training: Random Forest

In [8]:
# Three-tree forest with bootstrapping disabled. random_state pins the
# estimator's remaining randomness so that re-running the notebook rebuilds
# the same trees.
rf = RandomForestRegressor(n_estimators=3, bootstrap=False, random_state=42)

In [9]:
# Fit the forest on the training split.
rf.fit(X_train, y_train)

In [10]:
# Keep a handle on the individual fitted estimators of the forest; the next
# cell iterates over them one by one.
rf_estimators = rf.estimators_

In [11]:
# Per-tree results of calc_obs_distribution, stored in the order of rf_estimators.
leaf_nodes_trees = []
y_values_leaves_trees = []
X_values_leaves_trees = []
X_ids_leaves_trees = []
# 3-dimensional: 1. tree, 2. leaf within the tree, 3. length of y_train
weights_leaves_trees = []

# The loop variable is called `estimator` rather than `tree` so that it does not
# overwrite the `tree` module imported from sklearn at the top of the notebook.
for estimator in rf_estimators:
    (
        leaf_nodes,
        y_values_leaves,
        X_values_leaves,
        X_ids_leaves,
        weights_leaves,
    ) = calc_obs_distribution(estimator, X_train, y_train)
    leaf_nodes_trees.append(leaf_nodes)
    y_values_leaves_trees.append(y_values_leaves)
    X_values_leaves_trees.append(X_values_leaves)
    X_ids_leaves_trees.append(X_ids_leaves)
    weights_leaves_trees.append(weights_leaves)

In [12]:
# Compute the per-test-sample weights and the predictions derived from them.
# NOTE(review): calc_weights_rf is only defined further down in this notebook;
# on a fresh kernel this call presumably resolves through the star import from
# DecisionTreeFunctions -- confirm that both definitions agree.
weights, mean_preds = calc_weights_rf(
    rf, X_test, y_train, leaf_nodes_trees, weights_leaves_trees
)

In [13]:
# Reference predictions from the forest's own predict(); compared against
# mean_preds in the next cell.
true_preds = rf.predict(X_test)

In [14]:
# Element-wise check that the weight-based predictions match rf.predict once
# both sides are rounded to 7 decimal places.
np.round(mean_preds, decimals=7) == np.round(true_preds, decimals=7)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [15]:
def calc_weights_rf(rf, X_test, y_train, leaf_nodes_trees, weights_leaves_trees):
    """
    Method to calculate the mean prediction and weights of a random forest

    For every test sample, the weight vectors of the leaves it falls into are
    summed over all trees and divided by the number of trees. The prediction is
    the dot product of that averaged weight vector with y_train.

    Input:
        param rf: Fitted forest exposing ``estimators_``; every estimator must
                  provide ``apply(X)`` returning one leaf id per row of X
        param X_test: OOS test data (must support ``len()``)
        param y_train: 1-D array-like with the targets used to train the RF
        param leaf_nodes_trees: 2-Dimensional: 1. number_trees,
                                2. 1-D sequence with the leaf ids of that tree
        param weights_leaves_trees: 3-Dimensional: 1. number_trees, 2. number_leaf_nodes
                                    (same order as leaf_nodes_trees),
                                    3. array with the weights of that leaf,
                                    one entry per element of y_train

    Output:
        weights_all: list of length X_test with weights used to calculate mean prediction
        mean_preds: List of mean predictions

    Raises:
        IndexError: if a test sample reaches a leaf id that is not listed in
                    leaf_nodes_trees for the corresponding tree
    """
    # Accept any 1-D array-like (a plain list has no .shape).
    y_train = np.asarray(y_train)
    estimators = rf.estimators_
    n_trees = len(estimators)

    # Leaf id reached by every test sample in every tree; dim: num_trees x len_X_test
    X_test_id_leaves = [estimator.apply(X_test) for estimator in estimators]

    # Build each tree's "leaf id -> position" lookup once instead of scanning the
    # leaf-id array again for every (sample, tree) pair. setdefault keeps the
    # first position if an id is listed more than once.
    leaf_positions_trees = []
    for leaf_ids in leaf_nodes_trees[:n_trees]:
        positions = {}
        for position, leaf_id in enumerate(np.asarray(leaf_ids).tolist()):
            positions.setdefault(leaf_id, position)
        leaf_positions_trees.append(positions)

    weights_all = []
    mean_preds = []
    for i in range(len(X_test)):
        weight_sum = np.zeros(y_train.shape)
        for j in range(n_trees):
            leaf_id = X_test_id_leaves[j][i]
            try:
                position = leaf_positions_trees[j][leaf_id]
            except KeyError:
                # Same exception type as before, but with an actionable message.
                raise IndexError(
                    f"Tree {j}: leaf id {leaf_id} reached by test sample {i} "
                    f"is not listed in leaf_nodes_trees[{j}]"
                ) from None
            weight_sum = weight_sum + weights_leaves_trees[j][position]
        # Average the summed leaf weights over the trees of the forest.
        weight = weight_sum / n_trees
        weights_all.append(weight)
        mean_preds.append(np.dot(weight, y_train))

    return weights_all, mean_preds

In [73]:
# Three-tree forest with bootstrapping enabled; this rebinds `rf`, replacing the
# non-bootstrapped forest used above. random_state makes the bootstrap draws
# (and therefore the sampled indices inspected below) reproducible across re-runs.
rf = RandomForestRegressor(n_estimators=3, bootstrap=True, random_state=42)

In [74]:
# Fit the bootstrapped forest on the training split.
rf.fit(X_train, y_train)

In [75]:
import sklearn.ensemble._forest as forest_utils

In [76]:
# Rebuild, for every fitted tree, the training-row indices that were drawn for
# it and the ones that were left out, by replaying the tree's random_state.
# NOTE: the forest_utils helpers used here are private scikit-learn functions
# and may change between releases.
n_samples = len(y_train)  # number of training samples
n_samples_bootstrap = forest_utils._get_n_samples_bootstrap(n_samples, rf.max_samples)

unsampled_indices_trees, sampled_indices_trees = [], []
for estimator in rf.estimators_:
    # Rows this tree did not draw.
    unsampled_indices = forest_utils._generate_unsampled_indices(
        estimator.random_state, n_samples, n_samples_bootstrap
    )
    # Rows this tree drew (repeats possible).
    sampled_indices = forest_utils._generate_sample_indices(
        estimator.random_state, n_samples, n_samples_bootstrap
    )
    unsampled_indices_trees.append(unsampled_indices)
    sampled_indices_trees.append(sampled_indices)

In [79]:
# Sorted indices drawn for the first tree; a value that appears several times is
# a training row that was drawn more than once.
# NOTE(review): the execution counts of this and the following cells are out of
# order, so the stored outputs may come from different runs.
np.sort(sampled_indices_trees[0])

array([  0,   3,   3,   4,   5,   6,   7,   7,   8,   9,   9,  10,  11,
        11,  12,  13,  13,  14,  15,  16,  17,  17,  19,  22,  23,  26,
        27,  27,  28,  28,  32,  33,  33,  33,  34,  34,  35,  36,  36,
        36,  39,  40,  40,  41,  41,  43,  43,  43,  43,  43,  44,  44,
        45,  45,  47,  47,  49,  53,  53,  54,  55,  56,  57,  57,  58,
        60,  60,  61,  62,  63,  63,  63,  66,  68,  69,  70,  71,  71,
        71,  74,  74,  77,  77,  78,  78,  79,  79,  79,  80,  80,  82,
        84,  85,  85,  86,  87,  88,  89,  94,  95,  95,  98, 100, 100,
       102, 103, 104, 105, 106, 107, 107, 108, 109, 110, 110, 111, 112,
       113, 113, 113, 114, 114, 116, 117, 120, 120, 122, 123, 124, 124,
       126, 127, 128, 129, 131, 131, 133, 134, 135, 135, 135, 138, 138,
       139, 139, 141, 142, 143, 144, 145, 146, 148, 148, 148, 149, 151,
       151, 153, 154, 154, 154, 156, 160, 160, 161, 165, 165, 166, 167,
       168, 169, 170, 170, 171, 172, 173, 173, 174, 174, 176, 17

In [78]:
# Training feature rows selected by the first tree's sampled indices (a row is
# repeated whenever its index was drawn more than once).
X_train[sampled_indices_trees[0]]

array([[ 0.01628068, -0.04464164, -0.046085  , ..., -0.03949338,
        -0.05140387,  0.01963284],
       [ 0.00538306,  0.05068012,  0.0347509 , ...,  0.18523444,
         0.01556846,  0.07348023],
       [ 0.03081083, -0.04464164,  0.00564998, ...,  0.1081111 ,
         0.06605066,  0.01963284],
       ...,
       [-0.04183994, -0.04464164,  0.04768465, ...,  0.1081111 ,
         0.06389027,  0.04034337],
       [-0.09632801, -0.04464164, -0.08380842, ..., -0.07639451,
        -0.06291688, -0.03421455],
       [-0.02730979,  0.05068012,  0.06061839, ..., -0.00259226,
         0.07020738,  0.13561183]], dtype=float32)

In [57]:
# Distinct training-row indices that occur in the first tree's sample.
# NOTE(review): the stored output below does not match the sorted listing shown
# earlier (different index values), so it looks stale -- re-run to refresh.
np.unique(sampled_indices_trees[0])

array([  0,   2,   3,   4,   5,   6,   7,   9,  10,  11,  13,  14,  16,
        20,  21,  22,  23,  25,  27,  29,  30,  31,  32,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  45,  46,  48,  52,  53,  54,  55,
        57,  58,  60,  61,  62,  64,  65,  66,  67,  68,  69,  70,  71,
        73,  74,  76,  78,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        92,  95, 102, 103, 105, 107, 109, 111, 112, 113, 114, 116, 117,
       119, 121, 122, 123, 126, 128, 129, 130, 131, 132, 133, 136, 138,
       139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 153,
       154, 155, 156, 158, 160, 161, 162, 165, 167, 168, 170, 171, 172,
       173, 174, 175, 176, 178, 179, 180, 181, 186, 188, 189, 192, 193,
       194, 195, 196, 197, 198, 199, 201, 202, 204, 205, 206, 207, 208,
       209, 210, 211, 212, 213, 214, 215, 216, 217, 220, 222, 223, 225,
       226, 231, 235, 236, 239, 240, 241, 243, 244, 245, 246, 247, 248,
       249, 251, 252, 255, 256, 259, 261, 262, 263, 264, 267, 26

In [58]:
# Number of distinct training rows in the first tree's sample.
np.unique(sampled_indices_trees[0]).shape

(232,)