In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

from DecisionTreeFunctions import *

In [3]:
diabetes = load_diabetes()
df = pd.DataFrame(
    data=np.c_[diabetes["data"], diabetes["target"]],
    columns=diabetes["feature_names"] + ["target"],
)

In [4]:
X = df.iloc[:, :-1].to_numpy(dtype="float32")
y = df.iloc[:, -1:].to_numpy()

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### 1. Building tree

In [6]:
tree1 = DecisionTreeRegressor()

In [7]:
tree1.fit(X_train, y_train)

#### 2. Calculate Observation Distribution

In [8]:
(
    leaf_nodes,
    y_values_leaves,
    X_values_leaves,
    X_ids_leaves,
    weights_leaves,
) = calc_obs_distribution(tree1, X_train, y_train)

In [9]:
means, weights = calc_mean_observation_tree(
    tree1, X_test, y_train, leaf_nodes, weights_leaves
)

In [16]:
len(weights_leaves)

345

In [18]:
leaf_nodes

array([  8,  12,  14,  15,  16,  18,  19,  21,  26,  27,  29,  30,  33,
        34,  36,  37,  38,  43,  47,  48,  49,  51,  53,  54,  56,  58,
        59,  61,  62,  64,  65,  70,  71,  72,  78,  79,  80,  81,  84,
        85,  86,  88,  91,  92,  94,  96,  97, 103, 104, 106, 110, 112,
       113, 115, 116, 119, 120, 122, 124, 125, 128, 129, 131, 132, 133,
       135, 137, 138, 143, 147, 148, 149, 151, 152, 154, 157, 158, 160,
       161, 164, 167, 168, 169, 171, 172, 181, 182, 184, 186, 187, 190,
       191, 192, 194, 195, 198, 199, 203, 205, 206, 207, 209, 210, 211,
       215, 216, 220, 221, 225, 226, 228, 229, 231, 235, 236, 237, 238,
       240, 241, 244, 245, 246, 249, 251, 252, 256, 260, 261, 262, 264,
       266, 267, 268, 271, 273, 274, 275, 276, 279, 280, 290, 291, 292,
       296, 297, 299, 301, 302, 306, 308, 309, 310, 311, 313, 315, 317,
       318, 319, 320, 323, 324, 327, 330, 332, 333, 335, 337, 338, 339,
       340, 342, 343, 352, 353, 355, 356, 357, 362, 363, 365, 36

In [17]:
weights_leaves[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [10]:
y_train.shape

(353, 1)

In [11]:
test = tree1.predict(X_test)

In [12]:
test.shape

(89,)

In [13]:
means == test

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [350]:
def calc_empirical_dist_tree(tree, X_test, y_train, leaf_nodes, weights_leaves):
    #  Calculate index of the leaf that each sample is predicted as
    X_test_id_leaves = tree.apply(X_test)
    
    empirical_dist = []
    for i in range(len(X_test_id_leaves)):
        X_id = X_test_id_leaves[i]
        index = np.where(leaf_nodes == X_id)[0][0]  # Calculate index of test
        
        for j in range(len(y_train)):
            dist = 
            

In [351]:
def calc_obs_weight(X_test, tree, leaf_nodes, X_ids_leaves):
    """
    Calculates the weights w_i(x,θ) for each test value
    Input:
        param X_test: Array of test values
        param tree: Fitted DecisionTreeRegressor
        param leaf_nodes: Ordered numerical array with index of leaf nodes
        param X_ids_leaves: List of arrays that determine whether a training input is in a leaf or not

    Output:
        weights: List of arrays with the weights of the training observations
    """

    #  Calculate index of the leaf that each sample is predicted as
    X_test_id_leaves = tree.apply(X_test)

    weights = []
    for i in range(len(X_test_id_leaves)):
        X_id = X_test_id_leaves[i]
        index = np.where(leaf_nodes == X_id)[0][0]
        weights_true = 1 / np.sum(X_ids_leaves[index])
        help_matrix = X_ids_leaves[index].astype(int)
        help_matrix = help_matrix.astype(float)
        help_matrix[help_matrix == 1] = weights_true
        weights.append(help_matrix)

    return weights

In [352]:
weights = calc_obs_weight(X_test[0:3], tree1, leaf_nodes, X_ids_leaves)