In [1]:
# Dataset
from sklearn.datasets import load_diabetes

# Basics
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt

# Model
from sklearn.ensemble import RandomForestRegressor

# Helpful:
from sklearn.model_selection import train_test_split

# Path setup
import os
import sys

# Make the project's `src` package importable. The checkout location defaults
# to the original author's path; on another machine set the
# RANDOM_FOREST_WEIGHTS_ROOT environment variable instead of editing this cell.
PROJECT_ROOT = os.environ.get(
    "RANDOM_FOREST_WEIGHTS_ROOT", r"/home/dchen/Random_Forest_Weights/"
)
# Re-running this cell in the same kernel must not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
# my functions:
from src.methods.calc_mean import *
from src.methods.calc_weights import *
from src.methods.calc_dist import *

### 1. Loading Data & Train/Test Split

In [2]:
# Load the scikit-learn diabetes dataset and assemble a single frame:
# the feature columns first, followed by the regression target as the last column.
diabetes = load_diabetes()
df_diabetes = pd.DataFrame(
    np.c_[diabetes.data, diabetes.target],
    columns=[*diabetes.feature_names, "target"],
)

In [3]:
# Feature matrix: every column except the trailing "target", cast to float32.
# NOTE(review): the float32 cast presumably mirrors the dtype scikit-learn's
# trees use internally — confirm it is required by the weight helpers.
X = df_diabetes[df_diabetes.columns[:-1]].to_numpy(dtype="float32")
# Target vector: the last column as an independent 1-D array.
y = df_diabetes["target"].to_numpy(copy=True)

In [4]:
# Hold out 20% of the samples for testing. The fixed `random_state` makes the
# split, and every number derived from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### 2. Random Forest

In [5]:
# Forest sampling settings. They live in variables because the same two values
# are handed to the RandomForestRegressor constructor and to calc_weights_rf.
bootstrap, max_sample = True, 0.7

In [6]:
# Regression forest configured with the sampling settings defined above.
# `random_state` pins the forest's randomness (e.g. the bootstrap draws) so
# that repeated runs fit the same trees.
rf = RandomForestRegressor(
    bootstrap=bootstrap, max_samples=max_sample, random_state=42
)

In [7]:
# Train the forest on the training split. As the cell's last expression, the
# fitted estimator is echoed as the cell output.
rf.fit(X=X_train, y=y_train)

RandomForestRegressor(max_samples=0.7)

### 3. Calculate the Random Forest Weights

In [8]:
# Derive the forest's weights for the test points, passing the same
# `bootstrap` / `max_sample` settings the forest was built with.
# NOTE(review): the return layout of calc_weights_rf is not visible here —
# presumably one weight vector over the training samples per test point;
# confirm against its definition in src.methods.
rf_weights = calc_weights_rf(
    rf,
    X_train,
    X_test,
    bootstrap,
    max_sample,
)

100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 320.67it/s]


### 4. Calculate the Mean: Weight-Based vs. `rf.predict`

In [9]:
# Reference: the forest's own point prediction for the test split.
rf_mean_normal = rf.predict(X_test)
# The same quantity rebuilt from the weights and the training targets.
rf_mean_weights = calc_mean_rf(rf_weights, y_train)

In [10]:
# Are the two the same?
# Count the test points on which both predictions agree to within 1e-5.
# An explicit absolute tolerance is used instead of comparing rounded values,
# because two numbers that differ only by floating-point noise can still round
# to different 5-decimal results when they straddle a rounding boundary.
int(np.isclose(rf_mean_weights, rf_mean_normal, rtol=0.0, atol=1e-5).sum())

89

### 5. Calculate Random Forest Quantiles

In [11]:
# Combine the weights with the training targets into per-test-point
# distributions.
# NOTE(review): judging by the variable name these are presumably cumulative
# distribution functions — confirm against calc_dist_rf in src.methods.
rf_cdfs = calc_dist_rf(
    rf_weights,
    y_train,
)

In [12]:
# Read the median off each distribution computed above.
rf_median = calc_quantile_rf(
    rf_cdfs,
    0.5,  # quantile level; 0.5 corresponds to the median
    y_train,
)

In [14]:
# Preview only the first ten median estimates instead of dumping the full result.
rf_median[:10]

[167.00000000000102,
 84.99999999999976,
 104.0,
 63.265625,
 129.00000000000057,
 94.125,
 72.00000000000095,
 138.5625,
 79.99999999999987,
 200.28125]