# Setup

In [1]:
# Reproducibility
import numpy as np
random_state = 777  # one seed reused for NumPy, the standard RF and RF++

# Data description
n_subjects = 100
# Repeated measurements per subject; still called "time points" even though
# time has no effect on the simulated target
n_timepoints = 5

# RF hyperparameters
n_estimators = 100
max_depth = 10
# Fraction of the training data given to each tree
# (data points for the standard RF, subjects for RF++)
bootstrap_fraction = 0.6

# Seed NumPy's global generator, used for data generation and subject shuffling
np.random.seed(random_state)

# Generate longitudinal dataset

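Two features (`X1`, `X2`) and one target variable (`y`), with `n_timepoints` repeated measurements for each subject. The features are constant within a subject; only the noise added to `y` changes between repetitions.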

In [2]:
import pandas as pd

data = []

for subject_index in range(n_subjects):
    # Subject-level features: drawn once and shared by all rows of the subject
    feature_1 = np.random.normal(5, 1)
    feature_2 = np.random.poisson(5)
    subject_label = f'subject_{subject_index}'

    # One row per repetition: the target depends on the features plus fresh
    # noise, not on time
    for _timepoint in range(n_timepoints):
        noise = np.random.normal(0, 0.5)
        row = {'subject_id': subject_label, 'X1': feature_1, 'X2': feature_2}
        row['y'] = feature_1 + feature_2 + noise
        data.append(row)

# Build the frame once from the accumulated records
df = pd.DataFrame(data)

print(df)

     subject_id        X1  X2          y
0     subject_0  4.531791   6  10.120379
1     subject_0  4.531791   6  10.169630
2     subject_0  4.531791   6  10.660070
3     subject_0  4.531791   6  10.612921
4     subject_0  4.531791   6  10.151270
..          ...       ...  ..        ...
495  subject_99  5.712344  10  16.069661
496  subject_99  5.712344  10  15.353078
497  subject_99  5.712344  10  15.782522
498  subject_99  5.712344  10  15.213245
499  subject_99  5.712344  10  15.879670

[500 rows x 4 columns]


# Split train/validation sets

In [3]:
# The list of subjects, shuffled in place so that the split is random
subjects = df['subject_id'].unique()
np.random.shuffle(subjects)

# Separating the subjects in two lists with a classic 80/20 split.
# The cut-off is derived from the actual number of subjects instead of being
# hard-coded, so the proportions still hold if n_subjects is changed.
train_fraction = 0.8
n_train_subjects = int(round(train_fraction * len(subjects)))
train_subjects = subjects[:n_train_subjects]
val_subjects = subjects[n_train_subjects:]

# Separating the actual data, so that info on the train subjects doesn't
# spill into the validation set
train_df = df[df['subject_id'].isin(train_subjects)]
val_df = df[df['subject_id'].isin(val_subjects)]

# Sanity checks: no subject appears on both sides and no row was lost
assert set(train_df['subject_id']).isdisjoint(val_df['subject_id'])
assert len(train_df) + len(val_df) == len(df)

# For easier interface, we also further split the data in X and y
X_train = train_df[['X1', 'X2']]
y_train = train_df['y']
subject_train = train_df['subject_id']

X_val = val_df[['X1', 'X2']]
y_val = val_df['y']
subject_val = val_df['subject_id']
subject_test = subject_val  # original name, kept so existing references still work

# Standard Random Forest

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Baseline model: scikit-learn's random forest, configured with the shared
# hyperparameters and seed
rf = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_samples=bootstrap_fraction,
    random_state=random_state,
)

# Fit on the training rows
rf.fit(X_train, y_train)

# Predict the validation rows
y_pred_rf = rf.predict(X_val)

0    0.201619
1    0.798381
dtype: float64


# RF++

We need to implement the custom bootstrap described in the course: for each tree, a fraction (`bootstrap_fraction`) of the subjects is selected, and all the data points of the selected subjects are given to that tree. We create a custom class, `RandomForestPlusPlus`, for the task.

In [5]:
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

class RandomForestPlusPlus:
    """Random forest regressor whose trees are trained on whole subjects.

    For every tree, a subset of the unique subjects is drawn without
    replacement and all rows belonging to the drawn subjects are used to fit
    that tree. The forest prediction is the unweighted mean of the trees'
    predictions.

    Parameters
    ----------
    n_estimators : int
        Number of trees to fit.
    max_depth : int or None
        Forwarded to every ``DecisionTreeRegressor``.
    bootstrap_fraction : float
        Fraction of the unique subjects given to each tree, in ``(0, 1]``.
    random_state : int or None
        Seed of the ``np.random.RandomState`` that drives the subject
        sampling and is also handed to every tree.
    """

    def __init__(self, n_estimators, max_depth, bootstrap_fraction, random_state):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.bootstrap_fraction = bootstrap_fraction
        self.random_state = np.random.RandomState(random_state)
        self.trees = []

    def fit(self, X, y, subject_ids):
        """Fit ``n_estimators`` trees, each on the rows of a random subject subset.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D numpy.ndarray
            Feature matrix, one row per observation.
        y : pandas.Series or 1-D numpy.ndarray
            Targets, matched to the rows of ``X`` by position.
        subject_ids : array-like
            Subject label of every row, matched to ``X`` by position.

        Returns
        -------
        RandomForestPlusPlus
            The fitted instance (``self``).

        Raises
        ------
        ValueError
            If the inputs have inconsistent lengths, if ``bootstrap_fraction``
            is outside ``(0, 1]``, or if it selects no subject at all.
        """
        self.trees = []

        subject_arr = np.asarray(subject_ids)
        if subject_arr.ndim != 1:
            raise ValueError("subject_ids must be one-dimensional")
        if len(subject_arr) != len(X) or len(y) != len(X):
            raise ValueError("X, y and subject_ids must have the same number of rows")
        if not 0 < self.bootstrap_fraction <= 1:
            raise ValueError("bootstrap_fraction must be in the interval (0, 1]")

        # Unique subjects in order of first appearance (np.unique alone would
        # sort them, which would change which subjects a given seed selects).
        _, first_seen = np.unique(subject_arr, return_index=True)
        unique_subjects = subject_arr[np.sort(first_seen)]

        # Loop-invariant: number of subjects handed to every tree.
        target_size = int(self.bootstrap_fraction * len(unique_subjects))
        if target_size < 1:
            raise ValueError(
                f"bootstrap_fraction={self.bootstrap_fraction} selects no subject "
                f"out of {len(unique_subjects)}"
            )

        for _ in range(self.n_estimators):
            sampled_subjects = self.random_state.choice(
                unique_subjects, size=target_size, replace=False
            )
            # Row mask instead of a merged frame: feature columns that happen
            # to be called 'y' or 'subject_id' are left untouched.
            in_sample = np.isin(subject_arr, sampled_subjects)

            # The shared RandomState is passed on, so the trees draw from the
            # same random stream as the subject sampling.
            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=self.random_state)
            tree.fit(X[in_sample], y[in_sample])
            self.trees.append(tree)

        return self

    def predict(self, X):
        """Return the mean prediction of all fitted trees for each row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has no fitted trees (``fit`` was not called or failed).
        """
        if not self.trees:
            raise RuntimeError("RandomForestPlusPlus is not fitted; call fit() before predict()")

        # One row of predictions per tree, averaged across trees.
        preds = np.stack([tree.predict(X) for tree in self.trees])
        return preds.mean(axis=0)

In [6]:
# Build RF++ with the same hyperparameters and seed as the standard forest
rfpp = RandomForestPlusPlus(
    n_estimators=n_estimators,
    max_depth=max_depth,
    bootstrap_fraction=bootstrap_fraction,
    random_state=random_state,
)

# Fit with the subject labels of the training rows, then predict the validation rows
rfpp.fit(X_train, y_train, subject_train)
y_pred_rfpp = rfpp.predict(X_val)

# Comparing the two models

In [7]:
# Score both models against the same validation targets
r2_rf = r2_score(y_val, y_pred_rf)
r2_rfpp = r2_score(y_val, y_pred_rfpp)

print(f"Standard RF R2 score: {r2_rf:.4f}")
print(f"RF++ R2 score: {r2_rfpp:.4f}")

Standard RF R2 score: 0.9285
RF++ R2 score: 0.9333
