# Random Forest

The goal is to predict the targets 5 minutes after the last timestamp of a given observation. For a given obs value, y_1 and y_2 are the values 5 minutes after the last timestamp, which is why they remain constant throughout the observation. Therefore, we don't need to do any lagging of the target. The problem is that we only want one predicted (y_1, y_2) per observation block. It would be incorrect to just let the RF predict a (y_1, y_2) value for each row and then average them over each block, as we would be breaking the IID assumption. Instead, we have to find a way to collapse the data from an entire block into one row. While there are many ways to solve this issue, in this example we are going to start by taking the basic stats for each block. This means turning each block into a single feature vector with the mean, std, min, and max values per feature, and maybe some rolling stats. In the future, we can use PCA to capture the values of these stats.

In [131]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [132]:
# Load the training data (relative path, resolved against the working directory)
# and preview the first rows.
train_df = pd.read_csv('../train_competition_2026.csv')
train_df.head()

Unnamed: 0,obs,sub_id,time,num_0,num_1,num_2,cat_0,cat_1,cat_2,cat_3,cat_4,t_0,t_1,t_2,t_3,t_4,y_1,y_2
0,0,0,2068-09-19 23:34:11,1.38,49,7,1,3,1,0,1,105.5,95.0,67.4,36.6,23.2,33.4,107.4
1,0,0,2068-09-19 23:35:11,1.38,49,7,1,3,1,0,1,104.4,95.0,66.4,37.8,22.7,33.4,107.4
2,0,0,2068-09-19 23:36:11,1.38,49,7,1,3,1,0,1,104.0,95.0,65.2,37.0,22.1,33.4,107.4
3,0,0,2068-09-19 23:37:11,1.38,49,7,1,3,1,0,1,102.8,95.0,63.4,35.9,20.7,33.4,107.4
4,0,0,2068-09-19 23:38:11,1.38,49,7,1,3,1,0,1,101.3,95.1,59.1,34.5,18.1,33.4,107.4


In [133]:
# Parse the 'time' strings into datetimes using an explicit format.
# NOTE(review): only train_df is converted here; 'time' is excluded from
# feature_cols, so the model inputs do not depend on this step.
train_df['time'] = pd.to_datetime(train_df['time'], format='%Y-%m-%d %H:%M:%S')

In [134]:
# Print the distinct values of every column for a quick look at their ranges
# and cardinality.
for col in train_df.columns:
    print(col, train_df[col].unique())

obs [    0     1     2 ... 17867 17868 17869]
sub_id [   0    2    3 ... 1990 1993 1995]
time <DatetimeArray>
['2068-09-19 23:34:11', '2068-09-19 23:35:11', '2068-09-19 23:36:11',
 '2068-09-19 23:37:11', '2068-09-19 23:38:11', '2068-09-19 23:39:11',
 '2068-09-19 23:40:11', '2068-09-19 23:41:11', '2068-09-19 23:42:11',
 '2068-09-19 23:43:11',
 ...
 '2127-12-11 09:04:52', '2127-12-11 09:05:52', '2127-12-11 09:06:52',
 '2127-12-11 09:07:52', '2127-12-11 09:08:52', '2127-12-11 09:09:52',
 '2127-12-11 09:34:52', '2127-12-11 09:35:52', '2127-12-11 09:36:52',
 '2127-12-11 09:37:52']
Length: 432352, dtype: datetime64[ns]
num_0 [ 1.38 -1.44 -0.19  0.06  0.44 -0.06  0.31 -0.69  0.38 -1.81  1.31 -1.19
  0.    0.12 -1.62  0.88  0.62  0.25 -0.12 -2.56  0.19 -1.06  1.25  0.5
 -0.38 -0.31  0.94  0.69 -0.81 -0.25 -0.75 -0.44 -1.25 -1.31  1.12  0.75
  1.19 -0.94 -0.5  -1.94  0.81  1.06  0.56 -1.5  -1.38  1.   -0.62 -2.12
 -1.12  1.44 -1.   -2.25 -0.56 -1.88 -2.38 -1.56 -2.94 -1.69 -2.31 -2.69
  1.5  -2

In [135]:
# Load the test data (same columns as train, without the targets y_1 / y_2)
# and preview the first rows.
test_df = pd.read_csv('../test_no_outcome.csv')
test_df.head()

Unnamed: 0,obs,sub_id,time,num_0,num_1,num_2,cat_0,cat_1,cat_2,cat_3,cat_4,t_0,t_1,t_2,t_3,t_4
0,18,1,2134-04-01 22:23:14,-1.0,38,1,1,1,0,0,0,105.4,99.8,50.7,61.4,36.8
1,18,1,2134-04-01 22:24:14,-1.0,38,1,1,1,0,0,0,105.4,99.4,49.4,61.1,36.2
2,18,1,2134-04-01 22:25:14,-1.0,38,1,1,1,0,0,0,104.6,99.0,49.7,61.4,36.6
3,18,1,2134-04-01 22:26:14,-1.0,38,1,1,1,0,0,0,104.5,99.6,51.7,61.8,37.2
4,18,1,2134-04-01 22:27:14,-1.0,38,1,1,1,0,0,0,104.6,99.5,52.5,61.9,37.5


## RF using Aggregation Stats

In [136]:
def collapse_obs(df, feature_cols, aggs):
    """Collapse every observation block into a single row of summary stats.

    Parameters
    ----------
    df : DataFrame containing an ``obs`` column plus ``feature_cols``.
    feature_cols : list of column names to summarise.
    aggs : list of aggregations (e.g. ``["min", "max"]``) applied to each
        feature within each ``obs`` group.

    Returns
    -------
    DataFrame with one row per ``obs`` value: an ``obs`` column followed by
    one ``<feature>_<agg>`` column for every feature/aggregation pair.
    """
    per_obs = df.groupby(["obs"])[feature_cols].agg(aggs)

    # agg() with a list yields two-level (feature, agg) column labels;
    # flatten them into plain "<feature>_<agg>" names.
    flat_names = [f"{feat}_{agg}" for feat, agg in per_obs.columns]
    per_obs.columns = flat_names

    # Turn the group key back into an ordinary "obs" column.
    return per_obs.reset_index()


In [137]:
def collapse_target(df):
    """Return one (y_1, y_2) row per observation block.

    Rows are grouped by ``obs`` and the first non-null ``y_1`` / ``y_2`` of
    each group is kept. The ``obs`` key is discarded: the result carries a
    fresh 0..n-1 index, with rows ordered by ascending ``obs``.
    """
    target_cols = ["y_1", "y_2"]
    first_per_obs = df.groupby(["obs"])[target_cols].first()
    return first_per_obs.reset_index(drop=True)

In [138]:
# Model inputs: every column except the block key (obs), sub_id, the
# timestamp, and the two targets.
feature_cols = [
    c for c in train_df.columns
    if c not in ["obs", "sub_id", "time", "y_1", "y_2"]
]

# Summary statistics computed per feature within each observation block.
aggs = ["min", "max", "mean", "std"]

# One row per obs. Both helpers group by "obs" (sorted by default), so the
# rows of X_train and y_train come out in the same order.
X_train = collapse_obs(train_df, feature_cols, aggs).drop(columns=['obs'])
y_train = collapse_target(train_df)

# Keep the test obs ids aside so predictions can be labelled later, then drop
# the column so X_test has the same columns as X_train.
X_test = collapse_obs(test_df, feature_cols, aggs)
test_index = X_test[["obs"]].copy()
X_test = X_test.drop(columns=['obs'])

In [139]:
# Preview the first rows of the collapsed (one row per obs) training features.
X_train.head()

Unnamed: 0,num_0_min,num_0_max,num_0_mean,num_0_std,num_1_min,num_1_max,num_1_mean,num_1_std,num_2_min,num_2_max,num_2_mean,num_2_std,cat_0_min,cat_0_max,cat_0_mean,cat_0_std,cat_1_min,cat_1_max,cat_1_mean,cat_1_std,cat_2_min,cat_2_max,cat_2_mean,cat_2_std,cat_3_min,cat_3_max,cat_3_mean,cat_3_std,cat_4_min,cat_4_max,cat_4_mean,cat_4_std,t_0_min,t_0_max,t_0_mean,t_0_std,t_1_min,t_1_max,t_1_mean,t_1_std,t_2_min,t_2_max,t_2_mean,t_2_std,t_3_min,t_3_max,t_3_mean,t_3_std,t_4_min,t_4_max,t_4_mean,t_4_std
0,1.38,1.38,1.38,0.0,49,49,49.0,0.0,7,7,7.0,0.0,1,1,1.0,0.0,3,3,3.0,0.0,1,1,1.0,0.0,0,0,0.0,0.0,1,1,1.0,0.0,94.5,110.9,104.083333,4.742477,0.0,95.4,69.22,42.457146,41.4,76.6,62.936667,8.65637,31.0,40.4,36.556667,1.990959,14.3,28.8,21.88,3.265113
1,1.38,1.38,1.38,0.0,49,49,49.0,0.0,7,7,7.0,0.0,1,1,1.0,0.0,3,3,3.0,0.0,1,1,1.0,0.0,0,0,0.0,0.0,1,1,1.0,0.0,93.8,104.1,99.096667,2.738296,89.5,93.9,92.37,1.068725,45.1,62.9,53.19,4.679106,36.7,43.9,39.72,1.888513,19.8,30.3,24.026667,2.768534
2,1.38,1.38,1.38,0.0,49,49,49.0,0.0,7,7,7.0,0.0,1,1,1.0,0.0,3,3,3.0,0.0,1,1,1.0,0.0,0,0,0.0,0.0,1,1,1.0,0.0,87.3,95.3,91.3,2.082273,80.7,95.5,90.49,3.3377,46.1,63.4,55.796667,4.048285,34.9,42.2,37.31,2.041526,17.5,29.0,21.453333,3.00594
3,1.38,1.38,1.38,0.0,49,49,49.0,0.0,7,7,7.0,0.0,1,1,1.0,0.0,3,3,3.0,0.0,1,1,1.0,0.0,0,0,0.0,0.0,1,1,1.0,0.0,84.2,87.9,85.996667,0.889589,89.7,98.5,94.253333,1.637225,56.3,78.5,63.806667,4.922708,34.1,39.4,35.8,1.079272,17.4,27.8,20.696667,1.928459
4,1.38,1.38,1.38,0.0,49,49,49.0,0.0,7,7,7.0,0.0,1,1,1.0,0.0,3,3,3.0,0.0,1,1,1.0,0.0,0,0,0.0,0.0,1,1,1.0,0.0,84.1,91.9,86.343333,2.320179,89.6,93.4,91.156667,1.160762,60.9,75.5,69.13,3.208625,32.5,38.8,35.613333,1.162854,17.8,25.0,21.5,1.469459


In [140]:
# Sanity checks before fitting.
# 1) Exactly one target row per collapsed feature row.
assert X_train.shape[0] == y_train.shape[0], (
    "X_train and y_train have different numbers of rows"
)
# 2) Train and test must expose the same features in the same order; comparing
#    only the column counts would let a renamed or reordered column through.
assert list(X_train.columns) == list(X_test.columns), (
    "X_train and X_test feature columns differ"
)

In [141]:
# Count missing values per aggregated feature (for example, pandas' sample std
# is NaN for a block that has only one row).
X_train.isna().sum()

num_0_min     0
num_0_max     0
num_0_mean    0
num_0_std     0
num_1_min     0
num_1_max     0
num_1_mean    0
num_1_std     0
num_2_min     0
num_2_max     0
num_2_mean    0
num_2_std     0
cat_0_min     0
cat_0_max     0
cat_0_mean    0
cat_0_std     0
cat_1_min     0
cat_1_max     0
cat_1_mean    0
cat_1_std     0
cat_2_min     0
cat_2_max     0
cat_2_mean    0
cat_2_std     0
cat_3_min     0
cat_3_max     0
cat_3_mean    0
cat_3_std     0
cat_4_min     0
cat_4_max     0
cat_4_mean    0
cat_4_std     0
t_0_min       0
t_0_max       0
t_0_mean      0
t_0_std       0
t_1_min       0
t_1_max       0
t_1_mean      0
t_1_std       0
t_2_min       0
t_2_max       0
t_2_mean      0
t_2_std       0
t_3_min       0
t_3_max       0
t_3_mean      0
t_3_std       0
t_4_min       0
t_4_max       0
t_4_mean      0
t_4_std       0
dtype: int64

There are some useless features here, such as the minimums and maximums of the binary features, but RF predictions are not hurt by multicollinearity, so we will leave them in.

In [142]:
# Same import as in the first cell of the notebook; repeating it is harmless.
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=500,  # number of trees in the forest
    min_samples_leaf=5,  # every leaf must hold at least 5 samples
    n_jobs=-1,  # use all available cores
    random_state=42  # fixed seed so the fit is reproducible
)

# y_train has two columns (y_1, y_2), so one forest is fit on both targets.
rf.fit(X_train, y_train)

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [143]:
# Predict one (y_1, y_2) pair per test observation block.
y_test_pred = rf.predict(X_test)

# Re-attach the obs ids. test_index and X_test were both taken from the same
# collapsed frame, so their rows are in the same order; side-by-side concat
# (axis=1) therefore lines each prediction up with its obs.
pred_df = pd.concat(
    [
        test_index.reset_index(drop=True),
        pd.DataFrame(y_test_pred, columns=["y_1", "y_2"])
    ],
    axis=1
)
pred_df

Unnamed: 0,obs,y_1,y_2
0,18,40.472578,106.166057
1,19,34.206554,102.208864
2,20,36.190550,96.878293
3,21,36.184273,96.721368
4,22,39.660616,95.648616
...,...,...,...
3445,17850,56.559727,89.089651
3446,17862,55.612713,109.868092
3447,17863,52.249754,112.936708
3448,17864,45.810752,102.452455


In [None]:
# pred_df.to_csv("rf_agg_submission_1.csv", index=False)