# XGBoost 

In [66]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error, r2_score

In [67]:
df_train = pd.read_csv("src_files/train_competition_2026.csv")
df_train.head()

Unnamed: 0,obs,sub_id,time,num_0,num_1,num_2,cat_0,cat_1,cat_2,cat_3,cat_4,t_0,t_1,t_2,t_3,t_4,y_1,y_2
0,0,0,2068-09-19 23:34:11,1.38,49,7,1,3,1,0,1,105.5,95.0,67.4,36.6,23.2,33.4,107.4
1,0,0,2068-09-19 23:35:11,1.38,49,7,1,3,1,0,1,104.4,95.0,66.4,37.8,22.7,33.4,107.4
2,0,0,2068-09-19 23:36:11,1.38,49,7,1,3,1,0,1,104.0,95.0,65.2,37.0,22.1,33.4,107.4
3,0,0,2068-09-19 23:37:11,1.38,49,7,1,3,1,0,1,102.8,95.0,63.4,35.9,20.7,33.4,107.4
4,0,0,2068-09-19 23:38:11,1.38,49,7,1,3,1,0,1,101.3,95.1,59.1,34.5,18.1,33.4,107.4


In [68]:
df_test = pd.read_csv("src_files/test_no_outcome.csv")
df_test.head()

Unnamed: 0,obs,sub_id,time,num_0,num_1,num_2,cat_0,cat_1,cat_2,cat_3,cat_4,t_0,t_1,t_2,t_3,t_4
0,18,1,2134-04-01 22:23:14,-1.0,38,1,1,1,0,0,0,105.4,99.8,50.7,61.4,36.8
1,18,1,2134-04-01 22:24:14,-1.0,38,1,1,1,0,0,0,105.4,99.4,49.4,61.1,36.2
2,18,1,2134-04-01 22:25:14,-1.0,38,1,1,1,0,0,0,104.6,99.0,49.7,61.4,36.6
3,18,1,2134-04-01 22:26:14,-1.0,38,1,1,1,0,0,0,104.5,99.6,51.7,61.8,37.2
4,18,1,2134-04-01 22:27:14,-1.0,38,1,1,1,0,0,0,104.6,99.5,52.5,61.9,37.5


## 1. Verify Target is Constant per Observation

In [69]:
target_std = df_train.groupby('obs')[['y_1', 'y_2']].std()
print("Max std of y_1 within any obs:", target_std['y_1'].max())
print("Max std of y_2 within any obs:", target_std['y_2'].max())

Max std of y_1 within any obs: 0.0
Max std of y_2 within any obs: 0.0


## 2. Collapse Observations into Aggregate Features

In [70]:
feature_cols = [c for c in df_train.columns if c not in ['obs', 'sub_id', 'time', 'y_1', 'y_2']]
aggs = ['mean', 'std', 'min', 'max', 'last']

print(f"Feature columns to aggregate: {feature_cols}")
print(f"Aggregations: {aggs}")

Feature columns to aggregate: ['num_0', 'num_1', 'num_2', 'cat_0', 'cat_1', 'cat_2', 'cat_3', 'cat_4', 't_0', 't_1', 't_2', 't_3', 't_4']
Aggregations: ['mean', 'std', 'min', 'max', 'last']


In [71]:
def collapse_obs(df, feature_cols, aggs):
    grouped = df.groupby('obs')[feature_cols].agg(aggs)
    grouped.columns = [f'{feat}_{agg}' for feat, agg in grouped.columns]

    grouped['obs_length'] = df.groupby('obs').size()
    
    return grouped.reset_index()

In [72]:
X_train_df = collapse_obs(df_train, feature_cols, aggs)
X_test_df = collapse_obs(df_test, feature_cols, aggs)

print(f"Train: number of observations: {X_train_df.shape[0]} with {X_train_df.shape[1]-1} features")
print(f"Test:  number of observations: {X_test_df.shape[0]} with {X_test_df.shape[1]-1} features")
X_train_df.head()

Train: number of observations: 14420 with 66 features
Test:  number of observations: 3450 with 66 features


Unnamed: 0,obs,num_0_mean,num_0_std,num_0_min,num_0_max,num_0_last,num_1_mean,num_1_std,num_1_min,num_1_max,...,t_3_std,t_3_min,t_3_max,t_3_last,t_4_mean,t_4_std,t_4_min,t_4_max,t_4_last,obs_length
0,0,1.38,0.0,1.38,1.38,1.38,49.0,0.0,49,49,...,1.990959,31.0,40.4,34.3,21.88,3.265113,14.3,28.8,17.7,30
1,1,1.38,0.0,1.38,1.38,1.38,49.0,0.0,49,49,...,1.888513,36.7,43.9,38.0,24.026667,2.768534,19.8,30.3,22.0,30
2,2,1.38,0.0,1.38,1.38,1.38,49.0,0.0,49,49,...,2.041526,34.9,42.2,36.1,21.453333,3.00594,17.5,29.0,19.1,30
3,3,1.38,0.0,1.38,1.38,1.38,49.0,0.0,49,49,...,1.079272,34.1,39.4,36.4,20.696667,1.928459,17.4,27.8,20.8,30
4,4,1.38,0.0,1.38,1.38,1.38,49.0,0.0,49,49,...,1.162854,32.5,38.8,32.5,21.5,1.469459,17.8,25.0,17.8,30


## 3. Extract Targets

In [73]:
y_train = df_train.groupby('obs')[['y_1', 'y_2']].first().reset_index(drop=True)

obs_to_patient = df_train.groupby('obs')['sub_id'].first().reset_index(drop=True)

print(f"Data frame shape: {y_train.shape}")
print(f"Number of unique patients: {obs_to_patient.nunique()}")
y_train.head()

Data frame shape: (14420, 2)
Number of unique patients: 1596


Unnamed: 0,y_1,y_2
0,33.4,107.4
1,25.76,92.76
2,19.94,85.56
3,14.14,83.28
4,24.58,83.92


## 4. Prepare Feature Matrices

In [74]:
test_obs = X_test_df['obs'].copy()
X_train = X_train_df.drop(columns=['obs'])
X_test = X_test_df.drop(columns=['obs'])

## 5. CV (Grouping KFolds by Patients)

In [75]:
gkf = GroupKFold(n_splits=5)

best_model_y1 = None
best_mae_y1 = float('inf')
scores_y1 = []

for i, (tr_idx, val_idx) in enumerate(gkf.split(X_train, y_train['y_1'], groups=obs_to_patient)):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train['y_1'].iloc[tr_idx], y_train['y_1'].iloc[val_idx]

    model = xgb.XGBRegressor(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist',
        early_stopping_rounds=50,
        random_state=42,
    )

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    val_preds = model.predict(X_val)
    fold_mae = mean_absolute_error(y_val, val_preds)
    scores_y1.append(fold_mae)
    print(f"Fold {i+1} MAE: {fold_mae} (best iteration: {model.best_iteration})")

    if fold_mae < best_mae_y1:
        best_mae_y1 = fold_mae
        best_model_y1 = model

print(f"Mean CV MAE for y_1: {np.mean(scores_y1)}")

Fold 1 MAE: 5.000589832869383 (best iteration: 152)
Fold 2 MAE: 5.087428583086942 (best iteration: 102)
Fold 3 MAE: 5.020408759308919 (best iteration: 184)
Fold 4 MAE: 5.2225262907242485 (best iteration: 150)
Fold 5 MAE: 5.05684298258714 (best iteration: 126)
Mean CV MAE for y_1: 5.077559289715327


## 6. Train XGBoost for `y_2`

In [76]:
best_model_y2 = None
best_mae_y2 = float('inf')
scores_y2 = []

for i, (tr_idx, val_idx) in enumerate(gkf.split(X_train, y_train['y_2'], groups=obs_to_patient)):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train['y_2'].iloc[tr_idx], y_train['y_2'].iloc[val_idx]

    model = xgb.XGBRegressor(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist',
        early_stopping_rounds=50,
        random_state=42,
    )

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    val_preds = model.predict(X_val)
    fold_mae = mean_absolute_error(y_val, val_preds)
    scores_y2.append(fold_mae)
    print(f"Fold {i+1} MAE: {fold_mae} (best iteration: {model.best_iteration})")

    if fold_mae < best_mae_y2:
        best_mae_y2 = fold_mae
        best_model_y2 = model

print(f"Mean CV MAE for y_2: {np.mean(scores_y2)}")

Fold 1 MAE: 3.583557817456461 (best iteration: 126)
Fold 2 MAE: 4.0042253870441575 (best iteration: 151)
Fold 3 MAE: 3.544332193223845 (best iteration: 150)
Fold 4 MAE: 3.3386942261631054 (best iteration: 153)
Fold 5 MAE: 3.3534094847257188 (best iteration: 235)
Mean CV MAE for y_2: 3.5648438217226577


## 7. Predict on Test Set & Build Submission

In [79]:
y1_preds = best_model_y1.predict(X_test)
y2_preds = best_model_y2.predict(X_test)

submission = pd.DataFrame({
    'obs': test_obs,
    'y_1': y1_preds,
    'y_2': y2_preds,
})

submission.head()

Unnamed: 0,obs,y_1,y_2
0,18,41.642944,105.613304
1,19,32.774059,101.010246
2,20,37.977589,96.472939
3,21,34.917747,95.957054
4,22,39.226749,95.005875


In [78]:
submission.to_csv('james_test_submission.csv', index=False)