# XGBoost 

In [27]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error, r2_score

In [28]:
# Read the training CSV and preview its first rows.
train_csv_path = "src_files/train_competition_2026.csv"
df_train = pd.read_csv(train_csv_path)
df_train.head()

Unnamed: 0,obs,sub_id,time,num_0,num_1,num_2,cat_0,cat_1,cat_2,cat_3,cat_4,t_0,t_1,t_2,t_3,t_4,y_1,y_2
0,0,0,2068-09-19 23:34:11,1.38,49,7,1,3,1,0,1,105.5,95.0,67.4,36.6,23.2,33.4,107.4
1,0,0,2068-09-19 23:35:11,1.38,49,7,1,3,1,0,1,104.4,95.0,66.4,37.8,22.7,33.4,107.4
2,0,0,2068-09-19 23:36:11,1.38,49,7,1,3,1,0,1,104.0,95.0,65.2,37.0,22.1,33.4,107.4
3,0,0,2068-09-19 23:37:11,1.38,49,7,1,3,1,0,1,102.8,95.0,63.4,35.9,20.7,33.4,107.4
4,0,0,2068-09-19 23:38:11,1.38,49,7,1,3,1,0,1,101.3,95.1,59.1,34.5,18.1,33.4,107.4


In [29]:
# Read the test CSV (same layout as train, without the y_1 / y_2 columns) and preview it.
test_csv_path = "src_files/test_no_outcome.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

Unnamed: 0,obs,sub_id,time,num_0,num_1,num_2,cat_0,cat_1,cat_2,cat_3,cat_4,t_0,t_1,t_2,t_3,t_4
0,18,1,2134-04-01 22:23:14,-1.0,38,1,1,1,0,0,0,105.4,99.8,50.7,61.4,36.8
1,18,1,2134-04-01 22:24:14,-1.0,38,1,1,1,0,0,0,105.4,99.4,49.4,61.1,36.2
2,18,1,2134-04-01 22:25:14,-1.0,38,1,1,1,0,0,0,104.6,99.0,49.7,61.4,36.6
3,18,1,2134-04-01 22:26:14,-1.0,38,1,1,1,0,0,0,104.5,99.6,51.7,61.8,37.2
4,18,1,2134-04-01 22:27:14,-1.0,38,1,1,1,0,0,0,104.6,99.5,52.5,61.9,37.5


## 1. Verify Target is Constant per Observation

In [30]:
# Standard deviation of each target inside every observation; a maximum of
# 0.0 means no observation has a target that varies across its rows.
targets_by_obs = df_train.groupby('obs')[['y_1', 'y_2']]
target_std = targets_by_obs.std()
max_std_y1 = target_std['y_1'].max()
max_std_y2 = target_std['y_2'].max()
print("Max std of y_1 within any obs:", max_std_y1)
print("Max std of y_2 within any obs:", max_std_y2)

Max std of y_1 within any obs: 0.0
Max std of y_2 within any obs: 0.0


## 2. Collapse Observations into Aggregate Features

In [31]:
# Identifier, timestamp and target columns are excluded from the model inputs.
non_feature_cols = {'obs', 'sub_id', 'time', 'y_1', 'y_2'}
feature_cols = [col for col in df_train.columns if col not in non_feature_cols]

# Per-observation summary statistics requested from the aggregation step.
aggs = ['mean', 'std', 'min', 'max', 'last']

print(f"Feature columns to aggregate: {feature_cols}")
print(f"Aggregations: {aggs}")

Feature columns to aggregate: ['num_0', 'num_1', 'num_2', 'cat_0', 'cat_1', 'cat_2', 'cat_3', 'cat_4', 't_0', 't_1', 't_2', 't_3', 't_4']
Aggregations: ['mean', 'std', 'min', 'max', 'last']


In [None]:
def _trend_slope(x):
    """Least-squares slope of ``x`` against its row position, ignoring NaNs.

    Missing readings are dropped before fitting, but the remaining points keep
    their original positions (0, 1, 2, ...), so gaps do not compress the time
    axis. Returns 0.0 when fewer than two non-missing points are available.
    """
    valid = x.notna().to_numpy()
    if valid.sum() < 2:
        return 0.0
    positions = np.arange(len(x))[valid]
    values = x.to_numpy(dtype=float)[valid]
    return np.polyfit(positions, values, 1)[0]


def collapse_obs(df: pd.DataFrame, feature_cols, aggs) -> pd.DataFrame:
    """Collapse row-level data into one feature row per ``obs``.

    Parameters
    ----------
    df : pd.DataFrame
        Row-level frame containing an ``obs`` column and every column named in
        ``feature_cols``. Rows are assumed to already be in chronological
        order within each ``obs``: the ``last``, ``delta`` and ``slope``
        features depend on row order and no sorting is done here.
    feature_cols : list of str
        Columns to aggregate. Names starting with ``num_`` are reduced to
        their mean only; names starting with ``t_`` and all remaining columns
        receive every aggregation in ``aggs``.
    aggs : list of str
        Aggregation names passed to ``GroupBy.agg`` (e.g. ``['mean', 'std']``).

    Returns
    -------
    pd.DataFrame
        One row per ``obs`` (``obs`` is a regular column) with, in order:
        ``<num>_mean`` columns, ``<col>_<agg>`` columns, and for each ``t_``
        column ``<col>_delta`` (last - first), ``<col>_range`` (max - min) and
        ``<col>_slope`` (linear trend over row position), followed by
        ``obs_length`` (number of rows in the observation).
    """
    obs_groups = df.groupby('obs')

    num_cols   = [c for c in feature_cols if c.startswith('num_')]
    t_cols     = [c for c in feature_cols if c.startswith('t_')]
    other_cols = [c for c in feature_cols if c not in num_cols + t_cols]

    num_agg = obs_groups[num_cols].mean()
    num_agg.columns = [f'{c}_mean' for c in num_cols]

    # agg() with a list yields (feature, agg) MultiIndex columns; flatten them.
    full_agg = obs_groups[t_cols + other_cols].agg(aggs)
    full_agg.columns = [f'{feat}_{agg}' for feat, agg in full_agg.columns]

    temporal = {}
    for col in t_cols:
        s = obs_groups[col]
        temporal[f'{col}_delta'] = s.last() - s.first()
        temporal[f'{col}_range'] = s.max() - s.min()
        # NaN-aware fit: a single missing reading must not break the whole
        # feature build, matching the NaN-skipping aggregates above.
        temporal[f'{col}_slope'] = s.apply(_trend_slope)
    temporal_df = pd.DataFrame(temporal)

    obs_length = obs_groups.size().rename('obs_length')

    return pd.concat([num_agg, full_agg, temporal_df, obs_length], axis=1).reset_index()

In [33]:
# Apply the identical aggregation recipe to both splits so their feature
# columns line up.
X_train_df, X_test_df = (
    collapse_obs(raw_frame, feature_cols, aggs) for raw_frame in (df_train, df_test)
)

# One column of each collapsed frame is the 'obs' key, hence the "- 1".
print(f"Train: number of observations: {len(X_train_df)} with {len(X_train_df.columns) - 1} features")
print(f"Test:  number of observations: {len(X_test_df)} with {len(X_test_df.columns) - 1} features")
X_train_df.head()

Train: number of observations: 14420 with 69 features
Test:  number of observations: 3450 with 69 features


Unnamed: 0,obs,num_0_mean,num_1_mean,num_2_mean,t_0_mean,t_0_std,t_0_min,t_0_max,t_0_last,t_1_mean,...,t_2_delta,t_2_range,t_2_slope,t_3_delta,t_3_range,t_3_slope,t_4_delta,t_4_range,t_4_slope,obs_length
0,0,1.38,49.0,7.0,104.083333,4.742477,94.5,110.9,94.5,69.22,...,-7.6,35.2,0.168076,-2.3,9.4,0.013415,-5.5,14.5,0.032392,30
1,1,1.38,49.0,7.0,99.096667,2.738296,93.8,104.1,95.3,92.37,...,6.7,17.8,0.143515,0.3,7.2,-0.036129,2.2,10.5,0.011791,30
2,2,1.38,49.0,7.0,91.3,2.082273,87.3,95.3,91.2,90.49,...,-7.1,17.3,-0.356685,-3.3,7.3,-0.156196,-5.3,11.5,-0.259043,30
3,3,1.38,49.0,7.0,85.996667,0.889589,84.2,87.9,87.1,94.253333,...,0.2,22.2,0.064961,1.6,5.3,0.054683,1.5,10.4,0.078109,30
4,4,1.38,49.0,7.0,86.343333,2.320179,84.1,91.9,86.1,91.156667,...,1.2,14.6,0.256796,-3.8,6.3,-0.05188,-4.0,7.2,-0.014194,30


## 3. Extract Targets

In [34]:
# Take the first value per observation for the targets and for the subject id
# used as the CV group label. groupby('obs') orders by obs, and the positional
# index is reset so rows can be selected with .iloc during cross-validation.
train_by_obs = df_train.groupby('obs')
y_train = train_by_obs[['y_1', 'y_2']].first().reset_index(drop=True)
obs_to_patient = train_by_obs['sub_id'].first().reset_index(drop=True)

n_patients = obs_to_patient.nunique()
print(f"Data frame shape: {y_train.shape}")
print(f"Number of unique patients: {n_patients}")
y_train.head()

Data frame shape: (14420, 2)
Number of unique patients: 1596


Unnamed: 0,y_1,y_2
0,33.4,107.4
1,25.76,92.76
2,19.94,85.56
3,14.14,83.28
4,24.58,83.92


## 4. Prepare Feature Matrices

In [35]:
# Keep the test observation ids for the submission file, then remove the
# 'obs' key so the matrices hold model features only.
test_obs = X_test_df['obs'].copy()
X_train = X_train_df.drop('obs', axis=1)
X_test = X_test_df.drop('obs', axis=1)

## 5. Train XGBoost for `y_1` (GroupKFold CV Grouped by Patient)

In [36]:
# Folds are grouped by patient so that no patient appears in both the
# training and validation part of a split.
gkf = GroupKFold(n_splits=5)

# Hyperparameters shared by every y_1 fold model.
xgb_params_y1 = dict(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=6,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    early_stopping_rounds=50,
    random_state=42,
)

target_y1 = y_train['y_1']
fold_models_y1, scores_y1 = [], []

fold_splits_y1 = gkf.split(X_train, target_y1, groups=obs_to_patient)
for fold_no, (tr_idx, val_idx) in enumerate(fold_splits_y1, start=1):
    X_tr, y_tr = X_train.iloc[tr_idx], target_y1.iloc[tr_idx]
    X_val, y_val = X_train.iloc[val_idx], target_y1.iloc[val_idx]

    model = xgb.XGBRegressor(**xgb_params_y1)
    # NOTE: the validation fold both drives early stopping and is used for
    # the reported MAE, so the CV score is measured on data that influenced
    # the chosen number of trees.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    fold_mae = mean_absolute_error(y_val, model.predict(X_val))
    scores_y1.append(fold_mae)
    fold_models_y1.append(model)
    print(f"Fold {fold_no} MAE: {fold_mae:.4f} (best iteration: {model.best_iteration})")

print(f"\nMean CV MAE for y_1: {np.mean(scores_y1):.4f}")

Fold 1 MAE: 4.9777 (best iteration: 150)
Fold 2 MAE: 5.1010 (best iteration: 108)
Fold 3 MAE: 5.0541 (best iteration: 151)
Fold 4 MAE: 5.1730 (best iteration: 272)
Fold 5 MAE: 4.9982 (best iteration: 136)

Mean CV MAE for y_1: 5.0608


## 6. Train XGBoost for `y_2`

In [37]:
# Hyperparameters shared by every y_2 fold model.
xgb_params_y2 = dict(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=6,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    early_stopping_rounds=50,
    random_state=42,
)

target_y2 = y_train['y_2']
fold_models_y2, scores_y2 = [], []

# Reuses the patient-grouped splitter `gkf` created for the y_1 models.
fold_splits_y2 = gkf.split(X_train, target_y2, groups=obs_to_patient)
for fold_no, (tr_idx, val_idx) in enumerate(fold_splits_y2, start=1):
    X_tr, y_tr = X_train.iloc[tr_idx], target_y2.iloc[tr_idx]
    X_val, y_val = X_train.iloc[val_idx], target_y2.iloc[val_idx]

    model = xgb.XGBRegressor(**xgb_params_y2)
    # NOTE: the validation fold both drives early stopping and is used for
    # the reported MAE, so the CV score is measured on data that influenced
    # the chosen number of trees.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    fold_mae = mean_absolute_error(y_val, model.predict(X_val))
    scores_y2.append(fold_mae)
    fold_models_y2.append(model)
    print(f"Fold {fold_no} MAE: {fold_mae:.4f} (best iteration: {model.best_iteration})")

print(f"\nMean CV MAE for y_2: {np.mean(scores_y2):.4f}")

Fold 1 MAE: 3.5259 (best iteration: 164)
Fold 2 MAE: 3.9395 (best iteration: 167)
Fold 3 MAE: 3.5071 (best iteration: 167)
Fold 4 MAE: 3.2868 (best iteration: 221)
Fold 5 MAE: 3.2879 (best iteration: 192)

Mean CV MAE for y_2: 3.5094


## 7. Predict on Test Set & Build Submission

In [38]:
# Ensemble: average the test predictions of the five fold models per target.
fold_preds_y1 = [fold_model.predict(X_test) for fold_model in fold_models_y1]
fold_preds_y2 = [fold_model.predict(X_test) for fold_model in fold_models_y2]
y1_preds = np.mean(fold_preds_y1, axis=0)
y2_preds = np.mean(fold_preds_y2, axis=0)

# One row per test observation: its id plus both predicted targets.
submission = pd.DataFrame({'obs': test_obs, 'y_1': y1_preds, 'y_2': y2_preds})

submission.head()

Unnamed: 0,obs,y_1,y_2
0,18,41.211327,105.558243
1,19,33.266994,100.155525
2,20,36.631569,96.687119
3,21,35.407043,95.851074
4,22,36.215164,95.319443


In [39]:
# Write the predictions to disk without the DataFrame index column.
submission_path = 'james_test_submission.csv'
submission.to_csv(submission_path, index=False)