# XGBoost 

In [38]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [39]:
df_train = pd.read_csv("./train_competition_2026.csv")
df_train.head()

Unnamed: 0,obs,sub_id,time,num_0,num_1,num_2,cat_0,cat_1,cat_2,cat_3,cat_4,t_0,t_1,t_2,t_3,t_4,y_1,y_2
0,0,0,2068-09-19 23:34:11,1.38,49,7,1,3,1,0,1,105.5,95.0,67.4,36.6,23.2,33.4,107.4
1,0,0,2068-09-19 23:35:11,1.38,49,7,1,3,1,0,1,104.4,95.0,66.4,37.8,22.7,33.4,107.4
2,0,0,2068-09-19 23:36:11,1.38,49,7,1,3,1,0,1,104.0,95.0,65.2,37.0,22.1,33.4,107.4
3,0,0,2068-09-19 23:37:11,1.38,49,7,1,3,1,0,1,102.8,95.0,63.4,35.9,20.7,33.4,107.4
4,0,0,2068-09-19 23:38:11,1.38,49,7,1,3,1,0,1,101.3,95.1,59.1,34.5,18.1,33.4,107.4


In [40]:
df_test = pd.read_csv("./test_no_outcome.csv")
df_test.head()

Unnamed: 0,obs,sub_id,time,num_0,num_1,num_2,cat_0,cat_1,cat_2,cat_3,cat_4,t_0,t_1,t_2,t_3,t_4
0,18,1,2134-04-01 22:23:14,-1.0,38,1,1,1,0,0,0,105.4,99.8,50.7,61.4,36.8
1,18,1,2134-04-01 22:24:14,-1.0,38,1,1,1,0,0,0,105.4,99.4,49.4,61.1,36.2
2,18,1,2134-04-01 22:25:14,-1.0,38,1,1,1,0,0,0,104.6,99.0,49.7,61.4,36.6
3,18,1,2134-04-01 22:26:14,-1.0,38,1,1,1,0,0,0,104.5,99.6,51.7,61.8,37.2
4,18,1,2134-04-01 22:27:14,-1.0,38,1,1,1,0,0,0,104.6,99.5,52.5,61.9,37.5


## 1. Verify Target is Constant per Observation

In [41]:
target_std = df_train.groupby('obs')[['y_1', 'y_2']].std()
print("Max std of y_1 within any obs:", target_std['y_1'].max())
print("Max std of y_2 within any obs:", target_std['y_2'].max())

Max std of y_1 within any obs: 0.0
Max std of y_2 within any obs: 0.0


## 2. Collapse Observations into Aggregate Features

In [42]:
feature_cols = [c for c in df_train.columns if c not in ['obs', 'sub_id', 'time', 'y_1', 'y_2']]
aggs = ['mean', 'std', 'min', 'max', 'last']

print(f"Feature columns to aggregate: {feature_cols}")
print(f"Aggregations: {aggs}")

Feature columns to aggregate: ['num_0', 'num_1', 'num_2', 'cat_0', 'cat_1', 'cat_2', 'cat_3', 'cat_4', 't_0', 't_1', 't_2', 't_3', 't_4']
Aggregations: ['mean', 'std', 'min', 'max', 'last']


In [43]:
def collapse_obs(df, feature_cols, aggs):
    grouped = df.groupby('obs')[feature_cols].agg(aggs)
    grouped.columns = [f'{feat}_{agg}' for feat, agg in grouped.columns]

    grouped['obs_length'] = df.groupby('obs').size()
    
    return grouped.reset_index()

In [44]:
X_train_df = collapse_obs(df_train, feature_cols, aggs)
X_test_df = collapse_obs(df_test, feature_cols, aggs)

print(f"Train: number of observations: {X_train_df.shape[0]} with {X_train_df.shape[1]-1} features")
print(f"Test:  number of observations: {X_test_df.shape[0]} with {X_test_df.shape[1]-1} features")
X_train_df.head()

Train: number of observations: 14420 with 66 features
Test:  number of observations: 3450 with 66 features


Unnamed: 0,obs,num_0_mean,num_0_std,num_0_min,num_0_max,num_0_last,num_1_mean,num_1_std,num_1_min,num_1_max,...,t_3_std,t_3_min,t_3_max,t_3_last,t_4_mean,t_4_std,t_4_min,t_4_max,t_4_last,obs_length
0,0,1.38,0.0,1.38,1.38,1.38,49.0,0.0,49,49,...,1.990959,31.0,40.4,34.3,21.88,3.265113,14.3,28.8,17.7,30
1,1,1.38,0.0,1.38,1.38,1.38,49.0,0.0,49,49,...,1.888513,36.7,43.9,38.0,24.026667,2.768534,19.8,30.3,22.0,30
2,2,1.38,0.0,1.38,1.38,1.38,49.0,0.0,49,49,...,2.041526,34.9,42.2,36.1,21.453333,3.00594,17.5,29.0,19.1,30
3,3,1.38,0.0,1.38,1.38,1.38,49.0,0.0,49,49,...,1.079272,34.1,39.4,36.4,20.696667,1.928459,17.4,27.8,20.8,30
4,4,1.38,0.0,1.38,1.38,1.38,49.0,0.0,49,49,...,1.162854,32.5,38.8,32.5,21.5,1.469459,17.8,25.0,17.8,30


## 3. Extract Targets

In [45]:
y_train = df_train.groupby('obs')[['y_1', 'y_2']].first().reset_index(drop=True)

obs_to_patient = df_train.groupby('obs')['sub_id'].first().reset_index(drop=True)

print(f"Data frame shape: {y_train.shape}")
print(f"Number of unique patients: {obs_to_patient.nunique()}")
y_train.head()

Data frame shape: (14420, 2)
Number of unique patients: 1596


Unnamed: 0,y_1,y_2
0,33.4,107.4
1,25.76,92.76
2,19.94,85.56
3,14.14,83.28
4,24.58,83.92


## 4. Prepare Feature Matrices

In [46]:
test_obs = X_test_df['obs'].copy()
X_train = X_train_df.drop(columns=['obs'])
X_test = X_test_df.drop(columns=['obs'])

## 5. Hyperparameter Tuning with Cross-Validation

We'll use RandomizedSearchCV with GroupKFold to find optimal hyperparameters while properly evaluating generalization.

In [47]:
# Create GroupKFold for cross-validation
gkf = GroupKFold(n_splits=5)

# Define hyperparameter search space for y_1
param_grid_y1 = {
    'n_estimators': [500, 800, 1000, 1200],
    'learning_rate': [0.01, 0.02, 0.03, 0.05],
    'max_depth': [4, 5, 6, 7, 8],
    'min_child_weight': [3, 5, 7, 10],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [1, 1.5, 2.0],
}

def hyperparameter_search_with_cv(X, y, groups, param_grid, n_iter=50, n_splits=5, random_state=42):
    """
    Custom hyperparameter search with GroupKFold CV and proper early stopping for XGBoost.
    Returns best parameters and best CV score.
    """
    np.random.seed(random_state)
    gkf = GroupKFold(n_splits=n_splits)
    
    # Sample random parameter combinations
    param_combinations = []
    param_keys = list(param_grid.keys())
    param_values = [param_grid[k] for k in param_keys]
    
    for _ in range(n_iter):
        combo = {}
        for key, values in param_grid.items():
            combo[key] = np.random.choice(values)
        param_combinations.append(combo)
    
    best_score = float('inf')
    best_params = None
    results = []
    
    print(f"Testing {n_iter} parameter combinations with {n_splits}-fold CV...")
    
    for i, params in enumerate(param_combinations):
        cv_scores = []
        
        for fold, (tr_idx, val_idx) in enumerate(gkf.split(X, y, groups)):
            X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]
            
            model = xgb.XGBRegressor(
                **params,
                tree_method='hist',
                early_stopping_rounds=50,
                random_state=random_state,
                n_jobs=-1
            )
            
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                verbose=False
            )
            
            val_preds = model.predict(X_val)
            fold_mae = mean_absolute_error(y_val, val_preds)
            cv_scores.append(fold_mae)
        
        mean_cv_score = np.mean(cv_scores)
        std_cv_score = np.std(cv_scores)
        results.append({
            'params': params,
            'mean_cv_score': mean_cv_score,
            'std_cv_score': std_cv_score,
            'cv_scores': cv_scores
        })
        
        if mean_cv_score < best_score:
            best_score = mean_cv_score
            best_params = params
        
        if (i + 1) % 10 == 0:
            print(f"  Completed {i+1}/{n_iter} combinations. Best CV MAE so far: {best_score:.4f}")
    
    print(f"\nBest CV MAE: {best_score:.4f}")
    return best_params, best_score, results

print("Starting hyperparameter search for y_1...")
print("This may take several minutes...\n")

best_params_y1, best_score_y1, search_results_y1 = hyperparameter_search_with_cv(
    X_train, 
    y_train['y_1'], 
    obs_to_patient,
    param_grid_y1,
    n_iter=50,
    n_splits=5,
    random_state=42
)

print("\n" + "="*60)
print("HYPERPARAMETER SEARCH RESULTS FOR y_1")
print("="*60)
print(f"Best CV Score (MAE): {best_score_y1:.4f}")
print(f"Best Parameters:")
for param, value in best_params_y1.items():
    print(f"  {param}: {value}")

# Store best parameters for later use
random_search_y1 = type('obj', (object,), {'best_params_': best_params_y1, 'best_score_': best_score_y1})()

# Out-of-fold validation to select champion model for y_1
print("\n" + "="*60)
print("OUT-OF-FOLD VALIDATION FOR y_1 (5-Fold CV)")
print("="*60)
print("Training models on each fold and collecting out-of-fold predictions...\n")

fold_models_y1 = []
oof_predictions_y1 = np.zeros(len(X_train))
cv_scores_y1 = []

for i, (tr_idx, val_idx) in enumerate(gkf.split(X_train, y_train['y_1'], groups=obs_to_patient)):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train['y_1'].iloc[tr_idx], y_train['y_1'].iloc[val_idx]
    
    # Train model with best hyperparameters
    model = xgb.XGBRegressor(**best_params_y1,
                             tree_method='hist',
                             early_stopping_rounds=50,
                             random_state=42)
    
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # Make out-of-fold predictions
    val_preds = model.predict(X_val)
    fold_mae = mean_absolute_error(y_val, val_preds)
    fold_r2 = r2_score(y_val, val_preds)
    
    # Store predictions and model
    oof_predictions_y1[val_idx] = val_preds
    fold_models_y1.append(model)
    cv_scores_y1.append(fold_mae)
    
    print(f"Fold {i+1}: MAE = {fold_mae:.4f}, R² = {fold_r2:.4f}, Best iteration: {model.best_iteration}")

# Calculate overall out-of-fold score
oof_mae_y1 = mean_absolute_error(y_train['y_1'], oof_predictions_y1)
oof_r2_y1 = r2_score(y_train['y_1'], oof_predictions_y1)

print(f"\nOut-of-Fold Performance for y_1:")
print(f"  OOF MAE: {oof_mae_y1:.4f}")
print(f"  OOF R²: {oof_r2_y1:.4f}")
print(f"  Mean Fold MAE: {np.mean(cv_scores_y1):.4f} ± {np.std(cv_scores_y1):.4f}")
print(f"  Best Fold MAE: {np.min(cv_scores_y1):.4f}")
print(f"  Worst Fold MAE: {np.max(cv_scores_y1):.4f}")

# Create ensemble function that averages predictions from all fold models
def ensemble_predict_y1(X):
    """Average predictions from all 5 fold models for y_1"""
    predictions = np.zeros(len(X))
    for model in fold_models_y1:
        predictions += model.predict(X)
    return predictions / len(fold_models_y1)

# Store ensemble models for later use
ensemble_models_y1 = fold_models_y1
print(f"\n✓ Ensemble created for y_1: Averaging predictions from all {len(fold_models_y1)} fold models")
print(f"  Individual fold MAEs: {[f'{score:.4f}' for score in cv_scores_y1]}")
print(f"  Best fold MAE: {np.min(cv_scores_y1):.4f}, Worst fold MAE: {np.max(cv_scores_y1):.4f}")

Starting hyperparameter search for y_1...
This may take several minutes...

Testing 50 parameter combinations with 5-fold CV...
  Completed 10/50 combinations. Best CV MAE so far: 5.0440
  Completed 20/50 combinations. Best CV MAE so far: 5.0440
  Completed 30/50 combinations. Best CV MAE so far: 5.0440
  Completed 40/50 combinations. Best CV MAE so far: 5.0440
  Completed 50/50 combinations. Best CV MAE so far: 5.0440

Best CV MAE: 5.0440

HYPERPARAMETER SEARCH RESULTS FOR y_1
Best CV Score (MAE): 5.0440
Best Parameters:
  n_estimators: 1000
  learning_rate: 0.01
  max_depth: 4
  min_child_weight: 7
  subsample: 0.8
  colsample_bytree: 0.7
  reg_alpha: 1.0
  reg_lambda: 1.5

OUT-OF-FOLD VALIDATION FOR y_1 (5-Fold CV)
Training models on each fold and collecting out-of-fold predictions...

Fold 1: MAE = 4.9633, R² = 0.7548, Best iteration: 808
Fold 2: MAE = 5.0404, R² = 0.7035, Best iteration: 478
Fold 3: MAE = 4.9953, R² = 0.7425, Best iteration: 558
Fold 4: MAE = 5.1920, R² = 0.6947, 

## 6. Hyperparameter Tuning for y_2

In [48]:
# Define hyperparameter search space for y_2
param_grid_y2 = {
    'n_estimators': [500, 800, 1000, 1200],
    'learning_rate': [0.01, 0.02, 0.03, 0.05],
    'max_depth': [4, 5, 6, 7, 8],
    'min_child_weight': [3, 5, 7, 10],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [1, 1.5, 2.0],
}

print("Starting hyperparameter search for y_2...")
print("This may take several minutes...\n")

best_params_y2, best_score_y2, search_results_y2 = hyperparameter_search_with_cv(
    X_train, 
    y_train['y_2'], 
    obs_to_patient,
    param_grid_y2,
    n_iter=50,
    n_splits=5,
    random_state=42
)

print("\n" + "="*60)
print("HYPERPARAMETER SEARCH RESULTS FOR y_2")
print("="*60)
print(f"Best CV Score (MAE): {best_score_y2:.4f}")
print(f"Best Parameters:")
for param, value in best_params_y2.items():
    print(f"  {param}: {value}")

# Store best parameters for later use
random_search_y2 = type('obj', (object,), {'best_params_': best_params_y2, 'best_score_': best_score_y2})()

# Out-of-fold validation to select champion model for y_2
print("\n" + "="*60)
print("OUT-OF-FOLD VALIDATION FOR y_2 (5-Fold CV)")
print("="*60)
print("Training models on each fold and collecting out-of-fold predictions...\n")

fold_models_y2 = []
oof_predictions_y2 = np.zeros(len(X_train))
cv_scores_y2 = []

for i, (tr_idx, val_idx) in enumerate(gkf.split(X_train, y_train['y_2'], groups=obs_to_patient)):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train['y_2'].iloc[tr_idx], y_train['y_2'].iloc[val_idx]
    
    # Train model with best hyperparameters
    model = xgb.XGBRegressor(**best_params_y2,
                             tree_method='hist',
                             early_stopping_rounds=50,
                             random_state=42)
    
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # Make out-of-fold predictions
    val_preds = model.predict(X_val)
    fold_mae = mean_absolute_error(y_val, val_preds)
    fold_r2 = r2_score(y_val, val_preds)
    
    # Store predictions and model
    oof_predictions_y2[val_idx] = val_preds
    fold_models_y2.append(model)
    cv_scores_y2.append(fold_mae)
    
    print(f"Fold {i+1}: MAE = {fold_mae:.4f}, R² = {fold_r2:.4f}, Best iteration: {model.best_iteration}")

# Calculate overall out-of-fold score
oof_mae_y2 = mean_absolute_error(y_train['y_2'], oof_predictions_y2)
oof_r2_y2 = r2_score(y_train['y_2'], oof_predictions_y2)

print(f"\nOut-of-Fold Performance for y_2:")
print(f"  OOF MAE: {oof_mae_y2:.4f}")
print(f"  OOF R²: {oof_r2_y2:.4f}")
print(f"  Mean Fold MAE: {np.mean(cv_scores_y2):.4f} ± {np.std(cv_scores_y2):.4f}")
print(f"  Best Fold MAE: {np.min(cv_scores_y2):.4f}")
print(f"  Worst Fold MAE: {np.max(cv_scores_y2):.4f}")

# Create ensemble function that averages predictions from all fold models
def ensemble_predict_y2(X):
    """Average predictions from all 5 fold models for y_2"""
    predictions = np.zeros(len(X))
    for model in fold_models_y2:
        predictions += model.predict(X)
    return predictions / len(fold_models_y2)

# Store ensemble models for later use
ensemble_models_y2 = fold_models_y2
print(f"\n✓ Ensemble created for y_2: Averaging predictions from all {len(fold_models_y2)} fold models")
print(f"  Individual fold MAEs: {[f'{score:.4f}' for score in cv_scores_y2]}")
print(f"  Best fold MAE: {np.min(cv_scores_y2):.4f}, Worst fold MAE: {np.max(cv_scores_y2):.4f}")

Starting hyperparameter search for y_2...
This may take several minutes...

Testing 50 parameter combinations with 5-fold CV...
  Completed 10/50 combinations. Best CV MAE so far: 3.5191
  Completed 20/50 combinations. Best CV MAE so far: 3.5191
  Completed 30/50 combinations. Best CV MAE so far: 3.5191
  Completed 40/50 combinations. Best CV MAE so far: 3.5191
  Completed 50/50 combinations. Best CV MAE so far: 3.5191

Best CV MAE: 3.5191

HYPERPARAMETER SEARCH RESULTS FOR y_2
Best CV Score (MAE): 3.5191
Best Parameters:
  n_estimators: 800
  learning_rate: 0.02
  max_depth: 4
  min_child_weight: 5
  subsample: 0.7
  colsample_bytree: 0.8
  reg_alpha: 1.0
  reg_lambda: 2.0

OUT-OF-FOLD VALIDATION FOR y_2 (5-Fold CV)
Training models on each fold and collecting out-of-fold predictions...

Fold 1: MAE = 3.5364, R² = 0.8610, Best iteration: 226
Fold 2: MAE = 3.9549, R² = 0.8482, Best iteration: 321
Fold 3: MAE = 3.4970, R² = 0.8847, Best iteration: 325
Fold 4: MAE = 3.3160, R² = 0.8829, B

## 7. Train Champion Models on Full Training Data

Now we train the final champion models using the best hyperparameters on the entire training set.

In [49]:
# Train champion models on full training data with best hyperparameters
print("Training champion model for y_1 on full training data...")
champion_model_y1 = xgb.XGBRegressor(**best_params_y1,
                                     tree_method='hist',
                                     early_stopping_rounds=50,
                                     random_state=42)

# Use a validation split for early stopping
gkf_temp = GroupKFold(n_splits=5)
train_idx, val_idx = next(gkf_temp.split(X_train, y_train['y_1'], groups=obs_to_patient))
X_tr_temp, X_val_temp = X_train.iloc[train_idx], X_train.iloc[val_idx]
y_tr_temp, y_val_temp = y_train['y_1'].iloc[train_idx], y_train['y_1'].iloc[val_idx]

champion_model_y1.fit(
    X_tr_temp, y_tr_temp,
    eval_set=[(X_val_temp, y_val_temp)],
    verbose=False
)

print("Training champion model for y_2 on full training data...")
champion_model_y2 = xgb.XGBRegressor(**best_params_y2,
                                     tree_method='hist',
                                     early_stopping_rounds=50,
                                     random_state=42)

X_tr_temp, X_val_temp = X_train.iloc[train_idx], X_train.iloc[val_idx]
y_tr_temp, y_val_temp = y_train['y_2'].iloc[train_idx], y_train['y_2'].iloc[val_idx]

champion_model_y2.fit(
    X_tr_temp, y_tr_temp,
    eval_set=[(X_val_temp, y_val_temp)],
    verbose=False
)

print("\nChampion models trained successfully!")
print(f"y_1 model best iteration: {champion_model_y1.best_iteration}")
print(f"y_2 model best iteration: {champion_model_y2.best_iteration}")

Training champion model for y_1 on full training data...
Training champion model for y_2 on full training data...

Champion models trained successfully!
y_1 model best iteration: 808
y_2 model best iteration: 226


In [50]:
## 8. Predict on Test Set & Build Submission

In [56]:
print("Making ensemble predictions by averaging all 5 fold models...")
y1_preds = ensemble_predict_y1(X_test)
y2_preds = ensemble_predict_y2(X_test)
submission = pd.DataFrame({
    'obs': test_obs,
    'y_1': y1_preds,
    'y_2': y2_preds,
})

print("Submission DataFrame:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print(f"y_1 predictions - Mean: {y1_preds.mean():.2f}, Std: {y1_preds.std():.2f}")
print(f"y_2 predictions - Mean: {y2_preds.mean():.2f}, Std: {y2_preds.std():.2f}")
submission.to_csv('p2_test_submission.csv', index=False)
print("Submission saved to 'james_test_submission.csv'")

Making ensemble predictions by averaging all 5 fold models...
Submission DataFrame:
   obs        y_1         y_2
0   18  40.943437  105.240784
1   19  33.008796  100.355475
2   20  36.737520   96.149872
3   21  35.188933   95.985406
4   22  36.178406   94.742307

Submission shape: (3450, 3)
y_1 predictions - Mean: 42.36, Std: 11.46
y_2 predictions - Mean: 82.56, Std: 15.03
Submission saved to 'james_test_submission.csv'


In [51]:
# Make predictions on test set using champion models
y1_preds = champion_model_y1.predict(X_test)
y2_preds = champion_model_y2.predict(X_test)

submission = pd.DataFrame({
    'obs': test_obs,
    'y_1': y1_preds,
    'y_2': y2_preds,
})

print("Submission DataFrame:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print(f"y_1 predictions - Mean: {y1_preds.mean():.2f}, Std: {y1_preds.std():.2f}")
print(f"y_2 predictions - Mean: {y2_preds.mean():.2f}, Std: {y2_preds.std():.2f}")

Submission DataFrame:
   obs        y_1         y_2
0   18  41.116669  104.552063
1   19  33.222843   99.894547
2   20  37.480644   96.115555
3   21  35.273235   96.061989
4   22  37.710545   94.731895

Submission shape: (3450, 3)
y_1 predictions - Mean: 42.41, Std: 11.58
y_2 predictions - Mean: 82.55, Std: 15.01


In [52]:
submission.to_csv('p2_test_submission.csv', index=False)
print("Submission saved to 'james_test_submission.csv'")

Submission saved to 'james_test_submission.csv'
