In [21]:
!pip install optuna



In [22]:
# =============================================================================
# COMPLETE XGBoost + Optuna Tuning Pipeline (GPU Accelerated)
# Playground Series S6E1 - Exam Score Prediction
# Includes: Strong Feature Engineering + Hyperparameter Tuning + Final 7-Fold Model
# =============================================================================

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

In [23]:
# ----------------------------- Configuration ---------------------------------
# Name of the column the model predicts.
TARGET = 'exam_score'

# Cross-validation fold counts: one for hyperparameter tuning, one for the final fit.
N_FOLDS_TUNE, N_FOLDS_FINAL = 5, 7

# Single seed reused by NumPy's global RNG and the seeded components further down.
SEED = 42
np.random.seed(SEED)

In [24]:
# File paths (Kaggle environment)
# The competition files share one input directory; the original dataset lives in another.
COMPETITION_DIR = "/kaggle/input/playground-series-s6e1"
ORIGINAL_DIR = "/kaggle/input/exam-score-prediction-dataset"

TRAIN_PATH = f"{COMPETITION_DIR}/train.csv"
TEST_PATH = f"{COMPETITION_DIR}/test.csv"
SUBMISSION_PATH = f"{COMPETITION_DIR}/sample_submission.csv"
ORIGINAL_PATH = f"{ORIGINAL_DIR}/Exam_Score_Prediction.csv"

In [25]:
# ----------------------------- Data Loading ----------------------------------
print("Loading data...")
# Read the four CSVs in a fixed order: train, test, original dataset, sample submission.
train_df, test_df, original_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_PATH, TEST_PATH, ORIGINAL_PATH, SUBMISSION_PATH)
)

# Model inputs: every training column except the row id and the target.
non_feature_cols = {'id', TARGET}
base_features = [name for name in train_df.columns if name not in non_feature_cols]

Loading data...


In [26]:
# ----------------------------- Feature Engineering ---------------------------
def preprocess(df: pd.DataFrame, feature_cols=None) -> pd.DataFrame:
    """Build the model input frame: raw features as strings plus engineered numerics.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the numeric columns ``study_hours``, ``class_attendance``,
        ``sleep_hours`` and ``age``, plus every column named in ``feature_cols``.
        The frame is copied first, so the caller's object is left untouched.
    feature_cols : sequence of str, optional
        Raw columns to keep and cast to ``str``. When omitted, the module-level
        ``base_features`` list is used, which is the original behaviour.

    Returns
    -------
    pd.DataFrame
        ``feature_cols`` (as strings) followed by the eleven engineered numeric
        columns, in that order.
    """
    if feature_cols is None:
        feature_cols = base_features
    feature_cols = list(feature_cols)

    df = df.copy()

    # Fixed linear combination of three raw inputs.
    # NOTE(review): the coefficients look like a fitted linear model; their
    # origin is not shown in this notebook.
    df['feature_formula'] = (
        5.9051154511950499 * df['study_hours'] +
        0.34540967058057986 * df['class_attendance'] +
        1.423461171860262 * df['sleep_hours'] + 4.7819
    )

    # Polynomial features
    df['study_hours_squared'] = df['study_hours'] ** 2
    df['study_hours_cubed'] = df['study_hours'] ** 3
    df['class_attendance_squared'] = df['class_attendance'] ** 2
    df['sleep_hours_squared'] = df['sleep_hours'] ** 2
    df['age_squared'] = df['age'] ** 2

    # Log and sqrt transformations
    df['log_study_hours'] = np.log1p(df['study_hours'])
    df['log_class_attendance'] = np.log1p(df['class_attendance'])
    df['log_sleep_hours'] = np.log1p(df['sleep_hours'])
    df['sqrt_study_hours'] = np.sqrt(df['study_hours'])
    df['sqrt_class_attendance'] = np.sqrt(df['class_attendance'])

    # Cast the raw columns to strings only AFTER the numeric features above have
    # been derived from them; they are turned into categoricals downstream.
    # NOTE(review): str() renders 7 and 7.0 differently, so a column read as int
    # in one file and float in another would yield mismatched labels. Confirm
    # the dtypes agree across the input files.
    for col in feature_cols:
        df[col] = df[col].astype(str)

    engineered_numeric = [
        'feature_formula', 'study_hours_squared', 'study_hours_cubed',
        'class_attendance_squared', 'sleep_hours_squared', 'age_squared',
        'log_study_hours', 'log_class_attendance', 'log_sleep_hours',
        'sqrt_study_hours', 'sqrt_class_attendance'
    ]

    return df[feature_cols + engineered_numeric]

print("Preprocessing data...")
# Apply the same feature pipeline to the train, test and original frames.
X, X_test, X_original = (
    preprocess(frame) for frame in (train_df, test_df, original_df)
)

# Targets exist only for the two labelled sets.
y = train_df[TARGET].values
y_original = original_df[TARGET].values

Preprocessing data...


In [27]:
# Ensure consistent categorical dtypes
# Stack all three frames so every raw column gets one shared set of categories.
full_data = pd.concat([X, X_test, X_original], axis=0)

engineered_cols = [name for name in full_data.columns if name not in base_features]

# Raw columns become pandas categoricals; engineered columns become floats.
dtype_plan = {name: 'category' for name in base_features}
dtype_plan.update({name: float for name in engineered_cols})
full_data = full_data.astype(dtype_plan)
    

In [28]:
# Split back
# Rows were stacked as train, then test, then original, so slice by position.
n_train = len(train_df)
n_test = len(test_df)
X, X_test, X_original = (
    full_data.iloc[:n_train].copy(),
    full_data.iloc[n_train:n_train + n_test].copy(),
    full_data.iloc[n_train + n_test:].copy(),
)

In [29]:
# ----------------------------- Optuna Objective Function -----------------------------
def objective(trial):
    """Score one hyperparameter configuration by out-of-fold RMSE.

    For each of the ``N_FOLDS_TUNE`` folds an ``XGBRegressor`` is trained on the
    fold's training rows plus the whole original dataset, and early-stopped on
    the held-out fold. Because the held-out fold also drives early stopping,
    the returned RMSE is slightly optimistic.

    After every fold the RMSE over all rows validated so far is reported to
    Optuna, so a pruner attached to the study can stop unpromising trials
    before all folds are trained.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample hyperparameters and report progress.

    Returns
    -------
    float
        RMSE of the out-of-fold predictions over all of ``X``.

    Raises
    ------
    optuna.TrialPruned
        When the study's pruner decides to abandon the trial between folds.
    """
    params = {
        'n_estimators': 3000,  # Reduced for faster tuning (final model uses 10000)
        'learning_rate': trial.suggest_float('learning_rate', 0.003, 0.02, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 10.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),

        'tree_method': 'hist',
        'device': 'cuda',

        'random_state': SEED,
        'eval_metric': 'rmse',
        'early_stopping_rounds': 150,
        'enable_categorical': True,
        'verbosity': 0
    }

    # Same seed for every trial, so all trials are scored on identical folds and
    # their per-step intermediate values are directly comparable.
    kf = KFold(n_splits=N_FOLDS_TUNE, shuffle=True, random_state=SEED)
    oof_vals = np.zeros(len(X))
    scored = np.zeros(len(X), dtype=bool)  # rows that already have an OOF prediction

    for step, (train_idx, val_idx) in enumerate(kf.split(X)):
        X_tr = X.iloc[train_idx]
        y_tr = y[train_idx]
        X_val = X.iloc[val_idx]
        y_val = y[val_idx]

        # The original dataset is added to training only; it is never validated on.
        X_tr_full = pd.concat([X_tr, X_original], axis=0)
        y_tr_full = np.concatenate([y_tr, y_original])

        model = xgb.XGBRegressor(**params)
        model.fit(X_tr_full, y_tr_full, eval_set=[(X_val, y_val)], verbose=False)

        oof_vals[val_idx] = model.predict(X_val)
        scored[val_idx] = True

        # Without a report the study's pruner has nothing to act on.
        running_rmse = float(np.sqrt(mean_squared_error(y[scored], oof_vals[scored])))
        trial.report(running_rmse, step)

        # Pruning after the last fold would discard a finished trial for no saving.
        if step < N_FOLDS_TUNE - 1 and trial.should_prune():
            raise optuna.TrialPruned()

    # np.sqrt of the MSE works on every scikit-learn version.
    return np.sqrt(mean_squared_error(y, oof_vals))

In [30]:
# ----------------------------- Run Optuna Tuning -----------------------------
N_TRIALS = 20  # number of hyperparameter configurations to evaluate

print("\n=== Starting Optuna Tuning ===")
sampler = TPESampler(seed=SEED)
# MedianPruner only acts on values passed to trial.report() and ignores steps
# below n_warmup_steps. When an objective reports one step per CV fold, a trial
# has at most N_FOLDS_TUNE (5) steps, so the previous warm-up of 5 covered all
# of them and pruning could never fire. Shield only the first fold instead.
pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=1)
study = optuna.create_study(direction='minimize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=N_TRIALS)

print("\n=== BEST TRIAL ===")
print(f"Best OOF RMSE: {study.best_value:.5f}")
print("Best parameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

[32m[I 2026-01-05 10:14:28,227][0m A new study created in memory with name: no-name-7f2ad01b-f04b-4b1f-ba3c-ff2efb1411c2[0m



=== Starting Optuna Tuning ===


[32m[I 2026-01-05 10:24:17,893][0m Trial 0 finished with value: 8.645682578938827 and parameters: {'learning_rate': 0.006105315793640882, 'max_depth': 10, 'subsample': 0.892797576724562, 'colsample_bytree': 0.7993292420985183, 'colsample_bynode': 0.5780093202212182, 'reg_lambda': 2.403950683025824, 'reg_alpha': 0.2904180608409973, 'min_child_weight': 17.457346769723767, 'gamma': 3.005575058716044}. Best is trial 0 with value: 8.645682578938827.[0m
[32m[I 2026-01-05 10:25:20,658][0m Trial 1 finished with value: 8.67478165379482 and parameters: {'learning_rate': 0.01149498584437033, 'max_depth': 5, 'subsample': 0.9879639408647978, 'colsample_bytree': 0.9162213204002109, 'colsample_bynode': 0.6061695553391381, 'reg_lambda': 2.636424704863906, 'reg_alpha': 0.9170225492671691, 'min_child_weight': 6.780602616231217, 'gamma': 2.6237821581611893}. Best is trial 0 with value: 8.645682578938827.[0m
[32m[I 2026-01-05 10:28:52,731][0m Trial 2 finished with value: 8.642870812411399 and para


=== BEST TRIAL ===
Best OOF RMSE: 8.63865
Best parameters:
  learning_rate: 0.00419151003863029
  max_depth: 8
  subsample: 0.7424364893339974
  colsample_bytree: 0.512141581748615
  colsample_bynode: 0.7171781358782056
  reg_lambda: 4.301600071004415
  reg_alpha: 3.0976697156679953
  min_child_weight: 13.841288416331665
  gamma: 0.12418376001879206


In [31]:
# ----------------------------- Train Final Model with Best Params -----------------------------
# Fixed settings first; the tuned values from the study are layered on top.
best_params = {
    'n_estimators': 10000,
    'early_stopping_rounds': 150,
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'device': 'cuda',
    'enable_categorical': True,
    'random_state': SEED,
    'verbosity': 0
}
best_params.update(study.best_params)

print(f"\n=== Training Final {N_FOLDS_FINAL}-Fold Model with Tuned Parameters ===")
print("Best parameters used:")
for k, v in best_params.items():
    if k in study.best_params:
        # Arrow written as the real character; it was previously mis-encoded.
        print(f"   {k}: {v}  ← tuned")
    else:
        print(f"   {k}: {v}")

kf = KFold(n_splits=N_FOLDS_FINAL, shuffle=True, random_state=SEED)
oof_predictions = np.zeros(len(X))      # one held-out prediction per training row
test_predictions = np.zeros(len(X_test))  # running average over the fold models
fold_rmses = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n--- Fold {fold + 1}/{N_FOLDS_FINAL} ---")

    X_train = X.iloc[train_idx]
    y_train = y[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y[val_idx]

    # Combine with original data for training; validation uses competition rows only.
    X_train_full = pd.concat([X_train, X_original], axis=0)
    y_train_full = np.concatenate([y_train, y_original])

    model = xgb.XGBRegressor(**best_params)
    model.fit(
        X_train_full, y_train_full,
        eval_set=[(X_val, y_val)],  # also the set that drives early stopping
        verbose=1000
    )

    # Predictions
    val_pred = model.predict(X_val)
    oof_predictions[val_idx] = val_pred

    # np.sqrt of the MSE works on every scikit-learn version.
    fold_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    fold_rmses.append(fold_rmse)
    print(f"Fold {fold + 1} RMSE: {fold_rmse:.5f}")

    # Each fold contributes 1/N_FOLDS_FINAL, so the total is the mean over folds.
    test_predictions += model.predict(X_test) / N_FOLDS_FINAL


=== Training Final 7-Fold Model with Tuned Parameters ===
Best parameters used:
   n_estimators: 10000
   early_stopping_rounds: 150
   eval_metric: rmse
   tree_method: hist
   device: cuda
   enable_categorical: True
   random_state: 42
   verbosity: 0
   learning_rate: 0.00419151003863029  ← tuned
   max_depth: 8  ← tuned
   subsample: 0.7424364893339974  ← tuned
   colsample_bytree: 0.512141581748615  ← tuned
   colsample_bynode: 0.7171781358782056  ← tuned
   reg_lambda: 4.301600071004415  ← tuned
   reg_alpha: 3.0976697156679953  ← tuned
   min_child_weight: 13.841288416331665  ← tuned
   gamma: 0.12418376001879206  ← tuned

--- Fold 1/7 ---
[0]	validation_0-rmse:18.79980
[1000]	validation_0-rmse:8.65891
[2000]	validation_0-rmse:8.60402
[2705]	validation_0-rmse:8.60285
Fold 1 RMSE: 8.60274

--- Fold 2/7 ---
[0]	validation_0-rmse:18.86033
[1000]	validation_0-rmse:8.70398
[2000]	validation_0-rmse:8.64449
[3000]	validation_0-rmse:8.64218
[3040]	validation_0-rmse:8

In [32]:
# ----------------------------- Final Results ---------------------------------------
# Overall RMSE over every out-of-fold prediction, plus the spread across folds.
oof_rmse = np.sqrt(mean_squared_error(y, oof_predictions))

print("\n" + "="*60)
print(f"FINAL TUNED OOF RMSE: {oof_rmse:.5f}")
# Plus-minus sign written as the real character; it was previously mis-encoded.
print(f"Mean Fold RMSE: {np.mean(fold_rmses):.5f} ± {np.std(fold_rmses):.5f}")
print("="*60)


FINAL TUNED OOF RMSE: 8.63445
Mean Fold RMSE: 8.63442 ± 0.02037


In [33]:
# Save OOF and submission
# Out-of-fold predictions are written unclipped, keyed by the training ids.
oof_df = pd.DataFrame({'id': train_df['id'], TARGET: oof_predictions})
oof_df.to_csv('xgb_tuned_oof.csv', index=False)

# Submission predictions are clipped to the [0, 100] range before saving.
submission_df[TARGET] = np.clip(test_predictions, 0, 100)
submission_df.to_csv('submission_xgb_tuned.csv', index=False)

print("\nSubmission head:")
print(submission_df.head())

print("\nFiles saved:")
print("  - xgb_tuned_oof.csv")
print("  - submission_xgb_tuned.csv")
# Rocket emoji written as the real character; it was previously mis-encoded.
print("\nSubmit 'submission_xgb_tuned.csv' to the leaderboard for your improved score! 🚀")


Submission head:
       id  exam_score
0  630000   68.926657
1  630001   70.045183
2  630002   90.467629
3  630003   56.486891
4  630004   45.580549

Files saved:
  - xgb_tuned_oof.csv
  - submission_xgb_tuned.csv

Submit 'submission_xgb_tuned.csv' to the leaderboard for your improved score! 🚀
