# Ensemble Model - Target: Break into Top 5

**Current Score:** 8.737  
**Target Score:** 8.53 (Top 5)  
**Gap:** 0.204 points

**Strategy:**
1. Train 3 different models with Optuna
2. Ensemble predictions with optimal weights
3. Use cross-validation for robustness

Expected improvement: **0.10-0.20 points** ✨

In [1]:
# Install required packages
!pip install optuna xgboost catboost --quiet

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Models
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

# ML utilities
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Optuna
import optuna
from optuna.visualization import plot_optimization_history

# Settings
import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

%matplotlib inline
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("✓ Libraries imported")
print(f"LightGBM: {lgb.__version__}")
print(f"XGBoost: {xgb.__version__}")

✓ Libraries imported
LightGBM: 4.6.0
XGBoost: 2.1.4


## 1. Load and Prepare Data

In [3]:
# Load data
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

print(f"Train: {train.shape}")
print(f"Test: {test.shape}")

Train: (630000, 13)
Test: (270000, 12)


In [4]:
def engineer_features(df):
    """Feature engineering - same as Optuna notebook"""
    df = df.copy()
    
    # Polynomial features
    df['study_hours_sq'] = df['study_hours'] ** 2
    df['sleep_hours_sq'] = df['sleep_hours'] ** 2
    df['class_attendance_sq'] = df['class_attendance'] ** 2
    
    # Interaction features
    df['study_attendance'] = df['study_hours'] * df['class_attendance']
    df['study_sleep_ratio'] = df['study_hours'] / (df['sleep_hours'] + 1e-6)
    df['age_study_interaction'] = df['age'] * df['study_hours']
    df['attendance_sleep'] = df['class_attendance'] * df['sleep_hours']
    
    # Categorical binning
    df['age_group'] = pd.cut(df['age'], bins=[0, 22, 28, 100], 
                              labels=['young', 'middle', 'senior'])
    df['study_intensity'] = pd.cut(df['study_hours'], bins=[0, 3, 6, 100],
                                     labels=['low', 'medium', 'high'])
    df['sleep_category'] = pd.cut(df['sleep_hours'], bins=[0, 6, 8, 100],
                                    labels=['insufficient', 'optimal', 'excessive'])
    
    return df

# Prepare data
X_train_full = train.drop(['id', 'exam_score'], axis=1)
y_train = train['exam_score']
X_test = test.drop(['id'], axis=1)
test_ids = test['id']

# Apply feature engineering
X_train_full = engineer_features(X_train_full)
X_test = engineer_features(X_test)

# Encode (combine first to ensure matching columns)
combined = pd.concat([X_train_full, X_test], keys=['train', 'test'])
combined_encoded = pd.get_dummies(combined, drop_first=True)
X_train_encoded = combined_encoded.loc['train'].reset_index(drop=True)
X_test_encoded = combined_encoded.loc['test'].reset_index(drop=True)

print(f"\n✓ Data prepared")
print(f"Train: {X_train_encoded.shape}")
print(f"Test: {X_test_encoded.shape}")
print(f"Features: {X_train_encoded.shape[1]}")

# Train/val split
X_train, X_val, y_train_split, y_val = train_test_split(
    X_train_encoded, y_train, test_size=0.2, random_state=RANDOM_STATE
)


✓ Data prepared
Train: (630000, 36)
Test: (270000, 36)
Features: 36


## 2. Model 1: LightGBM with Extended Optuna

In [5]:
def lgb_objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'random_state': RANDOM_STATE,
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train_split)
    preds = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, preds))

print("Optimizing LightGBM...")
lgb_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
lgb_study.optimize(lgb_objective, n_trials=100, show_progress_bar=True)

print(f"\n✓ LightGBM Best RMSE: {lgb_study.best_value:.4f}")
print(f"Best params: {lgb_study.best_params}")

Optimizing LightGBM...


  0%|          | 0/100 [00:00<?, ?it/s]


✓ LightGBM Best RMSE: 8.7523
Best params: {'n_estimators': 778, 'learning_rate': 0.03807213785461877, 'max_depth': 9, 'num_leaves': 152, 'min_child_samples': 88, 'subsample': 0.8280780581014633, 'colsample_bytree': 0.5584782113779962, 'reg_alpha': 2.556745395501475e-07, 'reg_lambda': 0.14908672719216273}


## 3. Model 2: XGBoost with Optuna

In [6]:
def xgb_objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'random_state': RANDOM_STATE,
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train_split, verbose=False)
    preds = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, preds))

print("Optimizing XGBoost...")
xgb_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
xgb_study.optimize(xgb_objective, n_trials=100, show_progress_bar=True)

print(f"\n✓ XGBoost Best RMSE: {xgb_study.best_value:.4f}")
print(f"Best params: {xgb_study.best_params}")

Optimizing XGBoost...


  0%|          | 0/100 [00:00<?, ?it/s]


✓ XGBoost Best RMSE: 8.7514
Best params: {'n_estimators': 752, 'learning_rate': 0.05482213572052564, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.7939816454914731, 'colsample_bytree': 0.50089082637021, 'gamma': 0.03419958026304817, 'reg_alpha': 3.6496151255344214, 'reg_lambda': 9.809918422209952}


## 4. Model 3: CatBoost with Optuna

In [7]:
def cat_objective(trial):
    params = {
        'loss_function': 'RMSE',
        'random_state': RANDOM_STATE,
        'verbose': False,
        'iterations': trial.suggest_int('iterations', 100, 800),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'depth': trial.suggest_int('depth', 3, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
    }
    
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train_split)
    preds = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, preds))

print("Optimizing CatBoost...")
cat_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
cat_study.optimize(cat_objective, n_trials=100, show_progress_bar=True)

print(f"\n✓ CatBoost Best RMSE: {cat_study.best_value:.4f}")
print(f"Best params: {cat_study.best_params}")

Optimizing CatBoost...


  0%|          | 0/100 [00:00<?, ?it/s]


✓ CatBoost Best RMSE: 8.7604
Best params: {'iterations': 671, 'learning_rate': 0.18964281562767207, 'depth': 7, 'l2_leaf_reg': 7.889329728262899, 'subsample': 0.7332847819711628, 'colsample_bylevel': 0.7190017980271927}


## 5. Train Final Models and Create Ensemble

In [8]:
# Train final models
print("Training final models...\n")

lgb_model = lgb.LGBMRegressor(**lgb_study.best_params, random_state=RANDOM_STATE, verbosity=-1)
xgb_model = xgb.XGBRegressor(**xgb_study.best_params, random_state=RANDOM_STATE)
cat_model = CatBoostRegressor(**cat_study.best_params, random_state=RANDOM_STATE, verbose=False)

lgb_model.fit(X_train_encoded, y_train)
xgb_model.fit(X_train_encoded, y_train)
cat_model.fit(X_train_encoded, y_train)

# Evaluate on validation
lgb_val_pred = lgb_model.predict(X_val)
xgb_val_pred = xgb_model.predict(X_val)
cat_val_pred = cat_model.predict(X_val)

lgb_rmse = np.sqrt(mean_squared_error(y_val, lgb_val_pred))
xgb_rmse = np.sqrt(mean_squared_error(y_val, xgb_val_pred))
cat_rmse = np.sqrt(mean_squared_error(y_val, cat_val_pred))

print("Individual Model Performance:")
print(f"LightGBM: {lgb_rmse:.4f}")
print(f"XGBoost:  {xgb_rmse:.4f}")
print(f"CatBoost: {cat_rmse:.4f}")

Training final models...

Individual Model Performance:
LightGBM: 8.4703
XGBoost:  8.4872
CatBoost: 8.6253


In [9]:
# Optimize ensemble weights
def ensemble_objective(trial):
    w1 = trial.suggest_float('lgb_weight', 0.0, 1.0)
    w2 = trial.suggest_float('xgb_weight', 0.0, 1.0)
    w3 = trial.suggest_float('cat_weight', 0.0, 1.0)
    
    total = w1 + w2 + w3
    w1, w2, w3 = w1/total, w2/total, w3/total
    
    ensemble_pred = w1 * lgb_val_pred + w2 * xgb_val_pred + w3 * cat_val_pred
    return np.sqrt(mean_squared_error(y_val, ensemble_pred))

print("\nOptimizing ensemble weights...")
ensemble_study = optuna.create_study(direction='minimize')
ensemble_study.optimize(ensemble_objective, n_trials=100, show_progress_bar=True)

best_weights_raw = ensemble_study.best_params
total_weight = sum(best_weights_raw.values())
best_weights = {k: v/total_weight for k, v in best_weights_raw.items()}

print(f"\n✓ Best Ensemble RMSE: {ensemble_study.best_value:.4f}")
print(f"\nOptimal Weights:")
print(f"  LightGBM: {best_weights['lgb_weight']:.3f}")
print(f"  XGBoost:  {best_weights['xgb_weight']:.3f}")
print(f"  CatBoost: {best_weights['cat_weight']:.3f}")


Optimizing ensemble weights...


  0%|          | 0/100 [00:00<?, ?it/s]


✓ Best Ensemble RMSE: 8.4679

Optimal Weights:
  LightGBM: 0.654
  XGBoost:  0.346
  CatBoost: 0.000


## 6. Generate Final Predictions

In [10]:
# Predict on test set
print("Generating ensemble predictions...\n")

lgb_test_pred = lgb_model.predict(X_test_encoded)
xgb_test_pred = xgb_model.predict(X_test_encoded)
cat_test_pred = cat_model.predict(X_test_encoded)

# Ensemble with optimal weights
ensemble_pred = (
    best_weights['lgb_weight'] * lgb_test_pred +
    best_weights['xgb_weight'] * xgb_test_pred +
    best_weights['cat_weight'] * cat_test_pred
)

print(f"Prediction stats:")
print(f"  Min: {ensemble_pred.min():.2f}")
print(f"  Max: {ensemble_pred.max():.2f}")
print(f"  Mean: {ensemble_pred.mean():.2f}")

# Create submission
submission = pd.DataFrame({
    'id': test_ids,
    'exam_score': ensemble_pred
})

submission.to_csv('submission_ensemble.csv', index=False)

print("\n✓ Submission created: submission_ensemble.csv")
print(f"Expected score: ~{ensemble_study.best_value:.3f}")
print(f"Target: Break into Top 5!")
print("\nFirst 10 rows:")
print(submission.head(10))

Generating ensemble predictions...

Prediction stats:
  Min: 15.66
  Max: 103.61
  Mean: 62.52

✓ Submission created: submission_ensemble.csv
Expected score: ~8.468
Target: Break into Top 5!

First 10 rows:
       id  exam_score
0  630000   70.903171
1  630001   70.427317
2  630002   87.737945
3  630003   56.181486
4  630004   47.097054
5  630005   71.882470
6  630006   73.183164
7  630007   58.645561
8  630008   79.450108
9  630009   89.510902
