# Notebook 6: Phase 2 - Hyperparameter Tuning

**Goal**: Optimize the hyperparameters of our best performing models (Linear Regression & XGBoost) using `GridSearchCV` / `RandomizedSearchCV`.
**Constraint**: Must use `TimeSeriesSplit` for Cross-Validation to avoid data leakage (no future data in validation folds).

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
import sys
import os

# Add src to path
sys.path.append(os.path.abspath(os.path.join('../src')))
from preprocessing import load_data

SPLIT_YEAR = 2015
TARGET = 'Value_co2_emissions_kt_by_country'

# Load Data
df_lr = load_data('../data/processed/lr_final_prep.csv')
df_xgb = load_data('../data/processed/xgb_final_prep.csv')

# Ensure Year col exists for splitting logic
# (Assuming Year is present or recovering it if needed)
df_common = load_data('../data/processed/common_preprocessed.csv')
if 'Year' not in df_lr.columns:
    df_lr['Year'] = df_common.loc[df_lr.index, 'Year']
if 'Year' not in df_xgb.columns:
    df_xgb['Year'] = df_common.loc[df_xgb.index, 'Year']

Loaded data from ../data/processed/lr_final_prep.csv: (2190, 193)
Loaded data from ../data/processed/xgb_final_prep.csv: (3473, 25)
Loaded data from ../data/processed/common_preprocessed.csv: (3473, 25)


In [2]:
def tuning_pipeline(df, model, param_grid, name="Model"):
    print(f"\n--- Tuning {name} ---")
    
    # Split Train/Test for Final Eval
    train = df[df['Year'] < SPLIT_YEAR]
    test = df[df['Year'] >= SPLIT_YEAR]
    
    drop_cols = [TARGET, 'Year']
    drop_cols = [c for c in drop_cols if c in df.columns]
    
    X_train = train.drop(columns=drop_cols)
    y_train = train[TARGET]
    X_test = test.drop(columns=drop_cols)
    y_test = test[TARGET]
    
    # TimeSeriesSplit for CV
    # We use 5 splits. This effectively walks forward in time.
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Search
    # Use RandomizedSearchCV for speed if grid is large, else Grid
    if len(param_grid) > 20: 
        search = RandomizedSearchCV(
            model, param_grid, cv=tscv, scoring='r2', 
            n_iter=20, n_jobs=-1, random_state=42, verbose=1
        )
    else:
        search = GridSearchCV(
            model, param_grid, cv=tscv, scoring='r2', 
            n_jobs=-1, verbose=1
        )
        
    search.fit(X_train, y_train)
    
    print(f"Best Params: {search.best_params_}")
    print(f"Best CV Score: {search.best_score_:.4f}")
    
    # Evaluate on Test Set
    best_model = search.best_estimator_
    preds = best_model.predict(X_test)
    test_r2 = r2_score(y_test, preds)
    print(f"Test Set R2 (Before Tuning): Comparison needed with Phase 1")
    print(f"Test Set R2 (After Tuning): {test_r2:.4f}")
    
    return search.best_params_, test_r2

# 1. Tune Linear Regression (Ridge)
lr_params = {
    'alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]
}
best_lr_params, lr_score = tuning_pipeline(df_lr, Ridge(), lr_params, "Linear Regression (Ridge)")

# 2. Tune XGBoost
xgb_params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0]
}
best_xgb_params, xgb_score = tuning_pipeline(df_xgb, XGBRegressor(random_state=42, n_jobs=-1), xgb_params, "XGBoost")


--- Tuning Linear Regression (Ridge) ---
Fitting 5 folds for each of 5 candidates, totalling 25 fits


Best Params: {'alpha': 10.0}
Best CV Score: 0.8931
Test Set R2 (Before Tuning): Comparison needed with Phase 1
Test Set R2 (After Tuning): 0.9804

--- Tuning XGBoost ---
Fitting 5 folds for each of 108 candidates, totalling 540 fits


Best Params: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
Best CV Score: 0.7527
Test Set R2 (Before Tuning): Comparison needed with Phase 1
Test Set R2 (After Tuning): 0.7996


In [3]:
# Save Best Params for Phase 3 v2
import json

best_params = {
    'Linear Regression': best_lr_params,
    'XGBoost': best_xgb_params
}

with open('../data/results/best_hyperparameters.json', 'w') as f:
    json.dump(best_params, f, indent=4)
    
print("Best parameters saved to data/results/best_hyperparameters.json")

Best parameters saved to data/results/best_hyperparameters.json
