# Final Model — Hyperparameter Tuning & Submission
CW1 — Tuning top models, stacking, and generating test predictions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import importlib.util
from sklearn.model_selection import cross_val_score, KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, RegressorMixin, clone
# Fail fast, before the imports below, with an actionable message when an
# optional boosting library is absent. find_spec only looks the package up,
# so nothing heavy is imported just to run this check.
missing = [pkg for pkg in ('xgboost', 'lightgbm') if importlib.util.find_spec(pkg) is None]
if missing:
    raise ImportError(f"Missing dependency(ies): {', '.join(missing)}. Install with `pip install -r requirements.txt`.")

import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import os

def find_project_root() -> Path:
    """
    Resolve the CW1 project root robustly.

    Candidate directories are tried in priority order:
    1) CW1_PROJECT_ROOT env var (if set)
    2) Current working directory, then each parent directory

    A candidate is accepted as the root when it contains both
    ``data/CW1_train.csv`` and ``data/CW1_test.csv``.

    Returns:
        The first candidate directory that contains both data files.

    Raises:
        FileNotFoundError: If no candidate contains both files. The message
            lists every directory that was checked, each exactly once.
    """
    candidates = []

    env_root = os.environ.get("CW1_PROJECT_ROOT")
    if env_root:
        candidates.append(Path(env_root).expanduser().resolve())

    cwd = Path.cwd().resolve()
    candidates.extend([cwd, *cwd.parents])

    # The env-var root is frequently the cwd or one of its parents. Drop the
    # repeats (keeping first-seen order) so each directory is probed and
    # reported only once.
    candidates = list(dict.fromkeys(candidates))

    for root in candidates:
        data_dir = root / "data"
        if (data_dir / "CW1_train.csv").is_file() and (data_dir / "CW1_test.csv").is_file():
            return root

    # Reaching this point means every candidate was probed and rejected.
    checked_msg = "\n".join(f"  - {p}" for p in candidates)
    raise FileNotFoundError(
        "Could not locate CW1 project root.\n"
        "Expected files:\n"
        "  - data/CW1_train.csv\n"
        "  - data/CW1_test.csv\n"
        f"Current working directory: {cwd}\n"
        "Directories checked:\n"
        f"{checked_msg}\n"
        "If needed, set CW1_PROJECT_ROOT to your repo root and rerun this cell."
    )

# Resolve the repository layout once; every later cell reads these constants.
PROJECT_ROOT = find_project_root()
DATA_DIR, OUTPUT_DIR = (PROJECT_ROOT / sub for sub in ("data", "outputs"))
# Outputs (plots, submission CSV) are written here, so make sure it exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def require_file(path: Path, label: str = "Required file") -> Path:
    """Return *path* unchanged if it is an existing regular file.

    Args:
        path: Location that must exist as a file.
        label: Human-readable name used at the start of the error message.

    Raises:
        FileNotFoundError: If *path* is not a file; the message also reports
            the resolved PROJECT_ROOT to help diagnose layout problems.
    """
    if path.is_file():
        return path

    details = (
        f"{label} not found: {path}\n"
        f"PROJECT_ROOT resolved to: {PROJECT_ROOT}\n"
        "Check file names and repository structure."
    )
    raise FileNotFoundError(details)

# Echo the resolved locations so a wrong root is obvious in the cell output.
print(
    f"PROJECT_ROOT: {PROJECT_ROOT}",
    f"DATA_DIR: {DATA_DIR}",
    f"OUTPUT_DIR: {OUTPUT_DIR}",
    sep="\n",
)

# Global notebook settings: fixed NumPy seed and a consistent plot theme.
np.random.seed(42)
sns.set_style('whitegrid')


## 1. Data Preparation

In [None]:
trn = pd.read_csv(require_file(DATA_DIR / 'CW1_train.csv', 'Training CSV'))
tst = pd.read_csv(require_file(DATA_DIR / 'CW1_test.csv', 'Test CSV'))

y = trn['outcome']
X = trn.drop(columns=['outcome'])
X_tst = tst.copy()

# Pin every categorical column to the levels observed in TRAINING before
# encoding. Encoding train and test independently with drop_first=True drops
# whichever level sorts first in each frame; if the test set happens to lack
# the training baseline level, a different level is dropped and those test
# rows are silently encoded as the baseline. With a shared categorical dtype
# both frames produce the same dummy columns and drop the same baseline.
cat_cols = ['cut', 'color', 'clarity']
for col in cat_cols:
    train_levels = pd.CategoricalDtype(sorted(X[col].dropna().unique()))
    X[col] = X[col].astype(train_levels)
    # Levels unseen in training become NaN here and therefore encode as all
    # zeros, i.e. they are treated as the baseline level.
    X_tst[col] = X_tst[col].astype(train_levels)

# One-hot encode categoricals (identical columns for train and test)
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
X_tst = pd.get_dummies(X_tst, columns=cat_cols, drop_first=True)

# Ensure same columns, in the same order, in train and test
X_tst = X_tst.reindex(columns=X.columns, fill_value=0)

# Shared, seeded splitter so every model is scored on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
print(f"Train: {X.shape}, Test: {X_tst.shape}")

## 2. Tune Histogram Gradient Boosting

In [None]:
# Discrete search space for HistGradientBoostingRegressor.
gb_params = dict(
    max_iter=[300, 500, 800, 1000],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    max_depth=[3, 4, 5, 6],
    min_samples_leaf=[5, 10, 20, 30],
    max_features=[0.5, 0.8, 1.0],
)

# Sample 60 configurations and score each with R² on the shared `kf` splitter.
# fit() returns the search object itself, so it can be chained here.
gb_search = RandomizedSearchCV(
    estimator=HistGradientBoostingRegressor(random_state=42),
    param_distributions=gb_params,
    n_iter=60,
    scoring='r2',
    cv=kf,
    n_jobs=-1,
    random_state=42,
    verbose=1,
).fit(X, y)

print(f"Best R²: {gb_search.best_score_:.4f}")
print(f"Best params: {gb_search.best_params_}")

## 3. Tune LightGBM

In [None]:
# Discrete search space for LightGBM.
lgb_params = {
    'n_estimators': [300, 500, 800, 1000],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [3, 5, 7, -1],
    'num_leaves': [15, 31, 63, 127],
    'min_child_samples': [5, 10, 20, 30],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.7, 0.8, 1.0],
    'reg_alpha': [0, 0.01, 0.1, 1.0],
    'reg_lambda': [0, 0.01, 0.1, 1.0],
}

# Sample 60 configurations and score each with R² on the shared `kf` splitter.
# The estimator runs single-threaded (n_jobs=1) because the search itself
# already fans out across all cores (n_jobs=-1).
lgb_search = RandomizedSearchCV(
    lgb.LGBMRegressor(
        random_state=42,
        verbosity=-1,
        n_jobs=1,
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; with the default of 0 the tuned `subsample` values above
        # would be ignored. Re-sample rows at every boosting iteration.
        subsample_freq=1,
    ),
    lgb_params,
    n_iter=60,
    cv=kf,
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    verbose=1
)
lgb_search.fit(X, y)

print(f"Best R²: {lgb_search.best_score_:.4f}")
print(f"Best params: {lgb_search.best_params_}")

## 4. Tune XGBoost

In [None]:
# Discrete search space for XGBoost.
xgb_params = dict(
    n_estimators=[300, 500, 800, 1000],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    max_depth=[3, 4, 5, 6, 8],
    min_child_weight=[1, 3, 5, 10],
    subsample=[0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.5, 0.7, 0.8, 1.0],
    reg_alpha=[0, 0.01, 0.1, 1.0],
    reg_lambda=[0.01, 0.1, 1.0, 10.0],
)

# Sample 60 configurations and score each with R² on the shared `kf` splitter.
# The estimator is single-threaded because the search parallelises the fits.
# fit() returns the search object itself, so it can be chained here.
xgb_search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(random_state=42, verbosity=0, n_jobs=1),
    param_distributions=xgb_params,
    n_iter=60,
    scoring='r2',
    cv=kf,
    n_jobs=-1,
    random_state=42,
    verbose=1,
).fit(X, y)

print(f"Best R²: {xgb_search.best_score_:.4f}")
print(f"Best params: {xgb_search.best_params_}")

## 5. Tune Random Forest
Manual search over a small set of hand-picked configurations (cheaper than running RandomizedSearchCV for RF).

In [None]:
# Hand-picked Random Forest configurations, evaluated one by one.
configs = [
    dict(n_estimators=500, max_depth=15, min_samples_leaf=5, max_features='sqrt'),
    dict(n_estimators=500, max_depth=20, min_samples_leaf=2, max_features=0.5),
    dict(n_estimators=500, max_depth=None, min_samples_leaf=5, max_features='sqrt'),
]

best_rf_score, best_rf_params = -np.inf, None
for idx, cfg in enumerate(configs, start=1):
    # The forest parallelises its own trees, so the CV loop stays sequential.
    forest = RandomForestRegressor(**cfg, random_state=42, n_jobs=-1)
    fold_r2 = cross_val_score(forest, X, y, cv=kf, scoring='r2', n_jobs=1)
    mean_r2 = fold_r2.mean()
    print(f"Config {idx}: R² = {mean_r2:.4f} ± {fold_r2.std():.4f}  {cfg}")
    # Strict comparison: on a tie the earlier configuration is kept.
    if mean_r2 > best_rf_score:
        best_rf_score, best_rf_params = mean_r2, cfg

print(f"\nBest RF config: {best_rf_params}")
# Unfitted estimator carrying the winning settings; later cells fit it.
best_rf = RandomForestRegressor(**best_rf_params, random_state=42, n_jobs=-1)

## 6. Tuned Model Comparison

In [None]:
# Best estimator from each search, plus the selected Random Forest.
tuned_models = {
    'Hist GB': gb_search.best_estimator_,
    'LightGBM': lgb_search.best_estimator_,
    'XGBoost': xgb_search.best_estimator_,
    'Random Forest': best_rf,
}

# Score every tuned model on the same `kf` splitter and keep the per-fold
# R² arrays for the final summary.
tuned_results = {}
for name, model in tuned_models.items():
    fold_r2 = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=kf, n_jobs=1)
    tuned_results[name] = fold_r2
    print(f"{name:25s}  R² = {fold_r2.mean():.4f} ± {fold_r2.std():.4f}")

## 7. Stacking Ensemble
The four tuned models serve as base estimators, with Ridge as the meta-learner.

In [None]:
# Base learners for the stack: the four tuned models, each under a short key.
base_learners = [
    ('gb', gb_search.best_estimator_),
    ('lgb', lgb_search.best_estimator_),
    ('xgb', xgb_search.best_estimator_),
    ('rf', best_rf),
]

# Ridge combines the base learners' predictions; `kf` is passed as the
# stack's internal cv as well as the outer evaluation splitter below.
stack = StackingRegressor(
    estimators=base_learners,
    final_estimator=Ridge(alpha=1.0),
    cv=kf,
    n_jobs=1,
)

stack_scores = cross_val_score(estimator=stack, X=X, y=y, scoring='r2', cv=kf, n_jobs=1)
print(f"Stacking Ensemble          R² = {stack_scores.mean():.4f} ± {stack_scores.std():.4f}")

## 8. Simple Blending
Average predictions from the top models — sometimes more robust than stacking.

In [None]:
class BlendingRegressor(BaseEstimator, RegressorMixin):
    """Blend several regressors by averaging their predictions.

    Parameters
    ----------
    models : sequence of estimators
        Regressors to blend. Each one is cloned inside ``fit``, so the
        objects passed in are never refitted or modified.
    weights : array-like or None, default=None
        One weight per model, used for a weighted average. ``None`` gives
        every model equal weight.

    Attributes
    ----------
    fitted_models_ : list of estimators
        Fitted clones of ``models``, in the same order. Set by ``fit``.
    """

    def __init__(self, models, weights=None):
        # Store parameters untouched; validation is deferred to fit().
        self.models = models
        self.weights = weights

    def fit(self, X, y):
        """Fit a fresh clone of every model on ``(X, y)`` and return ``self``.

        Raises
        ------
        ValueError
            If ``models`` is empty, or ``weights`` is given but does not have
            exactly one entry per model.
        """
        n_models = len(self.models)
        # Validate up front so a misconfiguration fails before any (possibly
        # slow) base model is trained, instead of later inside predict().
        if n_models == 0:
            raise ValueError("BlendingRegressor requires at least one model.")
        if self.weights is not None and len(self.weights) != n_models:
            raise ValueError(
                f"weights has {len(self.weights)} entries but there are "
                f"{n_models} models."
            )

        self.fitted_models_ = [clone(model).fit(X, y) for model in self.models]
        return self

    def predict(self, X):
        """Return the (optionally weighted) mean of the fitted models' predictions."""
        # One column per fitted model, one row per sample.
        preds = np.column_stack([m.predict(X) for m in self.fitted_models_])
        if self.weights is not None:
            return np.average(preds, axis=1, weights=self.weights)
        return preds.mean(axis=1)

# Equal-weight blend of the four tuned models (no weights are supplied).
blend_members = [
    gb_search.best_estimator_,
    lgb_search.best_estimator_,
    xgb_search.best_estimator_,
    best_rf,
]
blend = BlendingRegressor(models=blend_members)

# Evaluated on the same `kf` splitter as the other models.
blend_scores = cross_val_score(estimator=blend, X=X, y=y, scoring='r2', cv=kf, n_jobs=1)
print(f"Blending (equal weights)   R² = {blend_scores.mean():.4f} ± {blend_scores.std():.4f}")

## 9. Final Summary

In [None]:
# Per-fold R² arrays for the single models plus both ensembles.
all_results = {**tuned_results, 'Stacking': stack_scores, 'Blending': blend_scores}

# One row per model, best mean R² first.
summary = (
    pd.DataFrame({
        'Model': list(all_results),
        'Mean R²': [fold_r2.mean() for fold_r2 in all_results.values()],
        'Std R²': [fold_r2.std() for fold_r2 in all_results.values()],
    })
    .sort_values('Mean R²', ascending=False)
    .reset_index(drop=True)
)

print(summary.to_string(index=False))

# Horizontal box plot of the per-fold scores, one box per model.
fig, ax = plt.subplots(figsize=(10, 5))
pd.DataFrame(all_results).boxplot(ax=ax, vert=False)
ax.set(xlabel='R² (5-fold CV)', title='Final Model Comparison (Tuned)')
plt.tight_layout()

# Save before show() so the written file contains the rendered figure.
plot_path = OUTPUT_DIR / 'final_model_comparison.png'
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
print(f"Saved plot to: {plot_path}")
plt.show()

## 10. Generate Submission
Select the best model and produce the submission CSV.

In [None]:
# `summary` is sorted best-first, so its first row names the winner.
best_name = summary['Model'].iloc[0]
print(f"Best model: {best_name}")

# Map each summary name to its estimator: the tuned single models plus the
# two ensembles.
candidates = {**tuned_models, 'Stacking': stack, 'Blending': blend}
best_model = candidates[best_name]

# Refit a fresh clone on the full training data, then predict the test set.
final_model = clone(best_model)
final_model.fit(X, y)
yhat = final_model.predict(X_tst)

# Write the submission: a single `yhat` column, no index.
student_id = 'k23051742'
submission_path = OUTPUT_DIR / f'CW1_submission_{student_id}.csv'
out = pd.DataFrame({'yhat': yhat})
out.to_csv(submission_path, index=False)

# Quick sanity report on what was written.
print(f"Saved submission to: {submission_path}")
print(f"Submission saved: {len(yhat)} predictions")
print(f"Prediction range: [{yhat.min():.2f}, {yhat.max():.2f}]")
print(f"Prediction mean: {yhat.mean():.2f}")