In [1]:
import sys
import os

# Put the parent directory on the import path; presumably this is where the
# project-local `utils`, `models` and `config` packages live.
# Adjust this if your notebook is in a subfolder (like /notebooks)
project_root = os.path.abspath("..")  # or "../.." if deeper
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from utils.metrics import regression_metrics, quantile_coverage, print_metrics
from models.linear_model import load_features_and_split, train_linear_regression
from models.catboost_model import (train_catboost, train_catboost_quantiles, predict_interval)
from models.xgboost_model import load_xgb_data, train_xgboost
from models.random_forest import train_random_forest
import matplotlib.pyplot as plt
from config.paths import data_path
from models.lightgbm_model import (preprocess_for_lightgbm, load_lgbm_data, train_lightgbm, train_lightgbm_quantiles, predict_interval)

General train/test split

Random forest

In [None]:
from models.random_forest import load_rf_data, train_random_forest 
# --- Data: train/test split as returned by the random-forest loader ---
X_train, X_test, y_train, y_test = load_rf_data()

# --- Hyperparameters forwarded to train_random_forest ---
rf_params = {
    "n_estimators": 300,
    "max_depth": None,  # no depth cap: grow until leaves are pure or min_samples rules apply
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "random_state": 42,
}

# --- Fit on the training split, score on the held-out split ---
rf_model = train_random_forest(X_train, y_train, rf_params=rf_params)
y_pred_rf = rf_model.predict(X_test)
rf_metrics = regression_metrics(y_test, y_pred_rf)
print_metrics("Random Forest", rf_metrics)

# --- Feature importances ---
feature_names = X_train.columns
importances = rf_model.feature_importances_
# Positions of the features in descending-importance order (not used further in this cell).
indices = np.argsort(importances)[::-1]

# Build the table first, then rank it, and show only the ten strongest features.
feat_imp_df = pd.DataFrame({"feature": feature_names, "importance": importances})
feat_imp_df = feat_imp_df.sort_values(by="importance", ascending=False)

display(feat_imp_df.head(10))

Linear regression metrics and feature importance

In [None]:
from config.paths import data_path
from models.linear_model import load_features_and_split, train_linear_regression
from utils.metrics import regression_metrics, summarize_coefficients

def print_metrics(model_name, metrics_dict):
    """Print a header for ``model_name`` followed by one metric per line.

    Note: defining this function here rebinds the ``print_metrics`` name that
    the imports cell pulls in from ``utils.metrics``; every cell executed after
    this one uses this version.

    Args:
        model_name: Label interpolated into the header line.
        metrics_dict: Mapping of metric name to value. Each value is rendered
            with the ``.4f`` format spec, so it must be numeric.
    """
    header = f"\n📊 Performance for {model_name}"
    print(header)
    for metric_name, metric_value in metrics_dict.items():
        print(f"{metric_name}: {metric_value:.4f}")

# Feature file used by the linear model, resolved through the project's data_path helper.
processed_file = data_path("selected_features_linear.json")

# The loader returns the split in (X_train, X_test, y_train, y_test) order.
X_train, X_test, y_train, y_test = load_features_and_split(processed_file)

# Fit on the training split and predict on the held-out split.
linreg_model = train_linear_regression(X_train, y_train)
y_pred_lin = linreg_model.predict(X_test)

# Score the held-out predictions and report them.
metrics = regression_metrics(y_test, y_pred_lin)
print_metrics("Linear Regression", metrics)

# Coefficient summary (table/plot produced by the helper); kept as the last
# expression so any value it returns is rendered as the cell output.
summarize_coefficients(linreg_model, X_train)


CatBoost

In [None]:
# ------------------ 1. Load and split data ------------------ #

df = pd.read_json(data_path("selected_features_CB.json"))

# Target is `startupDelay`; `id` is dropped from the features together with the target.
y = df["startupDelay"]
X = df.drop(columns=["startupDelay", "id"])

# Categorical features, referenced by column name.
cat_features = ["pillar", "countryCoor"]

# Fixed-seed 80/20 split so the evaluation is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyperparameters for the point-prediction CatBoost model.
cb_params = {
    "iterations": 300,
    "depth": 6,
    "learning_rate": 0.05,
    "random_state": 42,
}

# The quantile models start from an independent shallow copy of the same settings,
# so later edits to one dict do not leak into the other.
cb_quantile_params = dict(cb_params)

In [None]:
# ------------------ 3. Train single-value CatBoost ------------------ #

# NOTE(review): the test split is also passed as the validation set. If the
# helper uses it for early stopping or model selection, the test metrics below
# would be optimistic — confirm against train_catboost.
cb_model = train_catboost(
    X_train,
    y_train,
    X_valid=X_test,
    y_valid=y_test,
    catboost_params=cb_params,
    cat_features=cat_features,
)

y_pred_cb = cb_model.predict(X_test)
cb_point_metrics = regression_metrics(y_test, y_pred_cb)
print_metrics("CatBoost (Point Prediction)", cb_point_metrics)

# ------------------ 4. Train quantile models ------------------ #

# One model per quantile level, trained in the order low (0.1), median (0.5),
# high (0.9) with otherwise identical arguments.
m_low, m_med, m_high = [
    train_catboost_quantiles(
        X_train,
        y_train,
        alpha=alpha,
        X_valid=X_test,
        y_valid=y_test,
        catboost_quantile_params=cb_quantile_params,
        cat_features=cat_features,
    )
    for alpha in (0.1, 0.5, 0.9)
]

In [None]:
# ------------------ 5. Evaluate quantile predictions ------------------ #

# The imports cell binds `predict_interval` twice: first from
# models.catboost_model, then from models.lightgbm_model. The later import wins,
# so the bare name refers to the LightGBM helper, which is called elsewhere in
# this notebook as predict_interval(models_q, X_test). Import the CatBoost
# version under an explicit alias so this cell always calls the function that
# matches the (low, median, high, X) call below.
from models.catboost_model import predict_interval as catboost_predict_interval

intervals = catboost_predict_interval(m_low, m_med, m_high, X_test)

# Column 1 is scored as the median prediction (presumably the m_med output — confirm
# against the helper's return layout).
print_metrics("CatBoost Quantile (Median)", regression_metrics(y_test, intervals[:, 1]))

# Coverage statistics of the predicted interval against the true targets.
coverage_stats = quantile_coverage(y_test.values, intervals)
print_metrics("Prediction Interval", coverage_stats)


In [None]:
# ------------------ 6. Feature Importances ------------------ #

# `prettified=True` yields a table with "Feature Id" and "Importances" columns,
# which are the two columns plotted below.
feat_imp = cb_model.get_feature_importance(prettified=True)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feat_imp["Feature Id"], feat_imp["Importances"])
ax.invert_yaxis()  # first row of the table ends up at the top of the chart
ax.set_title("CatBoost Feature Importances (Point Prediction Model)")
fig.tight_layout()
plt.show()

XGBoost

In [None]:

# --- Data: train/test split as returned by the XGBoost loader ---
X_train, X_test, y_train, y_test = load_xgb_data()

# --- Hyperparameters forwarded to train_xgboost ---
xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.05,
    "random_state": 42,
}

# --- Fit on the training split, score on the held-out split ---
xgb_model = train_xgboost(X_train, y_train, xgb_params)
y_pred_xgb = xgb_model.predict(X_test)

xgb_metrics = regression_metrics(y_test, y_pred_xgb)
print_metrics("XGBoost", xgb_metrics)


In [None]:
# Importance scores of the fitted XGBoost model, paired with the training columns.
feature_names = X_train.columns
importances = xgb_model.feature_importances_
# Positions of the features in descending-importance order (not used further in this cell).
indices = np.argsort(importances)[::-1]

# Build the table first, then rank it, and show only the ten strongest features.
feat_imp_df = pd.DataFrame({"feature": feature_names, "importance": importances})
feat_imp_df = feat_imp_df.sort_values(by="importance", ascending=False)

display(feat_imp_df.head(10))

LightGBM

In [None]:
# --- Data: train/test split as returned by the LightGBM loader ---
X_train, X_test, y_train, y_test = load_lgbm_data()

# --- Shared hyperparameters (point model and quantile models) ---
params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "num_leaves": 15,
    "max_depth": 6,
    "min_data_in_leaf": 30,
    "random_state": 42,
}

# --- Point prediction ---
model = train_lightgbm(X_train, y_train, lgbm_params=params)
y_pred = model.predict(X_test)
lgbm_point_metrics = regression_metrics(y_test, y_pred)
print_metrics("LightGBM Point", lgbm_point_metrics)

# --- Quantile regression ---
models_q = train_lightgbm_quantiles(X_train, y_train, base_params=params)
# `predict_interval` resolves to the LightGBM helper here only because that
# import comes last in the imports cell (the CatBoost import of the same name
# is shadowed).
intervals = predict_interval(models_q, X_test)

# Column 1 is scored as the median prediction; the full array goes to the coverage check.
lgbm_median_metrics = regression_metrics(y_test, intervals[:, 1])
print_metrics("LightGBM Quantile (Median)", lgbm_median_metrics)
lgbm_coverage = quantile_coverage(y_test.values, intervals)
print_metrics("Prediction Interval", lgbm_coverage)

In [None]:


# Importance scores of the LightGBM point model, ordered from largest to smallest.
features = X_train.columns
importances = model.feature_importances_
sorted_idx = np.argsort(importances)[::-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features[sorted_idx], importances[sorted_idx])
ax.set_title("LightGBM Feature Importances")
ax.invert_yaxis()  # largest importance at the top of the chart
fig.tight_layout()
plt.show()


In [5]:
# `intervals` is whichever array was assigned last (both the CatBoost and the
# LightGBM evaluation cells write to this name), so the number below depends on
# execution order. Columns 0 and 2 are presumably the lower and upper bounds —
# confirm against the predict_interval helper that produced the array.
lower_bound = intervals[:, 0]
upper_bound = intervals[:, 2]
interval_widths = upper_bound - lower_bound
print(f"Avg interval width: {interval_widths.mean():.2f}")


Avg interval width: 220.37
