# Lab For Experimentation

In [1]:
import json
import warnings
from typing import Any, Literal

import numpy as np
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named styles that `console.print(..., style=<name>)` can refer to.
custom_theme = Theme(
    styles=dict(
        white="#FFFFFF",  # Bright white
        info="#00FF00",  # Bright green
        warning="#FFD700",  # Bright gold
        error="#FF1493",  # Deep pink
        success="#00FFFF",  # Cyan
        highlight="#FF4500",  # Orange-red
    )
)
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy: print floats with four decimals
np.set_printoptions(precision=4)

# Pandas: effectively never truncate rows/columns, allow long cell text
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Polars: long strings, wide tables, up to 200 rows
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(1_000)
pl.Config.set_tbl_rows(200)

# Silence every warning category for the whole session
warnings.filterwarnings(action="ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory first on ``sys.path``.

    This lets the notebook import project packages that live a number of
    directories above the folder the kernel was started in.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current
        working directory. ``0`` means the working directory itself.

    Returns:
    --------
    None
        The resolved absolute path is printed and inserted at ``sys.path[0]``.

    Raises:
    -------
    ValueError
        If ``go_up`` is negative.
    """
    import os
    import sys

    if go_up < 0:
        raise ValueError("`go_up` must be a non-negative integer.")

    # Resolve against the working directory explicitly. The previous
    # `os.path.dirname(__name__)` always evaluated to "" and therefore only
    # worked because relative paths fall back to the working directory.
    abs_path_prev_directory = os.path.abspath(
        os.path.join(os.getcwd(), *([os.pardir] * go_up))
    )

    # Re-running the cell must not stack duplicate entries: drop earlier
    # copies (in place, so other references to sys.path stay valid) and put
    # the path back at the front where it takes import priority.
    sys.path[:] = [entry for entry in sys.path if entry != abs_path_prev_directory]
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)


# Demo (Prevents ruff from removing the unused module import)
name: Any
category: Literal["A", "B", "C"]
json.loads('{"name": "Bike Rental Prediction", "category": "A"}')

{'name': 'Bike Rental Prediction', 'category': 'A'}

In [3]:
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [5]:
# Synthetic smoke test for the scikit-learn API: 1,000 rows x 10 standard-normal
# features.
rng = np.random.default_rng(123)
x = rng.standard_normal(size=(1_000, 10))

X_train, X_test = train_test_split(x, test_size=0.2, random_state=123)
# Targets are drawn independently of the features, so there is no signal to
# learn here; the cell only exercises fit/feature_importances_.
y_train = rng.standard_normal(size=(X_train.shape[0],))
y_test = rng.standard_normal(size=(X_test.shape[0],))

params: dict[str, Any] = {
    "n_estimators": 100,
    "max_depth": 10,
    # Seed the forest as well; without it the fitted trees (and the feature
    # importances shown below) change on every run even though the data and
    # the split are seeded.
    "random_state": 123,
}

rf_reg = RandomForestRegressor(**params)

rf_reg.fit(X_train, y_train)
# rf_reg.score(X_test, y_test)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [6]:
rf_reg.feature_importances_

array([0.1258, 0.0944, 0.0799, 0.1148, 0.0847, 0.1147, 0.093 , 0.1077,
       0.0912, 0.0938])

In [7]:
import xgboost as xgb


# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [8]:
# Define hyperparameters for the native XGBoost API.
# NOTE: this rebinds `params`, which previously held the RandomForest settings.
params = {
    "objective": "reg:squarederror",  # for regression
    "eval_metric": ["rmse", "mae", "mape"],
    "learning_rate": 0.1,
    "max_depth": 6,
    # Histogram-based tree construction. NOTE(review): 'gpu_hist' is deprecated
    # since XGBoost 2.0 - for GPU keep tree_method="hist" and add device="cuda".
    "tree_method": "hist",
}
# Number of boosting rounds for the baseline model
n: int = 100

# Train the baseline booster on the synthetic training matrix (no evaluation
# set is passed, so nothing is logged per round)
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
)

In [9]:
preds = model.predict(dtest_reg)
rmse = root_mean_squared_error(y_test, preds)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 1.125


In [10]:
# Cross-validation: 5 folds, at most 20 boosting rounds, RMSE only.
# `metrics={"rmse"}` is passed explicitly; presumably it takes precedence over
# the `eval_metric` list in `params` (the log below only reports rmse) - confirm
# against the xgb.cv documentation.
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=20,
    nfold=5,
    metrics={"rmse"},
    seed=123,
    as_pandas=True,
    callbacks=[
        # Print mean +/- std of each metric after every round
        xgb.callback.EvaluationMonitor(show_stdv=True),
        # Stop once the monitored metric has not improved for 3 rounds
        xgb.callback.EarlyStopping(rounds=3),
    ],
)
console.print(cv_results)

[0]	train-rmse:0.97861+0.00793	test-rmse:1.00251+0.03045
[1]	train-rmse:0.95530+0.00922	test-rmse:0.99990+0.03146
[2]	train-rmse:0.93566+0.01087	test-rmse:0.99963+0.03280
[3]	train-rmse:0.91714+0.01274	test-rmse:1.00122+0.03374
[4]	train-rmse:0.90072+0.01365	test-rmse:1.00416+0.03115


In [11]:
# Step 1: Find the optimal number of boosting rounds.
# The row count of `cv_results` is used as that number. The monitor above
# logged 5 rounds while the frame holds 3 rows, so xgb.cv presumably trims the
# history to the best iteration when early stopping fires - TODO confirm.
best_num_rounds: int = len(cv_results)
console.print(f"Optimal boosting rounds: {best_num_rounds}", style="success")
console.print(cv_results.tail())

# Step 2: Train final model with optimal rounds
final_model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=best_num_rounds,
)

# Step 3: Evaluate on test set
test_preds = final_model.predict(dtest_reg)
test_rmse = root_mean_squared_error(y_test, test_preds)
console.print(f"Test RMSE: {test_rmse:.4f}", style="info")


# Step 4: Save the model (left disabled)
# final_model.save_model("xgboost_model.json")

In [12]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,0.978613,0.007929,1.002512,0.030454
1,0.955297,0.009222,0.999901,0.031463
2,0.935665,0.010871,0.99963,0.032802


In [13]:
type(final_model).__module__

'xgboost.core'

In [25]:
import httpx

try:
    async with httpx.AsyncClient() as client:
        r = await client.get("http://0.0.0.0:5001/#/experiments/list", timeout=10.0)

    if r.status_code == 200:
        console.print("MLflow Tracking Server is reachable!", style="success")

except Exception as e:
    console.print(f"Error: {e}", style="error")

In [26]:
from src.config.config import app_config
from src.ml.trainer import ModelTrainer
from src.ml.feature_engineering import FeatureEngineer

In [None]:
# Load the raw bike-rental table.
# NOTE(review): this relative path climbs out of the repository, so it depends
# on both the kernel's working directory and the author's folder layout.
fp: str = "../../../../Documents/data_dump/bike_data/database.parquet"
data: pl.DataFrame = pl.read_parquet(fp)
console.print(f"Shape: {data.shape}", style="info")
# Rich preview of the first two raw rows
display(data.head(2))


# Build the model-ready feature table using the project's feature configuration
feat_eng = FeatureEngineer()
features_df: pl.DataFrame = feat_eng.create_all_features(
    data, config=app_config.feature_config
)
# Last expression of the cell: preview of the engineered features
features_df.head(3)

datetime,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64
"""2011-01-01 00:00:00""",1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
"""2011-01-01 01:00:00""",1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40


season,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,is_weekend,sin_hour,cos_hour,sin_weekday,cos_weekday,cnt_lag_0hr,cnt_lag_1hr,cnt_lag_24hr,hr_lag_1hr,hr_lag_24hr,temp_lag_1hr,temp_lag_3hr,hum_lag_1hr,hum_lag_3hr,temp_rolling_mean_3hr,temp_rolling_median_3hr,temp_rolling_mean_6hr,temp_rolling_median_6hr,hum_rolling_mean_3hr,hum_rolling_median_3hr,hum_rolling_mean_6hr,hum_rolling_median_6hr,temp_plus_hum,hum_plus_hr,cnt_diff_1hr,cnt_diff_2hr,hr_diff_1hr,hr_diff_24hr,temp_diff_1hr,temp_diff_2hr,temp_diff_24hr,hum_diff_1hr,hum_diff_2hr,is_high_temp,is_high_hum,is_peak_hour,is_working_hour,is_business_hour,target
i64,i64,i64,i64,i64,i64,i64,f64,f64,i8,f64,f64,f64,f64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,f64,f64,f64,f64,i8,i8,i8,i8,i8,i64
1,1,0,0,6,0,1,0.24,0.81,1,0.0,1.0,-0.781831,0.62349,16,16,16,0,0,0.24,0.24,0.81,0.81,0.226667,0.22,0.233333,0.24,0.803333,0.8,0.776667,0.775,1.05,0.81,24,16,1,0,-0.02,-0.02,0.22,-0.01,-0.01,0,0,0,0,0,40
1,1,1,0,6,0,1,0.22,0.8,1,0.258819,0.965926,-0.781831,0.62349,40,16,16,0,0,0.24,0.24,0.81,0.81,0.226667,0.22,0.233333,0.24,0.803333,0.8,0.776667,0.775,1.02,1.8,24,16,1,0,-0.02,-0.02,0.22,-0.01,-0.01,0,0,0,0,0,32
1,1,2,0,6,0,1,0.22,0.8,1,0.5,0.866025,-0.781831,0.62349,32,40,16,1,0,0.22,0.24,0.8,0.81,0.226667,0.22,0.233333,0.24,0.803333,0.8,0.776667,0.775,1.02,2.8,-8,16,1,0,0.0,-0.02,0.22,0.0,-0.01,0,0,0,0,0,13


In [28]:
trainer = ModelTrainer(data=features_df)

2025-10-08 20:55:29 - mlflow_tracker - [INFO] - Set MLflow tracking URI to: http://localhost:5001
2025-10-08 20:55:31 - mlflow_tracker - [INFO] - Created new experiment: bike rental (ID: 1)
2025-10-08 20:55:31 - mlflow_tracker - [INFO] - Set experiment to: bike rental (ID: 1)
2025-10-08 20:55:31 - mlflow_tracker - [INFO] - Initialized MLFlowTracker with experiment: bike rental
Shapes -> x_train: (11260, 47), y_train: (11260,), x_val: (1252, 47), y_val: (1252,), x_test: (1391, 47), y_test: (1391,)
2025-10-08 20:55:31 - trainer - [INFO] - Data preparation complete.


In [None]:
import mlflow

from src.exceptions import CustomError, HyperparameterTuningError, TrainingError

# End the currently active MLflow run, if any (presumably to clear a run left
# open by an earlier execution of this notebook).
mlflow.end_run()
try:
    # Build a trainer on the engineered features and fit every model
    trainer = ModelTrainer(data=features_df)
    trainer.train_all_models()
except TrainingError as e:
    print(f"Training failed: {e}")
    # NOTE(review): no fallback to a previous model is implemented here; the
    # error is printed and then swallowed.
except HyperparameterTuningError as e:
    print(f"Tuning failed, using defaults: {e}")
    # NOTE(review): the message promises defaults, but this handler does not
    # retrain anything.
except CustomError as e:
    # Presumably the base class of the project's exceptions (confirm in
    # src.exceptions), hence listed last.
    print(f"Pipeline error: {e}")
    raise  # propagate so the cell visibly fails

2025-10-08 21:11:40 - mlflow_tracker - [INFO] - Set MLflow tracking URI to: http://localhost:5001


[autoreload of src.exp_tracking.mlflow failed: Traceback (most recent call last):
  File "/Users/mac/Desktop/Projects/Bike-Rental-Prediction/.venv/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 325, in check
    superreload(m, reload, self.old_objects)
  File "/Users/mac/Desktop/Projects/Bike-Rental-Prediction/.venv/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 580, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "/Users/mac/.local/share/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/importlib/__init__.py", line 131, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 866, in _exec
  File "<frozen importlib._bootstrap_external>", line 999, in exec_module
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "/Users/mac/Desktop/Projects/Bike-Rental-Prediction/src/exp_tracking/mlflow.py", line 18, in <module>
    from src.exceptions

2025-10-08 21:11:40 - mlflow_tracker - [INFO] - Set experiment to: bike rental (ID: 1)


INFO:mlflow_tracker:Set experiment to: bike rental (ID: 1)


2025-10-08 21:11:40 - mlflow_tracker - [INFO] - Initialized MLFlowTracker with experiment: bike rental


INFO:mlflow_tracker:Initialized MLFlowTracker with experiment: bike rental


Shapes -> x_train: (11260, 47), y_train: (11260,), x_val: (1252, 47), y_val: (1252,), x_test: (1391, 47), y_test: (1391,)
2025-10-08 21:11:40 - trainer - [INFO] - Data preparation complete.


INFO:trainer:Data preparation complete.


2025-10-08 21:11:40 - trainer - [INFO] - 🚀 Training with default hyperparameters


INFO:trainer:🚀 Training with default hyperparameters


2025-10-08 21:11:40 - mlflow_tracker - [INFO] - Started MLflow run: b211c60188564672a269a00a02f0184a (name: run_2025-10-08T21:11:40)


INFO:mlflow_tracker:Started MLflow run: b211c60188564672a269a00a02f0184a (name: run_2025-10-08T21:11:40)


2025-10-08 21:11:40 - trainer - [INFO] - Training Random Forest ...


INFO:trainer:Training Random Forest ...


2025-10-08 21:11:40 - trainer - [INFO] - Starting Random Forest training with TimeSeriesSplit cross-validation.


INFO:trainer:Starting Random Forest training with TimeSeriesSplit cross-validation.


2025-10-08 21:12:29 - mlflow_tracker - [INFO] - ✅ Successfully logged ModelType.RANDOM_FOREST model and metadata


INFO:mlflow_tracker:✅ Successfully logged ModelType.RANDOM_FOREST model and metadata


2025-10-08 21:12:29 - trainer - [INFO] - 🚀 Random Forest training completed successfully.


INFO:trainer:🚀 Random Forest training completed successfully.


2025-10-08 21:12:29 - trainer - [INFO] - Training XGBoost ...


INFO:trainer:Training XGBoost ...


[0]	train-rmse:142.48170+0.56817	test-rmse:142.63249+2.24840
[1]	train-rmse:129.85876+0.50282	test-rmse:130.08309+2.16342
[2]	train-rmse:118.56982+0.45545	test-rmse:118.91451+2.04287
[3]	train-rmse:108.45974+0.45573	test-rmse:109.01153+1.94537
[4]	train-rmse:99.37948+0.40943	test-rmse:100.10433+1.89918
[5]	train-rmse:91.22467+0.35162	test-rmse:92.11303+1.84627
[6]	train-rmse:83.95013+0.34549	test-rmse:84.98993+1.66377
[7]	train-rmse:77.48305+0.34584	test-rmse:78.72044+1.57090
[8]	train-rmse:71.70569+0.30600	test-rmse:73.11866+1.46897
[9]	train-rmse:66.60278+0.25523	test-rmse:68.19763+1.42540
[10]	train-rmse:61.99538+0.28571	test-rmse:63.72915+1.30815
[11]	train-rmse:57.92426+0.27307	test-rmse:59.82035+1.22944
[12]	train-rmse:54.31092+0.27057	test-rmse:56.39173+1.11397
[13]	train-rmse:51.19342+0.29264	test-rmse:53.45541+1.04063
[14]	train-rmse:48.41713+0.30737	test-rmse:50.87720+0.91537
[15]	train-rmse:45.95426+0.26987	test-rmse:48.59110+0.85173
[16]	train-rmse:43.80155+0.27064	test-rms

INFO:mlflow_tracker:✅ Successfully logged ModelType.XGBOOST model and metadata


2025-10-08 21:12:33 - trainer - [INFO] - 🚀 XGBoost training completed successfully.


INFO:trainer:🚀 XGBoost training completed successfully.


2025-10-08 21:12:33 - trainer - [INFO] - Training LightGBM ...


INFO:trainer:Training LightGBM ...


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003021 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4083
[LightGBM] [Info] Number of data points in the train set: 11260, number of used features: 47
[LightGBM] [Info] Start training from score 152.730817
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[86]	training's rmse: 22.903	training's l1: 15.3908	valid_1's rmse: 58.5008	valid_1's l1: 36.5196
2025-10-08 21:12:34 - mlflow_tracker - [INFO] - ✅ Successfully logged ModelType.LIGHTGBM model and metadata


INFO:mlflow_tracker:✅ Successfully logged ModelType.LIGHTGBM model and metadata


2025-10-08 21:12:34 - trainer - [INFO] - 🚀 LightGBM training completed successfully.


INFO:trainer:🚀 LightGBM training completed successfully.


2025-10-08 21:12:34 - trainer - [INFO] - ✅ ALL models training completed successfully.


INFO:trainer:✅ ALL models training completed successfully.


2025-10-08 21:12:34 - visualization - [INFO] - Saved grouped metrics chart to /Users/mac/Desktop/Projects/Bike-Rental-Prediction/reports/model_metrics_comparison_2025-10-08T21:12:34.html


INFO:visualization:Saved grouped metrics chart to /Users/mac/Desktop/Projects/Bike-Rental-Prediction/reports/model_metrics_comparison_2025-10-08T21:12:34.html


2025-10-08 21:12:34 - trainer - [INFO] - ✅ Successfully generated visualizations.


INFO:trainer:✅ Successfully generated visualizations.


🏃 View run run_2025-10-08T21:11:40 at: http://localhost:5001/#/experiments/1/runs/b211c60188564672a269a00a02f0184a
🧪 View experiment at: http://localhost:5001/#/experiments/1
2025-10-08 21:12:36 - mlflow_tracker - [INFO] - Ended MLflow run with status: FINISHED


INFO:mlflow_tracker:Ended MLflow run with status: FINISHED


2025-10-08 21:12:36 - mlflow_tracker - [INFO] - Ended MLflow run with status: FINISHED


INFO:mlflow_tracker:Ended MLflow run with status: FINISHED


2025-10-08 21:12:36 - trainer - [INFO] - Syncing artifacts to S3...


INFO:trainer:Syncing artifacts to S3...




Downloading artifacts:   0%|          | 0/13 [00:00<?, ?it/s]

2025-10-08 21:12:37 - mlflow_s3_utils - [INFO] - Synced models/feat_imp_xgboost.json to S3


INFO:mlflow_s3_utils:Synced models/feat_imp_xgboost.json to S3


2025-10-08 21:12:37 - mlflow_s3_utils - [INFO] - Synced models/feat_imp_random_forest.json to S3


INFO:mlflow_s3_utils:Synced models/feat_imp_random_forest.json to S3


2025-10-08 21:12:37 - mlflow_s3_utils - [INFO] - Synced models/feat_imp_lightgbm.json to S3


INFO:mlflow_s3_utils:Synced models/feat_imp_lightgbm.json to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Synced models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_metadata.yaml to S3


INFO:mlflow_s3_utils:Synced models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_metadata.yaml to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Synced models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_model.txt to S3


INFO:mlflow_s3_utils:Synced models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_model.txt to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Synced models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_input_example.json to S3


INFO:mlflow_s3_utils:Synced models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_input_example.json to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Synced models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-08T21:12:29_model.pkl to S3


INFO:mlflow_s3_utils:Synced models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-08T21:12:29_model.pkl to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Synced models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-08T21:12:29_input_example.json to S3


INFO:mlflow_s3_utils:Synced models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-08T21:12:29_input_example.json to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Synced models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-08T21:12:29_metadata.yaml to S3


INFO:mlflow_s3_utils:Synced models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-08T21:12:29_metadata.yaml to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Synced models/ModelType.XGBOOST/ModelType.XGBOOST_2025-10-08T21:12:33_metadata.yaml to S3


INFO:mlflow_s3_utils:Synced models/ModelType.XGBOOST/ModelType.XGBOOST_2025-10-08T21:12:33_metadata.yaml to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Synced models/ModelType.XGBOOST/ModelType.XGBOOST_2025-10-08T21:12:33_model.json to S3


INFO:mlflow_s3_utils:Synced models/ModelType.XGBOOST/ModelType.XGBOOST_2025-10-08T21:12:33_model.json to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Synced models/ModelType.XGBOOST/ModelType.XGBOOST_2025-10-08T21:12:33_input_example.json to S3


INFO:mlflow_s3_utils:Synced models/ModelType.XGBOOST/ModelType.XGBOOST_2025-10-08T21:12:33_input_example.json to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Synced visualizations/reports/model_metrics_comparison_2025-10-08T21:12:34.html to S3


INFO:mlflow_s3_utils:Synced visualizations/reports/model_metrics_comparison_2025-10-08T21:12:34.html to S3


2025-10-08 21:12:38 - mlflow_s3_utils - [INFO] - Successfully synced all artifacts for run b211c60188564672a269a00a02f0184a to S3


INFO:mlflow_s3_utils:Successfully synced all artifacts for run b211c60188564672a269a00a02f0184a to S3


2025-10-08 21:12:38 - trainer - [INFO] - ✅ Successfully synced artifacts to S3


INFO:trainer:✅ Successfully synced artifacts to S3


2025-10-08 21:12:38 - trainer - [INFO] - Verifying S3 artifact storage...


INFO:trainer:Verifying S3 artifact storage...


2025-10-08 21:12:38 - s3_verification - [INFO] - Found 13 artifacts in S3 for run b211c60188564672a269a00a02f0184a


INFO:s3_verification:Found 13 artifacts in S3 for run b211c60188564672a269a00a02f0184a


2025-10-08 21:12:38 - s3_verification - [INFO] - Artifacts: models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_input_example.json, models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_metadata.yaml, models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_model.txt, models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-08T21:12:29_input_example.json, models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-08T21:12:29_metadata.yaml...


INFO:s3_verification:Artifacts: models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_input_example.json, models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_metadata.yaml, models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-08T21:12:34_model.txt, models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-08T21:12:29_input_example.json, models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-08T21:12:29_metadata.yaml...


2025-10-08 21:12:38 - s3_verification - [INFO] - ✅ S3 artifact verification PASSED


INFO:s3_verification:✅ S3 artifact verification PASSED


2025-10-08 21:12:38 - s3_verification - [INFO] -   - Artifact URI: s3://mlflow-artifacts/1/b211c60188564672a269a00a02f0184a/artifacts


INFO:s3_verification:  - Artifact URI: s3://mlflow-artifacts/1/b211c60188564672a269a00a02f0184a/artifacts


2025-10-08 21:12:38 - s3_verification - [INFO] -   - Total artifacts: 13


INFO:s3_verification:  - Total artifacts: 13


In [None]:
import mlflow

mlflow.end_run()

In [None]:
params = {
    "objective": "reg:squarederror",  # for regression
    "learning_rate": 0.1,
    "max_depth": 6,
}
res_xgb = trainer._train_xgboost(params=params)
# res = trainer._hyperparameter_tuning_xgboost()

In [None]:
res_xgb

In [None]:
from src.ml.utils import (
    create_metrics_df,
    split_into_train_test_sets,
    split_into_train_val_test_sets,
    split_temporal_data,
)

In [None]:
# Split the engineered features into train/validation/test partitions with the
# helper imported from src.ml.utils, then unpack the lower-case `x_*`/`y_*`
# entries.
# NOTE(review): a later cell defines a notebook-local function with the same
# name but a `features_df` parameter and upper-case "X_*" keys; once that cell
# has run, re-executing this one fails until the kernel is restarted.
# NOTE: `y_train` / `y_test` rebind the synthetic targets created earlier.
splits_dict: dict[str, Any] = split_into_train_val_test_sets(
    data=features_df, target_col="target", test_size=0.1
)

x_train, y_train = splits_dict["x_train"], splits_dict["y_train"]
x_val, y_val = splits_dict["x_val"], splits_dict["y_val"]
x_test, y_test = splits_dict["x_test"], splits_dict["y_test"]

In [None]:
import lightgbm as lgb

# Create datasets for LightGBM; the validation set references the training
# set via `reference=lgb_train`
lgb_train = lgb.Dataset(x_train, y_train)
lgb_val = lgb.Dataset(x_val, y_val, reference=lgb_train)

# Define hyperparameters (this rebinds the shared `params` name once more)
params = {
    "objective": "regression",
    "metric": ["rmse", "mae"],
    "learning_rate": 0.1,
    # "num_leaves": 31,
    # Early stopping is requested through the params dict rather than a callback
    "early_stopping_rounds": 20,
    "verbose": 1,
}
# Upper bound on boosting rounds
num_round: int = 100
# Train the model, evaluating on both the training and the validation data
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=num_round,
    valid_sets=[lgb_train, lgb_val],
)

# Predict with the iteration the booster recorded as best
lgb_preds = lgb_model.predict(x_test, num_iteration=lgb_model.best_iteration)
# Evaluate on the held-out test partition
lgb_rmse = root_mean_squared_error(y_test, lgb_preds)
print(f"LightGBM RMSE: {lgb_rmse:.3f}")

In [None]:
# res = trainer._train_lightgbm(params=params)

res = trainer.train_all_models()

In [None]:
res

In [None]:
create_metrics_df(res)

In [None]:
dir(lgb_model)

# lgb_model.save_model("lightgbm_model.txt")

# # Load the model
# loaded_lgb_model = lgb.Booster(model_file="lightgbm_model.txt")

lgb_model.feature_importance(), lgb_model.feature_name()
len(lgb_model.feature_name()), len(lgb_model.feature_importance())

In [None]:
# Raw importance vectors: 'split' counts how often a feature is used to split,
# 'gain' sums the gain of those splits
feature_importance_split = lgb_model.feature_importance(importance_type="split")
feature_importance_gain = lgb_model.feature_importance(importance_type="gain")

# Feature names come from the split metadata. The Booster's own
# `feature_name()` used to be read here and was overwritten on the very next
# line, so that dead assignment has been dropped.
feature_names = splits_dict["columns"]


# Create a comprehensive feature importance analysis
def get_lightgbm_feature_importance(
    model: lgb.Booster, features: list[str]
) -> pl.DataFrame:
    """
    Tabulate split- and gain-based feature importance of a LightGBM booster.

    Parameters
    ----------
    model : lgb.Booster
        Trained LightGBM model
    features : list[str]
        Names to attach to the importance values, in the booster's feature order

    Returns
    -------
    pl.DataFrame
        One row per feature with the raw ``split_importance`` and
        ``gain_importance`` plus each of them divided by its column total
        (``*_normalized``)
    """
    # Raw scores straight from the booster, paired with the supplied names
    raw_scores = {
        "feature": features,
        "split_importance": model.feature_importance(importance_type="split"),
        "gain_importance": model.feature_importance(importance_type="gain"),
    }

    # Share of the column total for each importance flavour
    split_col = pl.col("split_importance")
    gain_col = pl.col("gain_importance")
    split_share = (split_col / split_col.sum()).alias("split_importance_normalized")
    gain_share = (gain_col / gain_col.sum()).alias("gain_importance_normalized")

    return pl.DataFrame(raw_scores).with_columns(split_share, gain_share)


# Analyze feature importance
importance_results = get_lightgbm_feature_importance(
    model=lgb_model, features=feature_names
)

importance_results

In [None]:
from src.ml.utils import get_model_feature_importance


get_model_feature_importance(
    model_name="LightGBM",
    features=feature_names,
    weights=importance_results["gain_importance_normalized"],
    n=15,
)

In [None]:
params: dict[str, Any] = {
    "objective": "reg:squarederror",
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42,
}

# Use the bike-rental partitions (lower-case x_*) throughout. The upper-case
# X_train / X_test names still hold the synthetic 10-column arrays from the
# RandomForest demo, whose row and feature counts do not match
# y_train / x_val / y_test.
dtrain = xgb.DMatrix(x_train, y_train, enable_categorical=True)
# Validation labels are attached so the booster can be scored every round
dval = xgb.DMatrix(x_val, y_val, enable_categorical=True)
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, "train"), (dval, "val")],
    early_stopping_rounds=50,
    verbose_eval=False,
)

# Predict on the held-out test features
dtest = xgb.DMatrix(x_test, enable_categorical=True)
xgb_preds = xgb_model.predict(dtest)
# Evaluate
xgb_rmse = root_mean_squared_error(y_test, xgb_preds)
print(f"XGBoost RMSE: {xgb_rmse:.3f}")

In [None]:
# Scikit-learn API
params: dict[str, Any] = {
    "objective": "reg:squarederror",
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "early_stopping_rounds": 20,
}

xgb_reg = xgb.XGBRegressor(**params)
# Fit on the bike-rental training partition (lower-case x_train). The
# upper-case X_train is the synthetic array from the RandomForest demo and
# matches neither y_train nor the validation features.
xgb_reg.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)

# Evaluate on the matching test partition
y_preds = xgb_reg.predict(x_test)
xgb_rmse = root_mean_squared_error(y_test, y_preds)
console.print(f"XGBoost RMSE: {xgb_rmse:.3f}", style="info")

In [None]:
from narwhals.typing import IntoDataFrameT


def split_into_train_val_test_sets(
    features_df: IntoDataFrameT, test_size: float, target_col: str
) -> dict[str, Any]:
    """
    Produce train, validation and test partitions from a feature dataframe.

    The frame is first divided with ``split_temporal_data`` into a
    train-plus-validation part and a test part; the former is divided again,
    with the same ``test_size``, into train and validation.

    Parameters
    ----------
    features_df : IntoDataFrameT
        Feature dataframe that includes the target column.
    test_size : float
        Fraction passed to both temporal splits. Must lie strictly between 0 and 1.
    target_col : str
        Name of the column to predict.

    Returns
    -------
    dict[str, Any]
        Keys ``X_train``, ``y_train``, ``X_val``, ``y_val``, ``X_test``,
        ``y_test`` and ``columns`` (the feature column names).

    Raises
    ------
    ValueError
        If ``test_size`` is outside (0, 1), if the two splits disagree on the
        feature columns, or if train and validation rows do not add up to the
        train-plus-validation row count.
    """
    # NOTE: this notebook-local definition shadows the function of the same
    # name imported from src.ml.utils earlier in the notebook; that earlier
    # call site passes `data=` and reads lower-case "x_*" keys.
    is_valid_fraction = 0 < test_size < 1
    if not is_valid_fraction:
        raise ValueError("`test_size` must be between 0 and 1 (exclusive).")

    # Two successive splits using the same fraction
    train_and_val_df, holdout_df = split_temporal_data(
        data=features_df, test_size=test_size
    )
    fit_df, validation_df = split_temporal_data(
        data=train_and_val_df, test_size=test_size
    )

    # Turn each pair of frames into feature/target arrays
    fit_vs_val = split_into_train_test_sets(
        train_df=fit_df, test_df=validation_df, target_col=target_col
    )
    full_vs_test = split_into_train_test_sets(
        train_df=train_and_val_df, test_df=holdout_df, target_col=target_col
    )

    splits: dict[str, Any] = {
        "X_train": fit_vs_val["x_train"],
        "y_train": fit_vs_val["y_train"],
        "X_val": fit_vs_val["x_test"],
        "y_val": fit_vs_val["y_test"],
        "X_test": full_vs_test["x_test"],
        "y_test": full_vs_test["y_test"],
    }

    # Sanity checks: identical feature columns, and no rows lost or duplicated
    feature_columns = fit_vs_val["columns"]
    if feature_columns != full_vs_test["columns"]:
        raise ValueError("Feature column mismatch between validation and test splits.")
    n_train_and_val_rows = splits["X_train"].shape[0] + splits["X_val"].shape[0]
    if n_train_and_val_rows != full_vs_test["x_train"].shape[0]:
        raise ValueError("Train and validation rows do not sum to the expected total.")

    shape_report = ", ".join(
        f"{label}: {array.shape}" for label, array in splits.items()
    )
    print("Shapes -> " + shape_report)

    return {**splits, "columns": feature_columns}

In [None]:
# Test the callback logic
import optuna
from src.ml.trainer import ModelTrainer


def test_champion_callback() -> None:
    """Drive `ModelTrainer.champion_callback` with a hand-fed Optuna study.

    A minimising study is fed a fixed sequence of objective values through the
    ask/tell interface. After every trial the callback is invoked with the
    study and the newest trial, and the study's bookkeeping is checked.

    Raises
    ------
    AssertionError
        If the newest trial does not carry the value just reported, or if the
        study's best value is not the minimum of the values reported so far.
    """

    # Create a study for testing; lower objective values are better
    study = optuna.create_study(direction="minimize")

    # Mix of improvements (95.0, 90.0, 89.8) and regressions (95.5, 92.0)
    test_values = [100.0, 95.0, 95.5, 90.0, 89.8, 92.0]

    for i, value in enumerate(test_values):
        # Open a trial and immediately report its objective value
        trial = study.ask()
        study.tell(trial, value)

        # Get the frozen (completed) trial that was just recorded
        frozen_trial = study.trials[-1]

        # Call the callback
        ModelTrainer.champion_callback(study, frozen_trial)

        # Previously this "test" only printed; pin the expected state instead
        expected_best = min(test_values[: i + 1])
        assert frozen_trial.value == value, (
            f"Trial {i} recorded {frozen_trial.value}, expected {value}"
        )
        assert study.best_value == expected_best, (
            f"Trial {i}: best is {study.best_value}, expected {expected_best}"
        )

        console.print(f"Trial {i}: Value={value}, Best={study.best_value}")


# Run the test
test_champion_callback()