# Lab For Experimentation

In [1]:
import json
import warnings
from typing import Any, Literal

import numpy as np
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named colour styles for Rich output; the keys can be passed as `style=...`
# or used as `[name]` markup tags when printing through `console`.
custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
# Shared console used by the cells below for themed printing.
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings: print floats with 4 digits of precision
np.set_printoptions(precision=4)

# Pandas settings: raise the display limits so wide/long frames are not truncated
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings: same idea for string length, column count and row count
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

# NOTE(review): this silences every warning for the whole kernel session,
# including deprecation and convergence warnings from the ML libraries.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

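# Auto-reload imported modules before each cell runs, so edits to local modules are picked up without restarting the kernel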
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    The target directory is found by climbing ``go_up`` levels from the
    current working directory. Its absolute path is placed at index 0 of
    ``sys.path`` and printed, so modules located there become importable.
    Calling the function again with the same target does not add a second
    entry.

    Params:
    -------
    go_up: int, default=1
        Number of parent levels to climb from the current working directory.
        Values of 0 or less resolve to the current working directory itself.

    Returns:
    --------
    None
    """
    import os
    import sys

    # Resolve against the working directory explicitly. The previous
    # `os.path.dirname(__name__)` always evaluated to "" (a module name has
    # no directory part), so the result was cwd-relative only by accident.
    parents = [os.pardir] * max(go_up, 0)
    target_dir = os.path.abspath(os.path.join(os.getcwd(), *parents))

    # Drop earlier copies first: re-running the cell must not pile up
    # duplicate entries, but the target still has to end up at index 0.
    sys.path[:] = [entry for entry in sys.path if entry != target_dir]
    sys.path.insert(0, target_dir)
    print(target_dir)


# Demo: reference `Any`, `Literal` and `json` once so that ruff does not strip
# these otherwise-unused imports. The bare annotations only declare types;
# they do not bind `name` or `category` to a value.
name: Any
category: Literal["A", "B", "C"]
json.loads('{"name": "Bike Rental Prediction", "category": "A"}')

{'name': 'Bike Rental Prediction', 'category': 'A'}

In [3]:
# Put the parent of the working directory on sys.path; the printed path is the
# directory that was added (presumably the repo root holding `src` — confirm).
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [5]:
# Smoke test: fit a random forest on synthetic Gaussian noise. Features and
# targets are drawn independently of each other, so the fitted model carries
# no real signal; the cell only checks that the scikit-learn stack runs.
SEED: int = 123

rng = np.random.default_rng(SEED)
x = rng.standard_normal(size=(1_000, 10))

X_train, X_test = train_test_split(x, test_size=0.2, random_state=SEED)
y_train = rng.standard_normal(size=(X_train.shape[0],))
y_test = rng.standard_normal(size=(X_test.shape[0],))

params: dict[str, Any] = {
    "n_estimators": 100,
    "max_depth": 10,
    # Seed the forest too: the data and the split were already seeded, but
    # bootstrap resampling inside the forest was not, so the fitted model and
    # its feature importances changed on every run.
    "random_state": SEED,
}

rf_reg = RandomForestRegressor(**params)

rf_reg.fit(X_train, y_train)
# rf_reg.score(X_test, y_test)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [6]:
# Importances of the fitted forest, one value per input column (10 here).
rf_reg.feature_importances_

array([0.1131, 0.1023, 0.0747, 0.1107, 0.0949, 0.1186, 0.0979, 0.1002,
       0.0955, 0.0921])

In [7]:
import xgboost as xgb

In [8]:
from src.config import app_config
from src.exceptions import (
    CustomError,
    MLFlowConnectionError,
)
from src.ml.feature_engineering import FeatureEngineer
from src.ml.trainer import ModelTrainer



In [9]:
import httpx

# Address of the local MLflow server probed by `check_mlflow` below.
port: int = 5001
url: str = f"http://localhost:{port}"


def check_mlflow(url: str, timeout: float = 2.0) -> bool:
    """Probe an MLflow endpoint with a single GET request and report the outcome.

    The outcome is printed through the notebook's themed ``console`` and also
    returned. Errors raised by the request are caught and reported instead of
    being propagated.

    Params:
    -------
    url: str
        Address to request.
    timeout: float, default=2.0
        Timeout forwarded to ``httpx.get``.

    Returns:
    --------
    bool
        True when the request completes and ``raise_for_status()`` does not
        raise; False for every handled failure.
    """
    try:
        resp = httpx.get(url, timeout=timeout)
        resp.raise_for_status()
        console.print("MLflow is accessible", style="success")
        return True

    except httpx.HTTPStatusError as e:
        failure = f"MLflow returned non-2xx status: {e.response.status_code} — {e}"

    except httpx.RequestError as e:
        # covers ConnectError, ReadTimeout, etc.
        failure = f"Network/connection error when contacting MLflow: {e}"

    except (MLFlowConnectionError, CustomError) as e:
        failure = f"Project-specific MLflow error: {e}"

    except Exception as e:
        failure = f"Unexpected error: {e}"

    # Exception text is arbitrary: print it with markup disabled so that any
    # square brackets in it are shown literally instead of being parsed as
    # Rich tags. The colour comes from `style=`, as in the other cells.
    console.print(failure, style="error", markup=False)
    return False


# Probe the local server; the returned bool is shown as the cell output.
check_mlflow(url)

True

In [10]:
# Load the bike data dump and derive the model features from it.
# NOTE(review): the path is relative to the notebook's working directory and
# climbs out of the repository, so it only resolves on a machine with the
# same folder layout — consider moving it into project configuration.
fp: str = "../../../../Documents/data_dump/bike_data/database.parquet"
data: pl.DataFrame = pl.read_parquet(fp)
console.print(f"Shape: {data.shape}", style="info")

# Feature settings are taken from the project's `app_config`.
feat_eng = FeatureEngineer()
features_df: pl.DataFrame = feat_eng.create_all_features(
    data=data, config=app_config.feature_config
)
# Preview the first rows of the engineered frame.
features_df.head()

season,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,is_weekend,sin_hour,cos_hour,sin_weekday,cos_weekday,cnt_lag_0hr,cnt_lag_1hr,cnt_lag_24hr,hr_lag_1hr,hr_lag_24hr,temp_lag_1hr,temp_lag_3hr,hum_lag_1hr,hum_lag_3hr,temp_rolling_mean_3hr,temp_rolling_median_3hr,temp_rolling_mean_6hr,temp_rolling_median_6hr,hum_rolling_mean_3hr,hum_rolling_median_3hr,hum_rolling_mean_6hr,hum_rolling_median_6hr,temp_plus_hum,hum_plus_hr,cnt_diff_1hr,cnt_diff_2hr,hr_diff_1hr,hr_diff_24hr,temp_diff_1hr,temp_diff_2hr,temp_diff_24hr,hum_diff_1hr,hum_diff_2hr,is_high_temp,is_high_hum,is_peak_hour,is_working_hour,is_business_hour,target
i64,i64,i64,i64,i64,i64,i64,f64,f64,i8,f64,f64,f64,f64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,f64,f64,f64,f64,i8,i8,i8,i8,i8,i64
1,1,0,0,6,0,1,0.24,0.81,1,0.0,1.0,-0.781831,0.62349,16,16,16,0,0,0.24,0.24,0.81,0.81,0.226667,0.22,0.233333,0.24,0.803333,0.8,0.776667,0.775,1.05,0.81,24,16,1,0,-0.02,-0.02,0.22,-0.01,-0.01,0,0,0,0,0,40
1,1,1,0,6,0,1,0.22,0.8,1,0.258819,0.965926,-0.781831,0.62349,40,16,16,0,0,0.24,0.24,0.81,0.81,0.226667,0.22,0.233333,0.24,0.803333,0.8,0.776667,0.775,1.02,1.8,24,16,1,0,-0.02,-0.02,0.22,-0.01,-0.01,0,0,0,0,0,32
1,1,2,0,6,0,1,0.22,0.8,1,0.5,0.866025,-0.781831,0.62349,32,40,16,1,0,0.22,0.24,0.8,0.81,0.226667,0.22,0.233333,0.24,0.803333,0.8,0.776667,0.775,1.02,2.8,-8,16,1,0,0.0,-0.02,0.22,0.0,-0.01,0,0,0,0,0,13
1,1,3,0,6,0,1,0.24,0.75,1,0.707107,0.707107,-0.781831,0.62349,13,32,16,2,0,0.22,0.24,0.8,0.81,0.226667,0.22,0.233333,0.24,0.783333,0.8,0.776667,0.775,0.99,3.75,-19,-27,1,0,0.02,0.02,0.22,-0.05,-0.05,0,0,0,0,0,1
1,1,4,0,6,0,1,0.24,0.75,1,0.866025,0.5,-0.781831,0.62349,1,13,16,3,0,0.24,0.22,0.75,0.8,0.233333,0.24,0.233333,0.24,0.766667,0.75,0.776667,0.775,0.99,4.75,-12,-31,1,0,0.0,0.02,0.22,0.0,-0.05,0,0,0,0,0,1


In [11]:
from src.ml.utils import split_into_train_val_test_sets
from src.schemas.types import ModelType

# Build the project's trainer on the engineered features. Per the log output
# of this cell, construction also initialises MLflow tracking and prepares
# the train/val/test splits.
trainer = ModelTrainer(data=features_df)

2025-10-12 18:35:14 - mlflow_tracker - [INFO] - Set MLflow tracking URI to: http://localhost:5001
2025-10-12 18:35:15 - mlflow_tracker - [INFO] - Set experiment to: bike rental (ID: 1)
2025-10-12 18:35:15 - mlflow_tracker - [INFO] - Initialized MLFlowTracker with experiment: bike rental
Shapes -> x_train: (11260, 47), y_train: (11260,), x_val: (1252, 47), y_val: (1252,), x_test: (1391, 47), y_test: (1391,)
2025-10-12 18:35:15 - trainer - [INFO] - Data preparation complete.


In [17]:
# XGBoost parameters and DMatrix inputs for a manual cross-validation run.
# NOTE(review): in this cell these objects are only consumed by the
# commented-out `xgb.cv` call; `trainer.train_all_models()` is invoked
# without them. The cell also rebinds `params` and `y_train`, which the
# random-forest demo cell defined earlier with different contents.
params: dict[str, Any] = {
    "objective": "reg:squarederror",
    "learning_rate": 0.1,
    "max_depth": 6,
}
# Manual split with a 20% test share, keyed by "x_train", "y_train", ...
data_dict = split_into_train_val_test_sets(
    data=features_df, target_col="target", test_size=0.2
)
x_train, y_train = data_dict["x_train"], data_dict["y_train"]
x_val, y_val = data_dict["x_val"], data_dict["y_val"]
dtrain = xgb.DMatrix(x_train, y_train, enable_categorical=True)
dval = xgb.DMatrix(x_val, y_val, enable_categorical=True)

# cv = xgb.cv(
#     params=params,
#     dtrain=dtrain,
#     num_boost_round=500,
#     nfold=5,
#     metrics={"rmse"},
#     early_stopping_rounds=10,
#     seed=123,
# )


# trainer._train_lightgbm(params={})
# Train every model the trainer supports; the returned results are displayed
# as the cell output and not stored in a variable.
trainer.train_all_models()

Shapes -> x_train: (8897, 47), y_train: (8897,), x_val: (2225, 47), y_val: (2225,), x_test: (2781, 47), y_test: (2781,)
2025-10-12 18:44:59 - trainer - [INFO] - 🚀 Training with default hyperparameters
2025-10-12 18:44:59 - mlflow_tracker - [INFO] - Started MLflow run: 48ee16173ab447cc9c96e99ec0d7b8ef (name: run_2025-10-12T18:44:59)
2025-10-12 18:44:59 - trainer - [INFO] - Training Random Forest ...
2025-10-12 18:44:59 - trainer - [INFO] - Starting Random Forest training with TimeSeriesSplit cross-validation.
2025-10-12 18:45:34 - mlflow_tracker - [INFO] - ✅ Successfully logged ModelType.RANDOM_FOREST model and metadata
2025-10-12 18:45:34 - trainer - [INFO] - 🚀 Random Forest training completed successfully.
2025-10-12 18:45:34 - trainer - [INFO] - Training XGBoost ...
[0]	validation-rmse:224.50587
[10]	validation-rmse:115.61256
[20]	validation-rmse:83.71050
[30]	validation-rmse:72.53542
[40]	validation-rmse:67.24639
[50]	validation-rmse:64.51730
[60]	validation-rmse:63.21191
[70]	valid

Downloading artifacts:   0%|          | 0/13 [00:00<?, ?it/s]

2025-10-12 18:45:39 - mlflow_s3_utils - [INFO] - Synced models/feat_imp_xgboost.json to S3
2025-10-12 18:45:39 - mlflow_s3_utils - [INFO] - Synced models/feat_imp_random_forest.json to S3
2025-10-12 18:45:39 - mlflow_s3_utils - [INFO] - Synced models/feat_imp_lightgbm.json to S3
2025-10-12 18:45:39 - mlflow_s3_utils - [INFO] - Synced models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-12T18:45:36_model.txt to S3
2025-10-12 18:45:39 - mlflow_s3_utils - [INFO] - Synced models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-12T18:45:36_metadata.yaml to S3
2025-10-12 18:45:39 - mlflow_s3_utils - [INFO] - Synced models/ModelType.LIGHTGBM/ModelType.LIGHTGBM_2025-10-12T18:45:36_input_example.json to S3
2025-10-12 18:45:39 - mlflow_s3_utils - [INFO] - Synced models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-12T18:45:34_model.pkl to S3
2025-10-12 18:45:39 - mlflow_s3_utils - [INFO] - Synced models/ModelType.RANDOM_FOREST/ModelType.RANDOM_FOREST_2025-10-12T18:45:34_input_example.json

[TrainingResult(run_id='48ee16173ab447cc9c96e99ec0d7b8ef', model_name=<ModelType.RANDOM_FOREST: 'RandomForestRegressor'>, trained_model=RandomForestRegressor(max_depth=10, random_state=42), metrics={'RMSE': 50.76, 'MAE': 33.22, 'MAPE': 19.52, 'Adjusted_R2': 0.93}, predictions=[466.8060985983827, 475.0832171957173, 416.0327359553692, 476.7064927369419, 479.8867722249119, 377.90637890315617, 285.8485683100833, 261.85789614606904, 209.2862240343101, 101.52391300857943, 66.63866127251349, 17.80584101903513, 12.742919040191612, 7.270005866968362, 5.01031433859392, 23.180805100793787, 125.9030927564271, 404.44944007663366, 577.4083125553783, 298.6140310516323, 194.43423685102192, 182.99300061913524, 247.61368420967636, 264.86039975661356, 242.72175639998758, 278.4274167323012, 284.02164919957386, 658.5575863636084, 743.1920713836817, 551.3845303030303, 445.9395571902294, 292.6258770521684, 192.498408794636, 108.00564310634752, 40.83152814832559, 13.785506845925324, 8.518942020977427, 4.90239

In [None]:
# trainer.hyperparameter_tuning_all_models()

In [None]:
# Tune XGBoost only; swap the commented line to tune LightGBM instead.
# NOTE(review): this calls an underscore-prefixed trainer method directly;
# `trainer.hyperparameter_tuning_all_models()` looks like the public entry
# point — confirm which one is intended.
result = trainer._hyperparameter_tuning_xgboost()
# result = trainer._hyperparameter_tuning_lightgbm()

# Display the tuning result; later cells read `result["run_id"]`.
result

In [14]:
import mlflow

# Close the currently active MLflow run, if there is one, so a stale run does
# not stay open between experiments.
mlflow.end_run()

🏃 View run run_2025-10-12T18:35:15 at: http://localhost:5001/#/experiments/1/runs/1b9e993faf1d40bbb36fd2da78980fd2
🧪 View experiment at: http://localhost:5001/#/experiments/1


# Fetch the XGBoost artifact stored under "models" for the tuning run.
# NOTE(review): relies on `result` from the tuning cell exposing a "run_id"
# key — confirm the return type of the tuning method.
trainer.mlflow_tracker.load_model_artifact(
    run_id=result["run_id"],
    model_name=ModelType.XGBOOST.value,
    artifact_subpath="models",
)

In [None]:
from src.exp_tracking.model_loader import (
    load_best_model,
    load_model_from_run,
)

In [None]:
# Load the XGBoost model of the tuning run through the model_loader helper.
# NOTE(review): the enum member is passed here, whereas the earlier
# `load_model_artifact` call passes `ModelType.XGBOOST.value` — confirm the
# type each callee expects.
loaded_artifacts = load_model_from_run(
    run_id=result["run_id"], model_name=ModelType.XGBOOST
)

In [None]:
# Load the best model for the configured experiment ("best" as defined by
# `load_best_model`; its selection criterion is not visible in this notebook).
loaded = load_best_model(experiment_name=app_config.experiment_config.experiment_name)
# loaded

In [None]:
# Print the loaded object through the themed Rich console.
console.print(loaded)