# Lab For Experimentation

In [1]:
import warnings
from typing import Any, Literal

import numpy as np
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """This is used to up a number of directories.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current directory.

    Returns:
    --------
    None
    """
    import os
    import sys

    CONST: str = "../"
    NUM: str = CONST * go_up

    # Goto the previous directory
    prev_directory = os.path.join(os.path.dirname(__name__), NUM)
    # Get the 'absolute path' of the previous directory
    abs_path_prev_directory = os.path.abspath(prev_directory)

    # Add the path to the System paths
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)


# Demo (Prevents ruff from removing the unused module import)
name: Any
category: Literal["A", "B", "C"]

In [3]:
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [6]:
rng = np.random.default_rng(123)
x = rng.standard_normal(size=(1_000, 10))

X_train, X_test = train_test_split(x, test_size=0.2, random_state=123)
y_train = rng.standard_normal(size=(X_train.shape[0],))
y_test = rng.standard_normal(size=(X_test.shape[0],))

params: dict[str, Any] = {
    "n_estimators": 100,
    "max_depth": 10,
}

rf_reg = RandomForestRegressor(**params)

rf_reg.fit(X_train, y_train)
# rf_reg.score(X_test, y_test)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [7]:
rf_reg.feature_importances_

array([0.1128, 0.1059, 0.0788, 0.1145, 0.0952, 0.1121, 0.0879, 0.1043,
       0.0889, 0.0996])

In [8]:
import xgboost as xgb


# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [9]:
# Define hyperparameters
params = {
    "objective": "reg:squarederror",  # for regression
    "eval_metric": ["rmse", "mae", "mape"],
    "learning_rate": 0.1,
    "max_depth": 6,
    "tree_method": "hist",  # Use 'hist' for CPU, 'gpu_hist' for GPU
}
n: int = 100

# Train the model
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
)

In [10]:
preds = model.predict(dtest_reg)
rmse = root_mean_squared_error(y_test, preds)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 1.125


In [11]:
# Cross-validation
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=20,
    nfold=5,
    metrics={"rmse"},
    seed=123,
    as_pandas=True,
    callbacks=[
        xgb.callback.EvaluationMonitor(show_stdv=True),
        xgb.callback.EarlyStopping(rounds=3),
    ],
)
console.print(cv_results)

[0]	train-rmse:0.97861+0.00793	test-rmse:1.00251+0.03045
[1]	train-rmse:0.95530+0.00922	test-rmse:0.99990+0.03146
[2]	train-rmse:0.93566+0.01087	test-rmse:0.99963+0.03280
[3]	train-rmse:0.91714+0.01274	test-rmse:1.00122+0.03374
[4]	train-rmse:0.90072+0.01365	test-rmse:1.00416+0.03115
[5]	train-rmse:0.88468+0.01652	test-rmse:1.00457+0.03222


In [12]:
# Find the optimal number of boosting rounds
best_num_rounds: int = len(cv_results)
console.print(f"Optimal boosting rounds: {best_num_rounds}", style="success")
console.print(cv_results.tail())

# Step 2: Train final model with optimal rounds
final_model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=best_num_rounds,
)

# Step 3: Evaluate on test set
test_preds = final_model.predict(dtest_reg)
test_rmse = root_mean_squared_error(y_test, test_preds)
console.print(f"Test RMSE: {test_rmse:.4f}", style="info")


# Step 4: Save the model
final_model.save_model("xgboost_model.json")

In [13]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,0.978613,0.007929,1.002512,0.030454
1,0.955297,0.009222,0.999901,0.031463
2,0.935665,0.010871,0.99963,0.032802


In [14]:
type(final_model).__module__

'xgboost.core'

In [15]:
import httpx

try:
    async with httpx.AsyncClient() as client:
        r = await client.get("http://0.0.0.0:6060/#/experiments/list", timeout=10.0)

    if r.status_code == 200:
        console.print("MLflow Tracking Server is reachable!", style="success")

except Exception as e:
    console.print(f"Error: {e}", style="error")

In [16]:
from src.config.config import app_config
from src.ml.trainer import ModelTrainer

In [17]:
fp: str = "../../../../Documents/data_dump/bike_data/database.parquet"
data: pl.DataFrame = pl.read_parquet(fp)
console.print(f"Shape: {data.shape}", style="info")
display(data.head(2))

trainer = ModelTrainer(data, config=app_config.feature_config)

datetime,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64
"""2011-01-01 00:00:00""",1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
"""2011-01-01 01:00:00""",1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40


2025-10-04 15:17:52 - mlflow_tracker - [INFO] - Initialized MLFlowTracker with experiment: bike_rental
2025-10-04 15:17:52 - trainer - [INFO] - Data preparation complete.


In [28]:
trainer.train_models()

2025-10-04 15:43:56 - mlflow_tracker - [INFO] - Started MLflow run: 62adb61c37f148f2a2937c5a6ffaca1b (name: run_2025-10-04T15:43:56)
2025-10-04 15:43:56 - trainer - [INFO] - Starting Random Forest training with TimeSeriesSplit cross-validation.
2025-10-04 15:44:36 - trainer - [INFO] - Random Forest training completed successfully.
[0]	train-rmse:129.83396+0.49329	test-rmse:129.94107+2.02921
[1]	train-rmse:118.36803+0.41461	test-rmse:118.63513+1.90428
[2]	train-rmse:108.10708+0.34674	test-rmse:108.56962+1.72632
[3]	train-rmse:98.96398+0.29467	test-rmse:99.57121+1.53110
[4]	train-rmse:90.73389+0.23459	test-rmse:91.46168+1.35992
[5]	train-rmse:83.29393+0.22958	test-rmse:84.20065+1.18060
[6]	train-rmse:76.79318+0.18376	test-rmse:77.86377+1.02894
[7]	train-rmse:71.01306+0.13615	test-rmse:72.26725+0.95731
[8]	train-rmse:65.85531+0.09461	test-rmse:67.26429+0.88410
[9]	train-rmse:61.08189+0.22882	test-rmse:62.69357+0.67034
[10]	train-rmse:57.02937+0.22396	test-rmse:58.89221+0.58305
[11]	train-

In [19]:
xgb_result = trainer._train_xgboost()

[0]	train-rmse:129.83396+0.49329	test-rmse:129.94107+2.02921
[1]	train-rmse:118.36803+0.41461	test-rmse:118.63513+1.90428
[2]	train-rmse:108.10708+0.34674	test-rmse:108.56962+1.72632
[3]	train-rmse:98.96398+0.29467	test-rmse:99.57121+1.53110
[4]	train-rmse:90.73389+0.23459	test-rmse:91.46168+1.35992
[5]	train-rmse:83.29393+0.22958	test-rmse:84.20065+1.18060
[6]	train-rmse:76.79318+0.18376	test-rmse:77.86377+1.02894
[7]	train-rmse:71.01306+0.13615	test-rmse:72.26725+0.95731
[8]	train-rmse:65.85531+0.09461	test-rmse:67.26429+0.88410
[9]	train-rmse:61.08189+0.22882	test-rmse:62.69357+0.67034
[10]	train-rmse:57.02937+0.22396	test-rmse:58.89221+0.58305
[11]	train-rmse:53.40404+0.24720	test-rmse:55.45096+0.51982
[12]	train-rmse:50.07942+0.30721	test-rmse:52.32628+0.62545
[13]	train-rmse:47.20141+0.27382	test-rmse:49.67967+0.64396
[14]	train-rmse:44.66045+0.30742	test-rmse:47.38132+0.65134
[15]	train-rmse:42.41180+0.21813	test-rmse:45.30536+0.65606
[16]	train-rmse:40.36744+0.35379	test-rmse:4

In [21]:
booster_params = xgb_result["model"].save_config()
console.print(booster_params, style="info")

In [27]:
import json

json.loads(xgb_result["model"].save_config())

{'learner': {'generic_param': {'device': 'cpu',
   'fail_on_invalid_gpu_id': '0',
   'n_jobs': '0',
   'nthread': '0',
   'random_state': '42',
   'seed': '42',
   'seed_per_iteration': '0',
   'validate_parameters': '1'},
  'gradient_booster': {'gbtree_model_param': {'num_parallel_tree': '1',
    'num_trees': '200'},
   'gbtree_train_param': {'process_type': 'default',
    'tree_method': 'auto',
    'updater': 'grow_quantile_histmaker',
    'updater_seq': 'grow_quantile_histmaker'},
   'name': 'gbtree',
   'specified_updater': False,
   'tree_train_param': {'alpha': '0',
    'cache_opt': '1',
    'colsample_bylevel': '1',
    'colsample_bynode': '1',
    'colsample_bytree': '1',
    'eta': '0.100000001',
    'gamma': '0',
    'grow_policy': 'depthwise',
    'interaction_constraints': '',
    'lambda': '1',
    'learning_rate': '0.100000001',
    'max_bin': '256',
    'max_cat_threshold': '64',
    'max_cat_to_onehot': '4',
    'max_delta_step': '0',
    'max_depth': '6',
    'max_leav

In [23]:
rf_reg.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 10,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}