# Lab For Experimentation

In [1]:
import warnings
from typing import Any, Literal

import numpy as np
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named styles for the shared rich console; later cells select them with
# `console.print(..., style="info")` and similar.
custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

# NOTE(review): this silences every warning category for the whole kernel,
# including deprecation notices from the libraries used below — consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory first on ``sys.path``.

    The directory ``go_up`` levels above the current working directory is
    moved to the front of ``sys.path`` and its absolute path is printed.
    Calling the function repeatedly leaves a single entry for that directory.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current
        directory. ``0`` selects the current working directory itself.

    Returns:
    --------
    None

    Raises:
    -------
    ValueError
        If ``go_up`` is negative.
    """
    import os
    import sys

    if go_up < 0:
        raise ValueError(f"go_up must be >= 0, got {go_up}")

    # Anchor on the working directory explicitly; `__name__` is a module
    # name ("__main__" in a notebook), not a filesystem path.
    target = os.path.abspath(os.path.join(os.getcwd(), *([os.pardir] * go_up)))

    # Drop any earlier copies so re-running the cell does not grow sys.path,
    # then place the directory first so its modules win import lookups.
    sys.path[:] = [entry for entry in sys.path if entry != target]
    sys.path.insert(0, target)
    print(target)


# Demo (Prevents ruff from removing the unused module import)
# These are bare annotations: they reference `Any` and `Literal` but bind no
# value to `name` or `category`.
name: Any
category: Literal["A", "B", "C"]

In [3]:
# Put the parent of the working directory first on sys.path.
# NOTE(review): the `from src...` imports in later cells presumably depend on
# this — confirm the notebook is launched from the expected directory.
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


In [4]:
from pathlib import Path

import mlflow

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [6]:
# Synthetic regression data: 1,000 rows of 10 standard-normal features.
rng = np.random.default_rng(123)
x = rng.standard_normal(size=(1_000, 10))

X_train, X_test = train_test_split(x, test_size=0.2, random_state=123)
# Targets are drawn from the same generator independently of the features,
# so there is no signal to learn; this cell only exercises the API.
y_train = rng.standard_normal(size=(X_train.shape[0],))
y_test = rng.standard_normal(size=(X_test.shape[0],))

params: dict[str, Any] = {
    "n_estimators": 100,
    "max_depth": 10,
    # Seed the forest's bootstrap/feature sampling so a fresh
    # Restart & Run All reproduces the same fitted model.
    "random_state": 123,
}

rf_reg = RandomForestRegressor(**params)

rf_reg.fit(X_train, y_train)
# rf_reg.score(X_test, y_test)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [7]:
import xgboost as xgb


# Create regression matrices
# Wrap the train/test arrays and their targets in XGBoost DMatrix objects.
# NOTE(review): the inputs come from `rng.standard_normal`, so there are no
# categorical columns here — confirm whether enable_categorical=True is needed.
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [8]:
# Define hyperparameters
# NOTE: this rebinds `params`, which an earlier cell used for the
# random-forest settings; from here on `params` holds the XGBoost settings.
params = {
    "objective": "reg:squarederror",  # for regression
    "eval_metric": "rmse",
    "learning_rate": 0.1,
    "max_depth": 6,
    "tree_method": "hist",  # NOTE(review): XGBoost >= 2.0 selects GPU with device="cuda" and keeps tree_method="hist"; 'gpu_hist' is deprecated — confirm against the installed version
}
# Number of boosting rounds for the base model.
n: int = 100

# Train the model
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
)

In [9]:
# Score the base model on the held-out matrix. `y_test` was generated
# independently of the features, so this RMSE reflects noise, not model skill.
preds = model.predict(dtest_reg)
rmse = root_mean_squared_error(y_test, preds)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 1.125


In [10]:
# Cross-validation
# 5-fold CV on the training matrix with the `params` defined above, capped at
# 20 boosting rounds. EvaluationMonitor prints per-round mean+std RMSE;
# EarlyStopping halts once the metric has not improved for 3 rounds.
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=20,
    nfold=5,
    metrics={"rmse"},
    seed=123,
    as_pandas=True,
    callbacks=[
        xgb.callback.EvaluationMonitor(show_stdv=True),
        xgb.callback.EarlyStopping(rounds=3),
    ],
)
console.print(cv_results)

[0]	train-rmse:0.97861+0.00793	test-rmse:1.00251+0.03045
[1]	train-rmse:0.95530+0.00922	test-rmse:0.99990+0.03146
[2]	train-rmse:0.93566+0.01087	test-rmse:0.99963+0.03280
[3]	train-rmse:0.91714+0.01274	test-rmse:1.00122+0.03374
[4]	train-rmse:0.90072+0.01365	test-rmse:1.00416+0.03115


In [11]:
# Step 1: Find the optimal number of boosting rounds
# NOTE(review): this uses the row count of `cv_results` as the round count;
# presumably early stopping truncates the history at the best iteration —
# confirm against the XGBoost version in use.
best_num_rounds: int = len(cv_results)
console.print(f"Optimal boosting rounds: {best_num_rounds}", style="success")
console.print(cv_results.tail())

# Step 2: Train final model with optimal rounds
final_model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=best_num_rounds,
)

# Step 3: Evaluate on test set
test_preds = final_model.predict(dtest_reg)
test_rmse = root_mean_squared_error(y_test, test_preds)
console.print(f"Test RMSE: {test_rmse:.4f}", style="info")


# Step 4: Save the model
# The relative filename means the file is written to the current working
# directory.
final_model.save_model("xgboost_model.json")

In [12]:
# Show which module defines the class of the trained model object.
type(final_model).__module__

'xgboost.core'

In [13]:
from src.config.config import app_config
from src.ml.feature_engineering import FeatureEngineer
from src.ml.trainer import ModelTrainer

In [14]:
# Load the bike-rental table and hand it to the project's ModelTrainer.
# NOTE(review): `fp` is a relative path that climbs four levels from the
# working directory into a user-specific folder; it will not resolve on
# another machine — consider a configurable data directory.
fp: str = "../../../../Documents/data_dump/bike_data/database.parquet"
data: pl.DataFrame = pl.read_parquet(fp)
console.print(f"Shape: {data.shape}", style="info")
display(data.head(2))

# The recorded output shows the constructor initialising an MLflow tracker
# and logging "Data preparation complete."
trainer = ModelTrainer(data, config=app_config.feature_config)

datetime,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64
"""2011-01-01 00:00:00""",1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
"""2011-01-01 01:00:00""",1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40


2025-10-03 21:34:09 - mlflow_tracker - [INFO] - Initialized MLFlowTracker with experiment: bike_rental_experiment
2025-10-03 21:34:09 - trainer - [INFO] - Data preparation complete.


In [15]:
# trainer.data_dict

In [16]:
# Display the feature-engineering configuration passed to ModelTrainer above.
app_config.feature_config

FeatureConfig(lag_features=[Lags(feature='cnt', lags=[0, 1, 24]), Lags(feature='hr', lags=[1, 24]), Lags(feature='temp', lags=[1, 3]), Lags(feature='hum', lags=[1, 3])], diff_features=[Diffs(feature='cnt', diffs=[1, 2]), Diffs(feature='hr', diffs=[1, 24]), Diffs(feature='temp', diffs=[1, 2, 24]), Diffs(feature='hum', diffs=[1, 2])], interaction_features=[InteractionFeats(feature_1='temp', feature_2='hum', operation='add'), InteractionFeats(feature_1='hum', feature_2='hr', operation='add')], rolling_features=[Windows(feature='temp', windows=[3, 6]), Windows(feature='hum', windows=[3, 6])], drop_features=['atemp', 'windspeed', 'casual', 'registered', 'datetime', 'cnt', 'yr'], target_col='cnt')

In [17]:
# data_dict: dict[str, Any] = trainer.prepare_data()

# NOTE(review): this calls an underscore-prefixed (private by convention)
# method directly; per the recorded output it starts MLflow runs and an
# in-memory study. Prefer a public entry point on ModelTrainer if one exists.
trainer._hyperparameter_tuning_random_forest()

2025-10-03 21:34:14 - mlflow_tracker - [INFO] - Started MLflow run: 84cb99a26f5b4318a5e2229cc69d306b (name: run_2025-10-03T21:34:13)


[I 2025-10-03 21:34:14,078] A new study created in memory with name: no-name-7169ee88-fe0f-4db7-a81c-4420f9fdda90


2025-10-03 21:34:14 - mlflow_tracker - [INFO] - Started MLflow run: b967688ff1c4482fa6ea9cf63cbdbcec (name: run_2025-10-03T21:34:14)
Trial 0: Mean RMSE = 60.67
🏃 View run run_2025-10-03T21:34:14 at: http://0.0.0.0:6060/#/experiments/941431890320196348/runs/b967688ff1c4482fa6ea9cf63cbdbcec
🧪 View experiment at: http://0.0.0.0:6060/#/experiments/941431890320196348
2025-10-03 21:35:26 - mlflow_tracker - [INFO] - Ended MLflow run with status: FINISHED


[I 2025-10-03 21:35:26,957] Trial 0 finished with value: 60.67 and parameters: {'n_estimators': 144, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': None, 'bootstrap': True}. Best is trial 0 with value: 60.67.


2025-10-03 21:35:26 - trainer - [INFO] - Initial trial 0 achieved value: 60.67
Best trial:
2025-10-03 21:35:27 - mlflow_tracker - [INFO] - ✅ Successfully logged RandomForestRegressor model and metadata
🏃 View run run_2025-10-03T21:34:13 at: http://0.0.0.0:6060/#/experiments/941431890320196348/runs/84cb99a26f5b4318a5e2229cc69d306b
🧪 View experiment at: http://0.0.0.0:6060/#/experiments/941431890320196348
2025-10-03 21:35:27 - mlflow_tracker - [INFO] - Ended MLflow run with status: FINISHED


In [None]:
from src.exp_tracking.mlflow import MLFlowTracker

In [None]:
# port: int = 6060
# url: str = f"http://0.0.0.0:{port}"

# mlflow.set_tracking_uri(url)
# experiment_name: str = "Demo Experiment"

In [None]:
from enum import Enum
from typing import Callable

type WriteFn = Callable[[Any, Path], None]


class ArtifactsType(str, Enum):
    """Artifact serialisation formats; each value doubles as a file extension.

    The ``str`` mix-in makes every member compare equal to its plain-string
    value (for example ``ArtifactsType.JSON == "json"``).
    """

    JSON = "json"
    TXT = "txt"
    YAML = "yaml"
    ANY = "joblib"

    def __str__(self) -> str:
        """Render the member as its raw value instead of ``ClassName.MEMBER``."""
        raw_value: str = self.value
        return raw_value


# Connection settings for the MLflow tracking server used by this notebook.
# NOTE(review): host and port are hard-coded; presumably a local server must
# already be listening here — confirm, or read the URI from configuration.
port: int = 6060
url: str = f"http://0.0.0.0:{port}"
experiment_name: str = "Demo Experiment"

# Project wrapper around MLflow; the trailing bare expression displays its repr.
mlflow_tracker = MLFlowTracker(experiment_name=experiment_name, tracking_uri=url)
mlflow_tracker

In [None]:
# Log the random-forest model, placeholder metrics, its hyperparameters and a
# small YAML metadata artifact inside one tracked run. The run name is given
# when the run is opened, so no second start_run() is issued inside it.
with mlflow_tracker.start_run(
    tags={"mlflow.runName": "Initial model training"}
) as run:
    # Log the model
    mlflow_tracker.log_model(
        model_name="RandomForestRegressor",
        model=rf_reg,
        input_example=pl.from_numpy(X_test),
    )
    # mlflow.sklearn.log_model(sk_model=rf_reg, input_example=X_test, name="rf_reg")

    # Log the metrics (hard-coded demo values, not computed from the model)
    mlflow_tracker.log_metrics(
        {"mse": 45.0, "rmse": 5.0, "mae": 4.0, "r2": 0.8, "msle": 0.1, "medae": 3.0}
    )

    # Log the hyperparameter
    # `params` was rebound to the XGBoost settings in an earlier cell, so read
    # the forest's hyperparameters from the estimator that is being logged.
    mlflow_tracker.log_params(params=rf_reg.get_params())

    # Log plots
    # mlflow.log_figure(fig1, "time_series_demand.png")
    # mlflow.log_figure(fig2, "box_weekend.png")

    # Log artifacts saved in the local file system
    # mlflow_tracker.log_mlflow_artifact()
    # log_mlflow_artifact("./plots/correlation_wf_target.png", artifact_dest="plots")

    mlflow_tracker.log_mlflow_artifact(
        object={"name": "wuraola", "role": "medical doctor", "age": 29},
        object_type=ArtifactsType.YAML,
        filename="my_metadata",
        artifact_dest=None,
    )

In [None]:
from omegaconf import DictConfig, OmegaConf

from src.config.config import app_config

# Load the project's YAML configuration with OmegaConf and compare it with the
# already-constructed `app_config` object.
# NOTE: `fp` is reused here; other cells bind the same name to the parquet
# data path.
fp: str = "../src/config/config.yaml"


# Only the top-level `config` node of the YAML document is kept.
config: DictConfig = OmegaConf.load(fp).config

# resolve=True expands OmegaConf interpolations while converting the node to
# plain Python containers.
cfg = OmegaConf.to_container(config, resolve=True)
console.print(cfg)
app_config

In [None]:
# Reload the bike-rental table and run the project's feature engineering on it.
# NOTE(review): this repeats the hard-coded, machine-specific path used in the
# earlier load cell — consider defining it once in a configuration cell.
fp: str = "../../../../Documents/data_dump/bike_data/database.parquet"
data: pl.DataFrame = pl.read_parquet(fp)
console.print(f"Shape: {data.shape}", style="info")

feat_eng = FeatureEngineer(data=data, config=app_config.feature_config)
console.print(feat_eng)

# Trailing expression: whatever create_all_features() returns is displayed as
# the cell output.
feat_eng.create_all_features()

In [None]:
import json
import tempfile
from datetime import datetime

import joblib
import yaml


def log_mlflow_artifact_v1(local_path: str, artifact_dest: str | None = None) -> None:
    """
    Log a local file to MLflow.

    Parameters
    ----------
    local_path : str
        Path to the local file to log.
    artifact_dest : str | None
        (Optional) Run-relative directory in the MLflow artifact store.
    """
    if not isinstance(local_path, Path):
        file_path = Path(local_path)
    if not file_path.is_file():
        raise FileNotFoundError(f"Cannot find artifact at {local_path}")
    # Log it under artifact_dest (or root if None)
    mlflow.log_artifact(str(file_path), artifact_path=artifact_dest)


def _get_run_name(run_name: str | None = None) -> str:
    if run_name is None:
        run_name = f"run_{datetime.now().isoformat(timespec='seconds')}"
    return run_name

In [None]:
# Demo: write a small JSON document into a temporary directory. The directory
# and the file inside it are removed when the outer `with` block exits, so the
# printed path no longer exists afterwards.
with tempfile.TemporaryDirectory() as tmpdir:
    my_path = Path(tmpdir, "my_file.json")
    print(my_path)
    with open(my_path, "w") as f:
        json.dump({"name": "wuraola", "role": "medical doctor"}, fp=f, indent=2)

In [None]:
from enum import Enum

type WriteFn = Callable[[Any, Path], None]


class ArtifactsType(str, Enum):
    """Supported on-disk formats for logged artifacts.

    Member values are used as file extensions by ``log_mlflow_artifact``.
    NOTE(review): an identical class is defined in an earlier cell; running
    this cell rebinds the name — consider keeping a single definition.
    """

    JSON = "json"
    TXT = "txt"
    YAML = "yaml"
    ANY = "joblib"

    def __str__(self) -> str:
        """Return the member's value so f-strings yield e.g. ``"yaml"``."""
        return self.value


def write_json(object: dict[str, Any] | Any, filepath: Path, indent: int = 2) -> None:
    with open(filepath, "w") as f:
        json.dump(object, fp=f, indent=indent)


def write_txt(object: list[Any], filepath: Path) -> None:
    """Write every item of ``object`` to ``filepath``, one item per line.

    Parameters
    ----------
    object : list[Any]
        Items to write. Each item is converted with ``str()`` formatting, so
        non-string values (numbers, paths, ...) are accepted.
    filepath : Path
        Destination file; opened in text write mode and overwritten.
    """
    with open(filepath, "w") as f:
        # Format instead of concatenating so non-str items do not raise
        # TypeError; str items are written exactly as before.
        f.writelines(f"{line}\n" for line in object)


def write_yaml(object: dict[str, Any] | Any, filepath: Path) -> None:
    """Dump ``object`` as YAML into ``filepath``, replacing any existing file.

    Parameters
    ----------
    object : dict[str, Any] | Any
        Value handed to ``yaml.dump``.
    filepath : Path
        Destination file; opened in text write mode.
    """
    with open(filepath, mode="w") as out_file:
        yaml.dump(object, stream=out_file)


def write_pickle(object: dict[str, Any] | Any, filepath: Path) -> None:
    """Persist ``object`` to ``filepath`` using ``joblib.dump``.

    Parameters
    ----------
    object : dict[str, Any] | Any
        Value to serialise.
    filepath : Path
        Destination file.
    """
    joblib.dump(value=object, filename=filepath)


def log_mlflow_artifact(
    object: Any,
    object_type: ArtifactsType,
    filename: str,
    artifact_dest: str | None = None,
) -> None:
    """
    Serialise an in-memory object to a temporary file and log it to MLflow.

    Parameters
    ----------
    object : Any
        The value to persist; it must be writable in the chosen format.
    object_type : ArtifactsType
        Output format. It selects the writer function and is also used as
        the file extension.
    filename : str
        Base name of the artifact; the logged file is named
        ``{filename}-artifact.{object_type}``.
    artifact_dest : str | None
        (Optional) Run-relative directory in the MLflow artifact store.

    Raises
    ------
    ValueError
        If ``object_type`` is not a supported ``ArtifactsType``.
    """
    # One writer per supported format; built at call time so it always refers
    # to the currently defined writer functions.
    writers = {
        ArtifactsType.JSON: write_json,
        ArtifactsType.TXT: write_txt,
        ArtifactsType.YAML: write_yaml,
        ArtifactsType.ANY: write_pickle,
    }
    try:
        # Normalise first so plain strings such as "json" keep working,
        # exactly as they did with equality comparisons.
        write_fn = writers[ArtifactsType(object_type)]
    except (ValueError, KeyError):
        raise ValueError(f"Unsupported object type: {object_type}") from None

    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir) / f"{filename}-artifact.{object_type}"
        write_fn(object, tmp_path)
        # Must run inside the `with` block: the temporary directory and the
        # file in it are deleted as soon as the block exits.
        mlflow.log_artifact(str(tmp_path), artifact_path=artifact_dest)

In [None]:
# mlflow.end_run()

# mlflow.set_experiment(experiment_name)
# with mlflow.start_run(run_name=_get_run_name()) as run:
#     # Log the model
#     mlflow.sklearn.log_model(sk_model=rf_reg, input_example=X_test, name="rf_reg")

#     # Log the metrics
#     mlflow.log_metrics(
#         {"mse": 45.0, "rmse": 5.0, "mae": 4.0, "r2": 0.8, "msle": 0.1, "medae": 3.0}
#     )

#     # Log the hyperparameter
#     mlflow.log_params(params=params)

#     # Log plots
#     # mlflow.log_figure(fig1, "time_series_demand.png")
#     # mlflow.log_figure(fig2, "box_weekend.png")

#     # Log artifacts saved in the local file system
#     log_mlflow_artifact("./plots/correlation_wf_target.png", artifact_dest="plots")
#     # log_mlflow_artifact("./my_metadata.json", artifact_dest="metadata")
#     log_mlflow_artifact(
#         object={"name": "wuraola", "role": "medical doctor"},
#         object_type=ArtifactsType.YAML,
#         filename="my_metadata",
#         artifact_dest=None,
#     )
# #

# Dataset Preparation

### Load data

In [None]:
# fp: str = "../../../../Documents/data_dump/bike_data/database.parquet"
# data: pl.DataFrame = pl.read_parquet(fp)
# console.print(f"Shape: {data.shape}", style="info")

# data.head()