# Lab For Experimentation

In [1]:
import warnings
from typing import Any, Literal

import narwhals as nw
import numpy as np
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named colour styles for rich output; each key can be passed as
# `style="<key>"` to `console.print`.
custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
# Single console instance, created with the theme above, for styled printing.
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings: print floats with 4 digits of precision
np.set_printoptions(precision=4)

# Pandas settings: raise the display limits so wide/long frames are not truncated
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings: same idea (string length, number of columns, number of rows)
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

# NOTE(review): this silences every warning category for the whole kernel,
# including deprecation and convergence warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Black code formatter (Optional); needs the `lab_black` extension installed,
# otherwise this line fails — remove it if the extension is unavailable.
%load_ext lab_black

# auto reload imports: with mode 2, edits to imported modules are picked up
# without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory first on ``sys.path``.

    The resolved absolute path is also printed.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current
        working directory. Values of 0 or less select the working directory
        itself.

    Returns:
    --------
    None
    """
    import os
    import sys

    # Resolve relative to the working directory explicitly. The previous
    # `os.path.dirname(__name__)` only worked because it evaluates to "".
    hops = [os.pardir] * go_up
    target_dir = os.path.abspath(os.path.join(os.getcwd(), *hops))

    # Re-running the cell must not pile up duplicate entries: drop any existing
    # occurrence, then put the directory at the front so it keeps priority.
    sys.path[:] = [entry for entry in sys.path if entry != target_dir]
    sys.path.insert(0, target_dir)
    print(target_dir)


# Demo (Prevents ruff from removing the unused module import).
# These are bare annotations: no value is assigned, they only reference
# `Any` and `Literal` so the typing imports count as used.
name: Any
category: Literal["A", "B", "C"]

In [3]:
# Put the parent of the working directory on sys.path — presumably so the
# project-local `src` package imported further down resolves; confirm.
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


In [20]:
from pathlib import Path

import mlflow

In [None]:
# def get_or_create_experiment(experiment_name: str) -> str:
#     """
#     Retrieve an MLflow experiment ID for a named experiment.

#     Parameters
#     ----------
#     experiment_name : str
#         Name of the MLflow experiment.

#     Returns
#     -------
#     str
#         Identifier of the retrieved or newly created MLflow experiment.
#     """
#     if experiment := mlflow.get_experiment_by_name(experiment_name):
#         return experiment.experiment_id
#     return mlflow.create_experiment(experiment_name)


# def log_mlflow_artifact(local_path: str, artifact_dest: str | None = None) -> None:
#     """
#     Log a local file as an MLflow artifact.

#     Parameters
#     ----------
#     local_path : str
#         Path to the local file to log.
#     artifact_dest : str | None, optional
#         Run-relative directory within the MLflow artifact store.

#     Raises
#     ------
#     FileNotFoundError
#         If the provided local path does not exist or is not a file.
#     """
#     file_path = Path(local_path)
#     if not file_path.is_file():
#         raise FileNotFoundError(f"Cannot find artifact at {local_path}")
#     mlflow.log_artifact(str(file_path), artifact_path=artifact_dest)

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [7]:
# One seed for every stochastic step (data, split, forest) so that a fresh
# "Restart & Run All" reproduces the same fitted model.
SEED: int = 123

rng = np.random.default_rng(SEED)
x = rng.standard_normal(size=(1_000, 10))

X_train, X_test = train_test_split(x, test_size=0.2, random_state=SEED)
# Targets are drawn independently of the features: this is placeholder data
# used only to obtain a fitted estimator for the MLflow logging demo.
y_train = rng.standard_normal(size=(X_train.shape[0],))
y_test = rng.standard_normal(size=(X_test.shape[0],))

# These hyperparameters are also what gets logged to MLflow later on, so the
# seed is recorded alongside them.
params: dict[str, Any] = {
    "n_estimators": 100,
    "max_depth": 10,
    "random_state": SEED,
}

rf_reg = RandomForestRegressor(**params)

rf_reg.fit(X_train, y_train)
# rf_reg.score(X_test, y_test)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# Address of the MLflow tracking server.
# NOTE(review): 0.0.0.0 is normally a server bind address; as a client
# destination it may not work on every platform — consider 127.0.0.1 or
# localhost. TODO confirm against how the server is started.
port: int = 6060
url: str = f"http://0.0.0.0:{port}"

# Point the MLflow client at that server for all subsequent mlflow calls.
mlflow.set_tracking_uri(url)

In [74]:
# Name passed to `mlflow.set_experiment` in the run cell further down.
experiment_name: str = "Demo Experiment"

In [72]:
import json
import tempfile
from datetime import datetime

import joblib
import mlflow
import yaml


def log_mlflow_artifact_v1(local_path: str, artifact_dest: str | None = None) -> None:
    """
    Log a local file to MLflow.

    Parameters
    ----------
    local_path : str
        Path to the local file to log.
    artifact_dest : str | None
        (Optional) Run-relative directory in the MLflow artifact store.
    """
    if not isinstance(local_path, Path):
        file_path = Path(local_path)
    if not file_path.is_file():
        raise FileNotFoundError(f"Cannot find artifact at {local_path}")
    # Log it under artifact_dest (or root if None)
    mlflow.log_artifact(str(file_path), artifact_path=artifact_dest)


def _get_run_name(run_name: str | None = None) -> str:
    if run_name is None:
        run_name = f"run_{datetime.now().isoformat(timespec='seconds')}"
    return run_name

In [25]:
# Scratch check: write a small JSON file inside a temporary directory and
# print where it went. The directory (and the file in it) is removed when the
# outer `with` block exits.
with tempfile.TemporaryDirectory() as tmpdir:
    my_path = Path(tmpdir, "my_file.json")
    print(my_path)
    with open(my_path, "w") as f:
        json.dump({"name": "wuraola", "role": "medical doctor"}, fp=f, indent=2)

/var/folders/vv/g_5scsqs6fj18dr1q_bww19r0000gn/T/tmpzxyrleyl/my_file.json


In [73]:
from enum import Enum
from typing import Callable

# Common shape of the write_* helpers below: (object, destination path) -> None.
# Uses the `type` alias statement, which requires Python 3.12+.
type WriteFn = Callable[[Any, Path], None]


class ArtifactsType(str, Enum):
    """Serialisation formats for logged artifacts.

    Each member's value doubles as the file extension of the written artifact.
    """

    JSON = "json"
    TXT = "txt"
    YAML = "yaml"
    ANY = "joblib"  # arbitrary Python objects, dumped with joblib

    def __str__(self) -> str:
        """Render as the bare value (e.g. ``json``) rather than ``ArtifactsType.JSON``."""
        return self.value


def write_json(object: dict[str, Any] | Any, filepath: Path, indent: int = 2) -> None:
    with open(filepath, "w") as f:
        json.dump(object, fp=f, indent=indent)


def write_txt(object: list[Any], filepath: Path) -> None:
    """Write each item of ``object`` to ``filepath`` on its own line.

    Items are converted with ``str()`` before writing, so non-string entries
    (numbers, paths, ...) are accepted, as the ``list[Any]`` annotation
    promises. An empty list produces an empty file. Any existing file is
    replaced.
    """
    with open(filepath, "w") as f:
        # str() keeps string items unchanged and avoids the TypeError that
        # `line + "\n"` raised for anything that is not a str.
        f.writelines(str(line) + "\n" for line in object)


def write_yaml(object: dict[str, Any] | Any, filepath: Path) -> None:
    """Serialise ``object`` as YAML into ``filepath``, replacing any existing file."""
    with open(filepath, mode="w") as handle:
        yaml.dump(object, stream=handle)


def write_pickle(object: dict[str, Any] | Any, filepath: Path) -> None:
    """Persist ``object`` to ``filepath`` using ``joblib.dump``."""
    joblib.dump(value=object, filename=filepath)


def log_mlflow_artifact(
    object: Any,
    object_type: ArtifactsType,
    filename: str,
    artifact_dest: str | None = None,
) -> None:
    """
    Serialise an in-memory object to a temporary file and log it to MLflow.

    Parameters
    ----------
    object : Any
        The object to serialise. It must be something the writer selected by
        ``object_type`` can handle.
    object_type : ArtifactsType
        Serialisation format. Its value is also used as the file extension.
    filename : str
        Base name of the artifact; the logged file is named
        ``<filename>-artifact.<object_type>``.
    artifact_dest : str | None
        (Optional) Run-relative directory in the MLflow artifact store.

    Raises
    ------
    ValueError
        If ``object_type`` is not a supported ``ArtifactsType``.
    """
    # One writer per supported format; built per call so a re-defined writer
    # in the notebook is picked up immediately.
    writers: dict[ArtifactsType, WriteFn] = {
        ArtifactsType.JSON: write_json,
        ArtifactsType.TXT: write_txt,
        ArtifactsType.YAML: write_yaml,
        ArtifactsType.ANY: write_pickle,
    }

    # Normalising through the enum also accepts the plain string values
    # ("json", "txt", ...), which compared equal to the members before.
    try:
        artifact_type = ArtifactsType(object_type)
    except ValueError:
        raise ValueError(f"Unsupported object type: {object_type}") from None
    write_fn = writers[artifact_type]

    # The file only has to exist while MLflow copies it, so a temporary
    # directory is enough; it is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir) / f"{filename}-artifact.{artifact_type.value}"
        write_fn(object, tmp_path)
        # Pass a str, matching log_mlflow_artifact_v1 and MLflow's documented
        # `local_path: str` parameter.
        mlflow.log_artifact(str(tmp_path), artifact_path=artifact_dest)

In [75]:
# Smoke test: serialise a small dict as JSON and log it at the artifact root
# (artifact_dest=None).
log_mlflow_artifact(
    object={"name": "wuraola", "role": "medical doctor"},
    object_type=ArtifactsType.JSON,
    filename="my_metadata",
    artifact_dest=None,
)

In [40]:
# Two sample payloads under distinct names. Previously both were bound to
# `dt_l` with conflicting annotations, so the list was overwritten before use.
sample_lines: list[str] = ["It is good to be good, '25000'", "thank you Lord, '45294'"]
sample_metadata: dict[str, Any] = {"name": "my name"}

# Scratch check of the joblib writer; creates `file.joblib` in the working
# directory. A Path is passed to match the declared parameter type.
write_pickle(sample_metadata, Path("file.joblib"))

In [None]:
# End whatever run is currently active (e.g. one left open by an earlier,
# interrupted execution of this cell) before starting a new one.
mlflow.end_run()

mlflow.set_experiment(experiment_name)
with mlflow.start_run(run_name=_get_run_name()) as run:
    # Log the model
    mlflow.sklearn.log_model(sk_model=rf_reg, input_example=X_test, name="rf_reg")

    # Log the metrics (hard-coded placeholder values, not computed from rf_reg)
    mlflow.log_metrics(
        {"mse": 45.0, "rmse": 5.0, "mae": 4.0, "r2": 0.8, "msle": 0.1, "medae": 3.0}
    )

    # Log the hyperparameter
    mlflow.log_params(params=params)

    # Log plots
    # mlflow.log_figure(fig1, "time_series_demand.png")
    # mlflow.log_figure(fig2, "box_weekend.png")

    # Log artifacts saved in the local file system.
    # `log_mlflow_artifact` now takes (object, object_type, filename, ...), so
    # an existing file on disk has to go through the path-based v1 helper;
    # calling the new function with a path raised TypeError.
    log_mlflow_artifact_v1(
        "./plots/correlation_wf_target.png", artifact_dest="plots"
    )
    # log_mlflow_artifact_v1("./my_metadata.json", artifact_dest="metadata")

    # Log an in-memory object, serialised as YAML, at the artifact root
    log_mlflow_artifact(
        object={"name": "wuraola", "role": "medical doctor"},
        object_type=ArtifactsType.YAML,
        filename="my_metadata",
        artifact_dest=None,
    )



🏃 View run run_2025-10-02T13:08:43 at: http://0.0.0.0:6060/#/experiments/767392527681543348/runs/a84f7d5da017465fa64e425604bd17e1
🧪 View experiment at: http://0.0.0.0:6060/#/experiments/767392527681543348


# Dataset Preparation

### Load data

In [None]:
# NOTE(review): this relative path climbs four levels out of the working
# directory, so it only resolves on a machine with the same folder layout —
# consider a configurable data directory.
fp: str = "../../../../Documents/data_dump/bike_data/database.parquet"
data: pl.DataFrame = pl.read_parquet(fp)
console.print(f"Shape: {data.shape}", style="info")

# Preview the first rows
data.head()

### Validate Data

In [None]:
from src.ml.feature_engineering import (
    FeatureConfig,
    FeatureEngineer,
    InteractionFeats,
    Lags,
    Windows,
    create_lag_features,
)

# train_data, test_data = split_temporal_data(data, test_size=0.2)
# train_data.shape, test_data.shape

# Quick check of the lag helper on a pandas copy of the data wrapped with
# narwhals, using lags 1-3 of the `cnt` column.
create_lag_features(
    nw.from_native(data.to_pandas()), target_col="cnt", lags=[1, 2, 3]
).head()  # .to_native()

In [None]:
# Small trial configuration: lags, differences, pairwise interactions and
# rolling windows over `cnt` and `temp`.
# NOTE(review): `cnt` is listed in `drop_feats` and is also `target_col`;
# how FeatureEngineer reconciles the two is not visible here — confirm.
config: FeatureConfig = FeatureConfig(
    lags=[Lags(feature="cnt", lags=[1, 2, 3]), Lags(feature="temp", lags=[1, 2, 3])],
    diffs=[Lags(feature="cnt", lags=[1]), Lags(feature="temp", lags=[1])],
    interactions=[
        InteractionFeats(feature_1="cnt", feature_2="temp", operation="add"),
        InteractionFeats(feature_1="cnt", feature_2="temp", operation="multiply"),
    ],
    rolling_windows=[
        Windows(feature="cnt", windows=[3, 7]),
        Windows(feature="temp", windows=[3, 7]),
    ],
    drop_feats=["atemp", "windspeed", "cnt"],
    target_col="cnt",
)

# # Lags
# for lag in config.lags:
#     df = create_lag_features(
#         nw.from_native(data.to_pandas()), target_col=lag.feature, lags=lag.lags
#     )
# # Diffs
# for diff in config.diffs:
#     df = create_difference_features(
#         nw.from_native(df.to_pandas()), target_col=diff.feature, lags=diff.lags
#     )
# # Interactions
# for interaction in config.interactions:
#     df = create_interaction_features(
#         nw.from_native(df.to_pandas()),
#         feature_1=interaction.feature_1,
#         feature_2=interaction.feature_2,
#         operation=interaction.operation,
#     )
# df.head()

In [None]:
# Build the engineer from the raw data and the configuration defined above.
feat_eng = FeatureEngineer(data, config)

# Sum the per-column null counts of the engineered frame into one total.
# Assumes create_all_features returns a polars DataFrame — TODO confirm.
feat_eng.create_all_features().null_count().sum_horizontal()

In [None]:
# Toy frame used to see what a shift of 0 does.
df = pl.DataFrame({"age": [25, 30, 35], "salary": [50000, 60000, 70000]})

# A shift by 0 positions leaves the values in place, so `age_shift_0` should be
# a copy of `age`.
df = df.with_columns(pl.col("age").shift(0).alias("age_shift_0"))

In [None]:
# Configuration used for modelling; it replaces the trial `config` defined in
# an earlier cell.
# NOTE(review): the `cnt` lags include 0, i.e. the unshifted target value.
# Confirm how FeatureEngineer treats it (e.g. as the target column) so it does
# not leak into the feature matrix.
config: FeatureConfig = FeatureConfig(
    lags=[
        Lags(feature="cnt", lags=[0, 1, 24]),
        Lags(feature="hr", lags=[1, 24]),
        Lags(feature="temp", lags=[1, 3]),
        Lags(feature="hum", lags=[1, 3]),
    ],
    diffs=[
        Lags(feature="cnt", lags=[1, 2]),
        Lags(feature="hr", lags=[1, 24]),
        Lags(feature="temp", lags=[1, 2, 24]),
        Lags(feature="hum", lags=[1, 2]),
    ],
    interactions=[
        InteractionFeats(feature_1="temp", feature_2="hum", operation="add"),
        InteractionFeats(feature_1="hum", feature_2="hr", operation="add"),
    ],
    rolling_windows=[
        Windows(feature="temp", windows=[3, 6]),
        Windows(feature="hum", windows=[3, 6]),
    ],
    drop_feats=["yr", "atemp", "casual", "registered", "datetime", "cnt"],
    target_col="cnt",
)

# Rebuild the engineer with the new configuration (rebinds `feat_eng`).
feat_eng = FeatureEngineer(data, config)

# Engineered frame consumed by the train/test split further down.
prepr_data: pl.DataFrame = feat_eng.create_all_features()
prepr_data.head()

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from src.ml.utils import compute_metrics, split_temporal_data

In [None]:
# Split with split_temporal_data's default settings (no test size is passed).
train_df, test_df = split_temporal_data(prepr_data)
# The label column is named `target` here — presumably created by
# FeatureEngineer from `target_col`; confirm. Everything else is a feature.
x_train = train_df.drop("target").to_numpy()
y_train = train_df["target"].to_numpy()

x_test = test_df.drop("target").to_numpy()
y_test = test_df["target"].to_numpy()

# Sanity check: the row counts of x and y should match within each split.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

In [None]:
# Expanding-window cross-validation on the training portion only.
n_splits: int = 5
test_size: int = 168  # 1 week of hourly data

tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size, gap=0)
# Rebinds `rf_reg`, replacing the estimator fitted in the MLflow demo above.
rf_reg = RandomForestRegressor(random_state=123)
# Per-fold metric values, averaged after the loop.
all_rmse: list[float] = []
all_mae: list[float] = []
all_mape: list[float] = []
print(tscv)

for i, (train_index, test_index) in enumerate(tscv.split(x_train), start=1):
    print(f"Fold {i}:")
    x_tr, x_val = x_train[train_index], x_train[test_index]
    y_tr, y_val = y_train[train_index], y_train[test_index]
    # Train the model (the same estimator object is refitted on every fold)
    rf_reg.fit(x_tr, y_tr)
    # Evaluate the model
    y_pred = rf_reg.predict(x_val)
    metrics = compute_metrics(y_val, y_pred)
    print(f"Validation Metrics: {metrics}")
    # NOTE(review): assuming `metrics` is a dict, `.get` yields None for a
    # missing key and np.mean below would then fail — confirm compute_metrics
    # always returns "RMSE", "MAE" and "MAPE".
    all_rmse.append(metrics.get("RMSE"))
    all_mae.append(metrics.get("MAE"))
    all_mape.append(metrics.get("MAPE"))

# Mean of each metric across the folds, rounded to 2 decimals.
print("\nCross-Validation Results:")
print(f"Average MAE over {n_splits} folds: {np.mean(all_mae).round(2)}")
print(f"Average RMSE over {n_splits} folds: {np.mean(all_rmse).round(2)}")
print(f"Average MAPE over {n_splits} folds: {np.mean(all_mape).round(2)}")

In [None]:
# Refit on the full training set before the final evaluation. After the
# cross-validation loop `rf_reg` only holds the fit from the last fold, whose
# training window excludes the final validation block of the training data.
rf_reg.fit(x_train, y_train)

# Evaluate on the test set
y_test_pred = rf_reg.predict(x_test)
metrics_test = compute_metrics(y_test, y_test_pred)
print(f"\nTest Set Metrics: {metrics_test}")