# Lab For Experimentation

In [1]:
import warnings
from typing import Any, Literal

import narwhals as nw
import numpy as np
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """This is used to up a number of directories.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current directory.

    Returns:
    --------
    None
    """
    import os
    import sys

    CONST: str = "../"
    NUM: str = CONST * go_up

    # Goto the previous directory
    prev_directory = os.path.join(os.path.dirname(__name__), NUM)
    # Get the 'absolute path' of the previous directory
    abs_path_prev_directory = os.path.abspath(prev_directory)

    # Add the path to the System paths
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)


# Demo (Prevents ruff from removing the unused module import)
name: Any
category: Literal["A", "B", "C"]

In [3]:
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


# Dataset Preparation

### Load data

In [4]:
fp: str = "../../../../Documents/data_dump/bike_data/database.parquet"
data: pl.DataFrame = pl.read_parquet(fp)
console.print(f"Shape: {data.shape}", style="info")

data.head()

datetime,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64
"""2011-01-01 00:00:00""",1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
"""2011-01-01 01:00:00""",1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
"""2011-01-01 02:00:00""",1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
"""2011-01-01 03:00:00""",1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
"""2011-01-01 04:00:00""",1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


### Validate Data

In [61]:
from src.ml.feature_engineering import (
    FeatureConfig,
    FeatureEngineer,
    InteractionFeats,
    Lags,
    Windows,
    create_lag_features,
)

# train_data, test_data = split_temporal_data(data, test_size=0.2)
# train_data.shape, test_data.shape

create_lag_features(
    nw.from_native(data.to_pandas()), target_col="cnt", lags=[1, 2, 3]
).head()  # .to_native()

┌─────────────────────────────────────────────────────────────────────────────┐
|                             Narwhals DataFrame                              |
|-----------------------------------------------------------------------------|
|              datetime  season  yr  mnth  hr  holiday  weekday  workingday  \|
|0  2011-01-01 00:00:00       1   0     1   0        0        6           0   |
|1  2011-01-01 01:00:00       1   0     1   1        0        6           0   |
|2  2011-01-01 02:00:00       1   0     1   2        0        6           0   |
|3  2011-01-01 03:00:00       1   0     1   3        0        6           0   |
|4  2011-01-01 04:00:00       1   0     1   4        0        6           0   |
|                                                                             |
|   weathersit  temp   atemp   hum  windspeed  casual  registered  cnt  \     |
|0           1  0.24  0.2879  0.81        0.0       3          13   16        |
|1           1  0.22  0.2727  0.80      

In [None]:
config: FeatureConfig = FeatureConfig(
    lags=[Lags(feature="cnt", lags=[1, 2, 3]), Lags(feature="temp", lags=[1, 2, 3])],
    diffs=[Lags(feature="cnt", lags=[1]), Lags(feature="temp", lags=[1])],
    interactions=[
        InteractionFeats(feature_1="cnt", feature_2="temp", operation="add"),
        InteractionFeats(feature_1="cnt", feature_2="temp", operation="multiply"),
    ],
    rolling_windows=[
        Windows(feature="cnt", windows=[3, 7]),
        Windows(feature="temp", windows=[3, 7]),
    ],
    drop_feats=["atemp", "windspeed", "cnt"],
    target_col="cnt",
)

# # Lags
# for lag in config.lags:
#     df = create_lag_features(
#         nw.from_native(data.to_pandas()), target_col=lag.feature, lags=lag.lags
#     )
# # Diffs
# for diff in config.diffs:
#     df = create_difference_features(
#         nw.from_native(df.to_pandas()), target_col=diff.feature, lags=diff.lags
#     )
# # Interactions
# for interaction in config.interactions:
#     df = create_interaction_features(
#         nw.from_native(df.to_pandas()),
#         feature_1=interaction.feature_1,
#         feature_2=interaction.feature_2,
#         operation=interaction.operation,
#     )
# df.head()

In [7]:
feat_eng = FeatureEngineer(data, config)

feat_eng.create_all_features().null_count().sum_horizontal()

sum
u32
0


In [9]:
df = pl.DataFrame({"age": [25, 30, 35], "salary": [50000, 60000, 70000]})

df = df.with_columns(pl.col("age").shift(0).alias("age_shift_0"))

In [62]:
config: FeatureConfig = FeatureConfig(
    lags=[
        Lags(feature="cnt", lags=[0, 1, 24]),
        Lags(feature="hr", lags=[1, 24]),
        Lags(feature="temp", lags=[1, 3]),
        Lags(feature="hum", lags=[1, 3]),
    ],
    diffs=[
        Lags(feature="cnt", lags=[1, 2]),
        Lags(feature="hr", lags=[1, 24]),
        Lags(feature="temp", lags=[1, 2, 24]),
        Lags(feature="hum", lags=[1, 2]),
    ],
    interactions=[
        InteractionFeats(feature_1="temp", feature_2="hum", operation="add"),
        InteractionFeats(feature_1="hum", feature_2="hr", operation="add"),
    ],
    rolling_windows=[
        Windows(feature="temp", windows=[3, 6]),
        Windows(feature="hum", windows=[3, 6]),
    ],
    drop_feats=["yr", "atemp", "casual", "registered", "datetime", "cnt"],
    target_col="cnt",
)

feat_eng = FeatureEngineer(data, config)

prepr_data: pl.DataFrame = feat_eng.create_all_features()
prepr_data.head()

season,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,windspeed,is_weekend,sin_hour,cos_hour,sin_weekday,cos_weekday,cnt_lag_0hr,cnt_lag_1hr,cnt_lag_24hr,hr_lag_1hr,hr_lag_24hr,temp_lag_1hr,temp_lag_3hr,hum_lag_1hr,hum_lag_3hr,temp_rolling_mean_3hr,temp_rolling_median_3hr,temp_rolling_mean_6hr,temp_rolling_median_6hr,hum_rolling_mean_3hr,hum_rolling_median_3hr,hum_rolling_mean_6hr,hum_rolling_median_6hr,temp_plus_hum,hum_plus_hr,cnt_diff_1hr,cnt_diff_2hr,hr_diff_1hr,hr_diff_24hr,temp_diff_1hr,temp_diff_2hr,temp_diff_24hr,hum_diff_1hr,hum_diff_2hr,is_high_temp,is_high_hum,is_peak_hour,is_working_hour,is_business_hour,target
i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,i8,f64,f64,f64,f64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,f64,f64,f64,f64,i8,i8,i8,i8,i8,i64
1,1,0,0,6,0,1,0.24,0.81,0.0,1,0.0,1.0,-0.781831,0.62349,16,16,16,0,0,0.24,0.24,0.81,0.81,0.226667,0.22,0.233333,0.24,0.803333,0.8,0.776667,0.775,1.05,0.81,24,16,1,0,-0.02,-0.02,0.22,-0.01,-0.01,0,0,0,0,0,40
1,1,1,0,6,0,1,0.22,0.8,0.0,1,0.258819,0.965926,-0.781831,0.62349,40,16,16,0,0,0.24,0.24,0.81,0.81,0.226667,0.22,0.233333,0.24,0.803333,0.8,0.776667,0.775,1.02,1.8,24,16,1,0,-0.02,-0.02,0.22,-0.01,-0.01,0,0,0,0,0,32
1,1,2,0,6,0,1,0.22,0.8,0.0,1,0.5,0.866025,-0.781831,0.62349,32,40,16,1,0,0.22,0.24,0.8,0.81,0.226667,0.22,0.233333,0.24,0.803333,0.8,0.776667,0.775,1.02,2.8,-8,16,1,0,0.0,-0.02,0.22,0.0,-0.01,0,0,0,0,0,13
1,1,3,0,6,0,1,0.24,0.75,0.0,1,0.707107,0.707107,-0.781831,0.62349,13,32,16,2,0,0.22,0.24,0.8,0.81,0.226667,0.22,0.233333,0.24,0.783333,0.8,0.776667,0.775,0.99,3.75,-19,-27,1,0,0.02,0.02,0.22,-0.05,-0.05,0,0,0,0,0,1
1,1,4,0,6,0,1,0.24,0.75,0.0,1,0.866025,0.5,-0.781831,0.62349,1,13,16,3,0,0.24,0.22,0.75,0.8,0.233333,0.24,0.233333,0.24,0.766667,0.75,0.776667,0.775,0.99,4.75,-12,-31,1,0,0.0,0.02,0.22,0.0,-0.05,0,0,0,0,0,1


In [55]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from src.ml.utils import compute_metrics, split_temporal_data

In [56]:
train_df, test_df = split_temporal_data(prepr_data)
x_train = train_df.drop("target").to_numpy()
y_train = train_df["target"].to_numpy()

x_test = test_df.drop("target").to_numpy()
y_test = test_df["target"].to_numpy()

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(11122, 48) (11122,) (2781, 48) (2781,)


In [57]:
n_splits: int = 5
test_size: int = 168  # 1 week of hourly data

tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size, gap=0)
rf_reg = RandomForestRegressor(random_state=123)
all_rmse: list[float] = []
all_mae: list[float] = []
all_mape: list[float] = []
print(tscv)

for i, (train_index, test_index) in enumerate(tscv.split(x_train), start=1):
    print(f"Fold {i}:")
    x_tr, x_val = x_train[train_index], x_train[test_index]
    y_tr, y_val = y_train[train_index], y_train[test_index]
    # Train the model
    rf_reg.fit(x_tr, y_tr)
    # Evaluate the model
    y_pred = rf_reg.predict(x_val)
    metrics = compute_metrics(y_val, y_pred)
    print(f"Validation Metrics: {metrics}")
    all_rmse.append(metrics.get("RMSE"))
    all_mae.append(metrics.get("MAE"))
    all_mape.append(metrics.get("MAPE"))

print("\nCross-Validation Results:")
print(f"Average MAE over {n_splits} folds: {np.mean(all_mae).round(2)}")
print(f"Average RMSE over {n_splits} folds: {np.mean(all_rmse).round(2)}")
print(f"Average MAPE over {n_splits} folds: {np.mean(all_mape).round(2)}")

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=168)
Fold 1:
Validation Metrics: {'MAE': 33.59, 'RMSE': 55.99, 'MAPE': 28.95}
Fold 2:
Validation Metrics: {'MAE': 52.54, 'RMSE': 80.85, 'MAPE': 24.67}
Fold 3:
Validation Metrics: {'MAE': 34.86, 'RMSE': 59.87, 'MAPE': 27.01}
Fold 4:
Validation Metrics: {'MAE': 41.42, 'RMSE': 62.99, 'MAPE': 28.05}
Fold 5:
Validation Metrics: {'MAE': 28.04, 'RMSE': 41.6, 'MAPE': 25.45}

Cross-Validation Results:
Average MAE over 5 folds: 38.09
Average RMSE over 5 folds: 60.26
Average MAPE over 5 folds: 26.83


In [58]:
# Evaluate on the test set
y_test_pred = rf_reg.predict(x_test)
metrics_test = compute_metrics(y_test, y_test_pred)
print(f"\nTest Set Metrics: {metrics_test}")


Test Set Metrics: {'MAE': 41.74, 'RMSE': 66.65, 'MAPE': 24.1}
