# Lab For Experimentation

In [1]:
import warnings
from typing import Any, Literal

import numpy as np
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named Rich styles; cells below select them via `console.print(..., style=...)`.
custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
# Shared console used by later cells for styled output.
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

# NOTE(review): this suppresses every warning for the whole kernel session,
# including deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of `sys.path`.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working
        directory. Values of 0 or less select the working directory itself.

    Returns:
    --------
    None
        The resolved absolute path is printed and placed at `sys.path[0]`.
    """
    import os
    import sys

    # Anchor on the working directory explicitly. `__name__` is a module name
    # (e.g. "__main__"), not a file path, so it cannot serve as the anchor.
    parent_steps = [os.pardir] * max(go_up, 0)
    abs_path_prev_directory = os.path.abspath(
        os.path.join(os.getcwd(), *parent_steps)
    )

    # Re-running the cell must not pile up duplicate entries: drop any stale
    # copy first, then give the path top priority.
    while abs_path_prev_directory in sys.path:
        sys.path.remove(abs_path_prev_directory)
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)


# Demo (Prevents ruff from removing the unused module import)
# These are bare annotations: they reference `Any` and `Literal` but do not
# bind `name` or `category` to any value.
name: Any
category: Literal["A", "B", "C"]

In [3]:
# Put the parent of the working directory on `sys.path` (the resolved path is
# printed) ahead of the `src.*` import in the next code cell.
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


# Table of Contents
- [Dataset Preparation](#Dataset-Preparation)
  - [Load Data](#load-data)
  - [Validate Data](#validate-data)
- [Baseline Model](#Baseline-Model)
- [Baseline Model With Lagged Target Features](#baseline-model-with-lagged-target-features)
- [Add More Features](#add-more-features)
- [Gradient Boosting With Hyperparameters Tuning](#Gradient-Boosting-with-Hyperparameters-Tuning)
- [Conclusions](#Conclusions)

In [4]:
from src.utilities.data_validator import data_validator

# Dataset Preparation

### Load data

In [5]:
# NOTE(review): this path climbs four levels above the working directory into
# a personal folder, so the notebook only runs on a machine with that exact
# layout — consider a configurable data directory.
fp: str = "../../../../Documents/data_dump/bike_data/database.parquet"
data: pl.DataFrame = pl.read_parquet(fp)
console.print(f"Shape: {data.shape}", style="info")

data.head()

datetime,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64
"""2011-01-01 00:00:00""",1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
"""2011-01-01 01:00:00""",1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
"""2011-01-01 02:00:00""",1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
"""2011-01-01 03:00:00""",1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
"""2011-01-01 04:00:00""",1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


### Validate Data

In [6]:
# Validate the data
data_report: dict[str, Any] = data_validator(data)
console.print(data_report)

In [7]:
# Show the first rows of the raw frame again, next to the column-drop notes below.
data.head()

datetime,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64
"""2011-01-01 00:00:00""",1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
"""2011-01-01 01:00:00""",1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
"""2011-01-01 02:00:00""",1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
"""2011-01-01 03:00:00""",1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
"""2011-01-01 04:00:00""",1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


### Comment

- Drop the columns with high cardinality
    - `datetime`

- Drop irrelevant columns
    - `casual` (used to create the target)
    - `registered` (used to create the target)
    - `atemp` (highly correlated with `temp`) 
    - `yr` (not relevant)


In [8]:
# Summary statistics for the columns that are about to be dropped.
# Entries are looked up by position, assuming `numeric` follows the frame's
# column order with the string `datetime` column excluded:
#   season, yr, mnth, hr, holiday, weekday, workingday, weathersit, temp,
#   atemp, hum, windspeed, casual, registered, cnt
# TODO(review): confirm this ordering against `data_validator`.
summary_statistics: dict[str, Any] = data_report.get("summary_statistics")
numeric_stats: list[dict[str, Any]] = summary_statistics.get("numeric")
categorical_stats: list[dict[str, Any]] = summary_statistics.get("categorical")

year: dict[str, Any] = numeric_stats[1]
atemp: dict[str, Any] = numeric_stats[9]
# `cnt` is the last numeric column, so `casual` and `registered` sit at
# positions -3 and -2 (positions -2 and -1 would be `registered` and `cnt`).
casual: dict[str, Any] = numeric_stats[-3]
registered: dict[str, Any] = numeric_stats[-2]
datetime: dict[str, Any] = categorical_stats[0]

console.print(
    f"Numeric:\n{year}\n\n{atemp}\n\n{casual}\n\n{registered}\n\nCategorical: \n{datetime}"
)

In [9]:
# Columns listed in the comment above: the high-cardinality `datetime`, the
# two components of the target (`casual`, `registered`), `atemp` and `yr`.
columns_to_drop: list[str] = ["yr", "atemp", "casual", "registered", "datetime"]
# `data` is left untouched (later sections rebuild `df` from it).
df: pl.DataFrame = data.clone().drop(columns_to_drop)

df.head(3)

season,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt
i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,i64
1,1,0,0,6,0,1,0.24,0.81,0.0,16
1,1,1,0,6,0,1,0.22,0.8,0.0,40
1,1,2,0,6,0,1,0.22,0.8,0.0,32


In [34]:
# Name of the raw count column from which the prediction target is derived.
# NOTE(review): none of the later cells shown here reference this variable;
# they use the literals "cnt" and "target" directly.
target_column: str = "cnt"

In [None]:
# Since we want to maintain the temporal order, we'll
# use a custom function for splitting the data

# It's assumed the data has been sorted in ascending order of time.
# Exploratory version of the split: the first 90% of rows form the training
# slice and the remaining 10% form the test slice (no validation set is
# carved out in this cell).
train_size: float = 0.9
data_array: np.ndarray = df.to_numpy()
train_array: np.ndarray = data_array[: int(train_size * data_array.shape[0])]
test_array: np.ndarray = data_array[train_array.shape[0] :]

train_array.shape, test_array.shape
# The ratio is converted to an integer row count above and used as the slice boundary.

In [None]:
# Import from the public `sklearn.metrics` namespace; `_regression` is a
# private module whose location is not part of the supported API.
from sklearn.metrics import mean_absolute_error, root_mean_squared_error


def split_temporal_data(
    data: pl.DataFrame, test_size: float = 0.2
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Split a time-ordered frame into leading (train) and trailing (test) rows.

    The rows are never shuffled: the first `int((1 - test_size) * n_rows)`
    rows become the training frame and the remaining rows the test frame.
    Both frames keep the input's column names and dtypes.

    Parameters
    ----------
    data : pl.DataFrame
        Frame whose rows are already sorted in ascending order of time.
    test_size : float, default=0.2
        Fraction of rows (between 0 and 1) reserved for the test frame.

    Returns
    -------
    tuple[pl.DataFrame, pl.DataFrame]
        `(train, test)` frames.

    Raises
    ------
    ValueError
        If `test_size` is outside the closed interval [0, 1].
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    n_train: int = int((1 - test_size) * data.height)

    # Slice the frame directly instead of round-tripping through a NumPy
    # array, which would coerce every column to one common dtype.
    return data.slice(0, n_train), data.slice(n_train)


def compute_metrics(
    y_true: np.ndarray | list, y_pred: np.ndarray | list
) -> dict[str, float]:
    """
    Score predictions against ground truth with three regression metrics.

    Parameters:
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Model predictions, aligned element-wise with `y_true`.

    Returns:
    -------
    dict
        Keys 'MAE' (mean absolute error), 'RMSE' (root mean squared error)
        and 'MAPE' (mean absolute percentage error, in %), each rounded to
        two decimal places.
    """
    # MAE and RMSE are delegated to scikit-learn and receive the inputs as given.
    scores: dict[str, float] = {
        "MAE": round(mean_absolute_error(y_true, y_pred), 2),
        "RMSE": round(root_mean_squared_error(y_true, y_pred), 2),
    }

    actual = np.array(y_true)
    predicted = np.array(y_pred)

    # Zero targets would cause a division by zero, so they are replaced by
    # 0.01 in the denominator only.
    denominator = np.where(actual == 0, 0.01, actual)
    absolute_ratio = np.abs((actual - predicted) / denominator)
    mape: float = (np.mean(absolute_ratio) * 100).item()

    scores["MAPE"] = round(mape, 2)
    return scores

### Target Definition

- We'll use the next hour `cnt` as the target variable to predict bike rentals.
- This is because bike rental services typically operate on an hourly basis, and predicting the next hour's demand can help in resource allocation and planning.
- We'll shift the `cnt` column by one hour to create the target variable, ensuring that our model learns to predict future demand based on current and past data.
- i.e. `df["target"] = df["cnt"].shift(-1)` if `cnt`=[5, 10, 15, 20] then `target`=[10, 15, 20, NaN]
  - i.e. instead of predicting the current hour's demand, we are training the model to predict the demand for the next hour based on the current and past data.

In [None]:
# Build the next-hour target: row t receives `cnt` from row t+1. The final row
# has no successor, so its null is forward-filled from the row above, which
# makes the last target equal to the last observed `cnt`.
df = df.with_columns(pl.col("cnt").shift(-1).alias("target")).with_columns(
    pl.col("target").fill_null(strategy="forward")
)

df.head(3)

In [None]:
# Show per-column null counts after the forward fill of `target`.
df.null_count()

In [None]:
# Remove the current-hour count so this first baseline sees only the calendar
# and weather columns.
# NOTE(review): `df` is reassigned in place — re-running this cell on the same
# kernel state presumably fails because `cnt` is already gone.
df = df.drop("cnt")

# Chronological split; the default `test_size=0.2` is used.
train_df, test_df = split_temporal_data(df)

# Feature matrix = every column except `target`; label vector = `target`.
x_train = train_df.drop("target").to_numpy()
y_train = train_df["target"].to_numpy()

x_test = test_df.drop("target").to_numpy()
y_test = test_df["target"].to_numpy()

x_train.shape, y_train.shape, x_test.shape, y_test.shape

<br>

# Baseline Model

- Create a simple and basic ML model to be used as reference.
- I used a `RandomForestRegressor` because it requires little tuning.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Default hyperparameters; `random_state` fixes the seed so runs are repeatable.
rf_reg = RandomForestRegressor(random_state=123)
rf_reg.fit(x_train, y_train)
# NOTE(review): this score is neither stored nor the cell's last expression,
# so it is never displayed.
rf_reg.score(x_test, y_test)

y_pred = rf_reg.predict(x_test)

In [None]:
# MAE / RMSE / MAPE of the baseline predictions on the held-out rows.
metrics_base = compute_metrics(y_test, y_pred)
metrics_base

### Comment

- The model tends to underpredict during most of the high-peak periods.
- This suggests that the model is underestimating the number of bike rentals.

In [None]:
import plotly.graph_objects as go

# Only the first `n` test rows are drawn so individual peaks stay readable.
# NOTE(review): assumes the test set has at least `n` rows; otherwise the x
# values outnumber the y values.
n: int = 300

fig = go.Figure()

# Add actual values trace
fig.add_trace(
    go.Scatter(
        x=list(range(n)),
        y=y_test[:n],
        mode="lines",
        name="y_test (Actual)",
        line={"color": "blue", "width": 2},
    )
)

# Add predicted values trace
fig.add_trace(
    go.Scatter(
        x=list(range(n)),
        y=y_pred[:n],
        mode="lines",
        name="y_pred (Predicted)",
        line={"color": "red", "width": 2},
    )
)

# Update layout for better visualization
fig.update_layout(
    title="Actual vs Predicted Bike Rental Counts",
    xaxis_title="Time Steps",
    yaxis_title="Bike Rental Count",
    hovermode="x unified",
    template="plotly_white",
    width=800,
    height=500,
)

# Show the interactive plot
fig.show()

### Dummy Model

- Let's create a dummy (persistence) model that predicts the next hour's bike rentals to be the same as the most recently observed hour's rentals, i.e. the target series shifted by one time step.

In [None]:
# Naive forecast: each prediction is the previous element of `y_test` (shift
# by one step). The first slot has no predecessor and is back-filled with
# `y_test[0]`, so it contributes zero error.
y_dummy = pl.Series(y_test).shift(1).fill_null(strategy="backward").to_numpy()
y_dummy

In [None]:
# Score the naive forecast with the same metrics as the baseline model.
metrics_dummy: dict[str, float] = compute_metrics(y_test, y_dummy)

# Print both metric dictionaries one after the other for comparison.
console.print(f"Metrics (Base Model): {metrics_base}", style="info")
console.print(f"Metrics (Dummy Model): {metrics_dummy}", style="info")

### Comment

- The baseline model's performance is not a significant improvement over the dummy model. 

- While the Mean Absolute Percentage Error (`MAPE`) improved notably, the Mean Absolute Error (`MAE`) and other metrics show little change, suggesting that feature engineering is necessary to improve the model.

In [None]:
def pct_change(old: float, new: float) -> float:
    """Calculate the signed percentage change from old to new value.

    Parameters
    ----------
    old : float
        Reference value the change is measured against.
    new : float
        Value being compared with `old`.

    Returns
    -------
    float
        `(new - old) / |old| * 100`. Positive when `new` is larger than
        `old`, negative when it is smaller. When `old` is 0 the result is
        0.0 if `new` is also 0, otherwise +inf or -inf depending on the
        direction of the change.
    """
    delta = new - old
    if old == 0:
        # Avoid division by zero: no change is 0%, any other change is unbounded.
        if delta == 0:
            return 0.0
        return float("inf") if delta > 0 else float("-inf")
    # Divide by the magnitude of `old` so the sign always reflects the
    # direction of the change, even for a negative reference value.
    return (delta / abs(old)) * 100


# `old` is always the baseline model and `new` the dummy model, so each figure
# is measured relative to the baseline's error.
mae_base, mae_dummy = metrics_base.get("MAE"), metrics_dummy.get("MAE")
rmse_base, rmse_dummy = metrics_base.get("RMSE"), metrics_dummy.get("RMSE")
mape_base, mape_dummy = metrics_base.get("MAPE"), metrics_dummy.get("MAPE")

console.print(
    f"MAE Change: {pct_change(old=mae_base, new=mae_dummy):.2f}%", style="highlight"
)
console.print(
    f"RMSE Change: {pct_change(old=rmse_base, new=rmse_dummy):.2f}%", style="highlight"
)
console.print(
    f"MAPE Change: {pct_change(old=mape_base, new=mape_dummy):.2f}%", style="highlight"
)

# Baseline Model With Lagged Target Features

- Lagged features are past values of a time series used to capture autocorrelation (i.e. the relationship between a data point and its previous values). 

- This is based on the idea that the past influences the future.

- By including lagged features, the model gains a historical context that helps it identify:

  - `Trends`: Long-term upward or downward movements.

  - `Seasonality`: Regular, repeating patterns, such as daily or weekly cycles.

  - `Temporal dependencies`: How a value at one time step is directly related to its value at a previous time step.

In [None]:
import narwhals as nw


def _calculate_corr(x: list[float] | np.ndarray, y: list[float] | np.ndarray) -> float:
    return np.corrcoef(x, y)[0][1].item()


def compute_autocorrelation(
    series: pl.Series, max_lag: int = 24
) -> dict[int, float | None]:
    """
    Compute autocorrelation for a time series using Polars.

    For every lag from 1 to `max_lag` the series is paired with a copy of
    itself shifted by that lag, rows containing a null on either side are
    dropped, and the correlation of the remaining pairs is computed.

    Parameters
    ----------
    series : pl.Series
        The time series data.
    max_lag : int, default=24
        Maximum lag to compute autocorrelation for.

    Returns
    -------
    dict[int, float | None]
        Dictionary mapping lag to autocorrelation value. The value is None
        when fewer than two paired rows remain for that lag or when the
        computation raised an error (the error is printed to the console).
    """
    autocorr_values: dict[int, float | None] = {}

    for lag in range(1, max_lag + 1):
        try:
            # Pair each observation with the one `lag` steps earlier; the
            # shift leaves nulls at the start, which `drop_nulls` removes.
            paired: pl.DataFrame = pl.DataFrame(
                {"original": series, "lagged": series.shift(lag)}
            ).drop_nulls()

            # A correlation needs at least two paired observations.
            if paired.height > 1:
                autocorr_values[lag] = _calculate_corr(
                    paired["original"].to_numpy(), paired["lagged"].to_numpy()
                )
            else:
                autocorr_values[lag] = None

        except Exception as e:
            # Best effort: report the failing lag and continue with the rest.
            console.print(f"Error computing lag {lag}: {e}", style="error")
            autocorr_values[lag] = None

    return autocorr_values

In [None]:
# Compute autocorrelation of the target variable
auto_correlation: dict[int, float] = compute_autocorrelation(
    series=df["target"], max_lag=24
)
for lag, autocorr in auto_correlation.items():
    console.print(f"Lag {lag}: Autocorrelation = {autocorr:.3f}", style="info")

### Comment

- **Strong Hourly Dependency**: The high correlation at Lag 1 (r=0.845) shows that bike rentals are highly dependent on the previous hour's count.

- **Clear Daily Seasonality**: The strong correlation at Lag 24 (r=0.809) confirms a predictable daily pattern in bike usage.

- **Inverse Peak/Off-Peak Relationship**: The negative correlations between 6 and 18 hours apart indicate that peak and off-peak periods are inversely related.

- **Modeling Strategy**: Lagged features (especially 1, 2, 23, and 24 hours) and time-of-day features will be essential for building an effective model.

In [None]:
# Start again from the raw frame: the previous section dropped `cnt`, which
# the lag features below are built from.
columns_to_drop: list[str] = ["yr", "atemp", "casual", "registered", "datetime"]
df: pl.DataFrame = data.clone().drop(columns_to_drop)

df.head(3)

In [None]:
# Shift the `cnt` column by one hour to create the target variable
# i.e. predict the next hour's demand based on current and past data
# The last row has no next hour; its null target is forward-filled from the
# row above, i.e. it ends up equal to the last observed `cnt`.
df = df.with_columns(pl.col("cnt").shift(-1).alias("target")).with_columns(
    pl.col("target").fill_null(strategy="forward")
)
df.head()

In [None]:
# Add lags 1, 23 and 24 hours
df = df.with_columns(
    [
        pl.col("cnt").alias("current"),
        pl.col("cnt").shift(1).alias("lag_1"),
        pl.col("cnt").shift(23).alias("lag_23"),
        pl.col("cnt").shift(24).alias("lag_24"),
    ]
).fill_null(strategy="backward")
print("Check for null values after adding lag features:")
display(df.null_count())
df.head()

In [None]:
# `cnt` is dropped, but its value stays available to the model through the
# `current` column added together with the lag features.
df = df.drop(["cnt"])

# Chronological split; the default `test_size=0.2` is used.
train_df, test_df = split_temporal_data(df)

# Feature matrix = every column except `target`; label vector = `target`.
x_train = train_df.drop("target").to_numpy()
y_train = train_df["target"].to_numpy()

x_test = test_df.drop("target").to_numpy()
y_test = test_df["target"].to_numpy()

x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# Same estimator and seed as the first baseline; only the feature set differs.
rf_reg = RandomForestRegressor(random_state=123)
rf_reg.fit(x_train, y_train)
# NOTE(review): this score is neither stored nor the cell's last expression,
# so it is never displayed.
rf_reg.score(x_test, y_test)

y_pred = rf_reg.predict(x_test)

In [None]:
# MAE / RMSE / MAPE of the model trained with the `cnt` lag features.
metrics_base_with_lagged_feats = compute_metrics(y_test, y_pred)
metrics_base_with_lagged_feats

In [None]:
# Only the first `n` test rows are drawn so individual peaks stay readable.
# `go` is the plotly module imported in the earlier plotting cell.
n: int = 300

fig = go.Figure()

# Add actual values trace
fig.add_trace(
    go.Scatter(
        x=list(range(n)),
        y=y_test[:n],
        mode="lines",
        name="y_test (Actual)",
        line={"color": "blue", "width": 2},
    )
)

# Add predicted values trace
fig.add_trace(
    go.Scatter(
        x=list(range(n)),
        y=y_pred[:n],
        mode="lines",
        name="y_pred (Predicted with Lagged Features)",
        line={"color": "red", "width": 2},
    )
)

# Update layout for better visualization
fig.update_layout(
    title="Actual vs Predicted Bike Rental Counts",
    xaxis_title="Time Steps",
    yaxis_title="Bike Rental Count",
    hovermode="x unified",
    template="plotly_white",
    width=800,
    height=500,
)

# Show the interactive plot
fig.show()

In [None]:
# Print the three metric dictionaries computed so far, one after the other.
console.print(f"Metrics (Base Model): {metrics_base}", style="info")
console.print(f"Metrics (Dummy Model): {metrics_dummy}", style="info")
console.print(
    f"Metrics (Base Model with Lagged Features): {metrics_base_with_lagged_feats}",
    style="info",
)

In [None]:
# NOTE(review): `mae_base` / `rmse_base` / `mape_base` are rebound here to the
# lagged-feature model's metrics, overwriting the first baseline's values.
mae_base, mae_dummy = (
    metrics_base_with_lagged_feats.get("MAE"),
    metrics_dummy.get("MAE"),
)
rmse_base, rmse_dummy = (
    metrics_base_with_lagged_feats.get("RMSE"),
    metrics_dummy.get("RMSE"),
)
mape_base, mape_dummy = (
    metrics_base_with_lagged_feats.get("MAPE"),
    metrics_dummy.get("MAPE"),
)

# `old` is the lagged-feature model and `new` the dummy model in every call.
console.print(
    f"MAE Change: {pct_change(old=mae_base, new=mae_dummy):.2f}%", style="highlight"
)
console.print(
    f"RMSE Change: {pct_change(old=rmse_base, new=rmse_dummy):.2f}%", style="highlight"
)
console.print(
    f"MAPE Change: {pct_change(old=mape_base, new=mape_dummy):.2f}%", style="highlight"
)

# Add More Features

- Add `temporal` features
  - `dayofweek`
  - `month`
  - `is_weekend`
  - `is_holiday`

- Add `seasonal` features
  - spring, summer, fall, winter
  - `sin_hour`, `cos_hour`, `sin_day_of_week`, `cos_day_of_week`

- Add `trend` features
  - `cnt_diff_1hr`, `cnt_diff_3hr`, `cnt_diff_6hr`, `cnt_diff_24hr`

- Add `lagged` features
  - `cnt_lag_1`, `cnt_lag_2`, `cnt_lag_23`, `cnt_lag_24`
  - `hr_lag_1`, `hr_lag_2`

- Add `exogenous` (not time-based) features
  - `temp`, `hum`, `windspeed`, `weather`
  - `holiday`, `event`
  - `location` (if available)

- Add `derived` features
  - `temp_change_1hr`, `temp_change_3hr`
  - Add `rolling` features
    - `cnt_roll_mean_3hr`, `cnt_roll_std_3hr`
    - `cnt_roll_mean_6hr`, `cnt_roll_std_6hr`
    - `cnt_roll_mean_24hr`, `cnt_roll_std_24hr`
  - Add `interaction` features
    - `temp_hum_interaction`, `temp_wind_interaction`
  - Add `binary` features
    - `is_high_temp`, `is_low_hum`, `is_peak_hour`, `is_working_hour`, `is_business_hour`, `is_weekend`

In [None]:
# Start again from the raw frame so this section's features are built from an
# unmodified copy.
columns_to_drop: list[str] = ["yr", "atemp", "casual", "registered", "datetime"]
df: pl.DataFrame = data.clone().drop(columns_to_drop)

# Shift the `cnt` column by one hour to create the target variable
# i.e. predict the next hour's demand based on current and past data
# (the last row's null target is forward-filled from the row above).
df = df.with_columns(pl.col("cnt").shift(-1).alias("target")).with_columns(
    pl.col("target").fill_null(strategy="forward")
)
df.head()

In [None]:
# Add lags 1, 23 and 24 hours
df = df.with_columns(
    [
        # cnt lags
        pl.col("cnt").alias("current"),
        pl.col("cnt").shift(1).alias("cnt_lag_1"),
        pl.col("cnt").shift(23).alias("cnt_lag_23"),
        pl.col("cnt").shift(24).alias("cnt_lag_24"),
        # hr lags
        # pl.col("hr").shift(1).alias("hr_lag_1"),
        pl.col("hum").shift(1).alias("hum_lag_1"),
        pl.col("hum").shift(2).alias("hum_lag_2"),
    ]
).fill_null(strategy="backward")
print("Check for null values after adding lag features:")
display(df.null_count())
df.head()

In [None]:
# `cnt` is dropped, but its value stays available to the model through the
# `current` column added together with the lag features.
df = df.drop(["cnt"])

# Chronological split; the default `test_size=0.2` is used.
train_df, test_df = split_temporal_data(df)

# Feature matrix = every column except `target`; label vector = `target`.
x_train = train_df.drop("target").to_numpy()
y_train = train_df["target"].to_numpy()

x_test = test_df.drop("target").to_numpy()
y_test = test_df["target"].to_numpy()

x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# Same estimator and seed as the earlier runs; only the feature set differs.
rf_reg = RandomForestRegressor(random_state=123)
rf_reg.fit(x_train, y_train)
# NOTE(review): this score is neither stored nor the cell's last expression,
# so it is never displayed.
rf_reg.score(x_test, y_test)

y_pred = rf_reg.predict(x_test)

In [None]:
# NOTE(review): despite the `_hr_` in the name, the extra features in this run
# are humidity (`hum`) lags; the `hr` lag is commented out in the feature cell.
metrics_base_with_lagged_hr_feats = compute_metrics(y_test, y_pred)
metrics_base_with_lagged_hr_feats

In [None]:
# Print all four metric dictionaries computed so far, one after the other.
console.print(f"Metrics (Base Model): {metrics_base}", style="info")
console.print(f"Metrics (Dummy Model): {metrics_dummy}", style="info")
console.print(
    f"Metrics (Base Model with Lagged Features): {metrics_base_with_lagged_feats}",
    style="info",
)
# NOTE(review): the "Lagged HR Features" label refers to the run whose added
# lags are on `hum` (humidity); the `hr` lag is commented out.
console.print(
    f"Metrics (Base Model with Lagged HR Features): {metrics_base_with_lagged_hr_feats}",
    style="info",
)