# Lab For Experimentation

In [1]:
import os
import warnings
from typing import Any, Literal

import narwhals as nw
import numpy as np
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named colour styles for console output; a style is selected by name,
# e.g. `console.print(..., style="info")`.
custom_theme = Theme(
    dict(
        white="#FFFFFF",  # bright white
        info="#00FF00",  # bright green
        warning="#FFD700",  # bright gold
        error="#FF1493",  # deep pink
        success="#00FFFF",  # cyan
        highlight="#FF4500",  # orange-red
    )
)
# Console instance bound to the palette above.
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings: print floating-point values with 4 digits of precision
np.set_printoptions(precision=4)

# Pandas settings: raise the display limits (rows, columns, cell width) so
# frames are shown with far less truncation
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings: same idea for polars tables (string length, columns, rows)
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

# NOTE(review): this installs a blanket "ignore" filter for every warning
# category for the rest of the session, deprecation warnings included.
# Consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
# NOTE(review): labelled optional, but the extension is loaded unconditionally,
# so `lab_black` presumably has to be installed for this cell to run cleanly.
%load_ext lab_black

# Auto reload imports: mode 2 re-imports modules before each cell execution,
# so edits to local modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    The target directory is resolved relative to the process's current working
    directory, inserted at position 0 of ``sys.path`` and printed.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working directory.
        ``0`` (or a negative value) selects the working directory itself.

    Returns:
    --------
    None
        The resolved absolute path is printed, not returned.
    """
    import os
    import sys

    # Anchor explicitly on the working directory. (`__name__` is a module name,
    # not a file path, so it cannot be used to locate a directory.)
    levels = [os.pardir] * go_up
    target_dir = os.path.abspath(os.path.join(os.getcwd(), *levels))

    # Drop any earlier copy first, so re-running the cell moves the entry to the
    # front instead of piling up duplicates in `sys.path`.
    while target_dir in sys.path:
        sys.path.remove(target_dir)
    sys.path.insert(0, target_dir)
    print(target_dir)


# Demo: bare annotations (no value is assigned) whose only purpose is to
# reference `Any` and `Literal`, so ruff does not remove the otherwise unused
# `typing` import.
name: Any
category: Literal["A", "B", "C"]

In [3]:
# Add the parent of the working directory to `sys.path` (the resolved path is
# printed below) — presumably so the `src` package imported later resolves.
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


# Table of Contents
- [Dataset Preparation](#Dataset-Preparation)
  - [Load Data](#load-data)
  - [Validate Data](#validate-data)
- [Baseline Model](#Baseline-Model)
- [Baseline Model With Lagged Target Features](#baseline-model-with-lagged-target-features)
- [Add More Features](#add-more-features)
- [Gradient Boosting With Hyperparameters Tuning](#Gradient-Boosting-with-Hyperparameters-Tuning)
- [Conclusions](#Conclusions)

# Dataset Preparation

### Load Data

In [5]:
# The parquet dump is referenced by a relative path that climbs out of the
# project directory, which only resolves on a machine with the same layout.
# The BIKE_DATA_PATH environment variable overrides the location; when it is
# not set, the original relative path is used unchanged.
DEFAULT_DATA_PATH: str = "../../../../Documents/data_dump/bike_data/database.parquet"
fp: str = os.environ.get("BIKE_DATA_PATH", DEFAULT_DATA_PATH)
data: pl.DataFrame = pl.read_parquet(fp)
console.print(f"Shape: {data.shape}", style="info")

# Preview the first rows
data.head()

datetime,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64
"""2011-01-01 00:00:00""",1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
"""2011-01-01 01:00:00""",1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
"""2011-01-01 02:00:00""",1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
"""2011-01-01 03:00:00""",1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
"""2011-01-01 04:00:00""",1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


### Validate Data

In [None]:
from src.ml.feature_engineering import (
    FeatureConfig,
    FeatureEngineer,
    InteractionFeats,
    Lags,
    Windows,
    create_lag_features,
)

# Disabled experiment (temporal train/test split).
# NOTE(review): `split_temporal_data` is not among the names imported above, so
# it needs an import before this can be re-enabled.
# train_data, test_data = split_temporal_data(data, test_size=0.2)
# train_data.shape, test_data.shape

# Quick check of the standalone lag helper: the polars frame is converted to
# pandas and wrapped with narwhals before being passed in. Only the first rows
# are displayed; the result is left as a narwhals frame (`.to_native()` is
# deliberately commented out).
create_lag_features(
    nw.from_native(data.to_pandas()), target_col="cnt", lags=[1, 2, 3]
).head()  # .to_native()

┌─────────────────────────────────────────────────────────────────────────────┐
|                             Narwhals DataFrame                              |
|-----------------------------------------------------------------------------|
|              datetime  season  yr  mnth  hr  holiday  weekday  workingday  \|
|0  2011-01-01 00:00:00       1   0     1   0        0        6           0   |
|1  2011-01-01 01:00:00       1   0     1   1        0        6           0   |
|2  2011-01-01 02:00:00       1   0     1   2        0        6           0   |
|3  2011-01-01 03:00:00       1   0     1   3        0        6           0   |
|4  2011-01-01 04:00:00       1   0     1   4        0        6           0   |
|                                                                             |
|   weathersit  temp   atemp   hum  windspeed  casual  registered  cnt  \     |
|0           1  0.24  0.2879  0.81        0.0       3          13   16        |
|1           1  0.22  0.2727  0.80      

In [31]:
# Declarative feature specification. `cnt` and `temp` receive the same lag,
# difference and rolling-window settings, so those lists are generated per
# feature; the two interaction terms differ only in the operation.
config: FeatureConfig = FeatureConfig(
    lags=[Lags(feature=col, lags=[1, 2, 3]) for col in ("cnt", "temp")],
    diffs=[Lags(feature=col, lags=[1]) for col in ("cnt", "temp")],
    interactions=[
        InteractionFeats(feature_1="cnt", feature_2="temp", operation=op)
        for op in ("add", "multiply")
    ],
    rolling_windows=[
        Windows(feature=col, window=[3, 7]) for col in ("cnt", "temp")
    ],
    drop_feats=["atemp", "windspeed"],
)

# # Lags
# for lag in config.lags:
#     df = create_lag_features(
#         nw.from_native(data.to_pandas()), target_col=lag.feature, lags=lag.lags
#     )
# # Diffs
# for diff in config.diffs:
#     df = create_difference_features(
#         nw.from_native(df.to_pandas()), target_col=diff.feature, lags=diff.lags
#     )
# # Interactions
# for interaction in config.interactions:
#     df = create_interaction_features(
#         nw.from_native(df.to_pandas()),
#         feature_1=interaction.feature_1,
#         feature_2=interaction.feature_2,
#         operation=interaction.operation,
#     )
# df.head()

In [32]:
# Run the feature pipeline on the loaded frame using `config`.
feat_eng = FeatureEngineer(data, config)

# Preview the engineered frame. The output also contains columns that are not
# listed in `config` (e.g. `is_weekend`, `sin_hour`, `is_peak_hour`), so the
# engineer presumably adds a set of built-in features as well — confirm in
# `src.ml.feature_engineering`.
feat_eng.create_all_features().head()

datetime,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,casual,registered,cnt,is_weekend,sin_hour,cos_hour,sin_weekday,cos_weekday,cnt_lag_1hr,cnt_lag_2hr,cnt_lag_3hr,temp_lag_1hr,temp_lag_2hr,temp_lag_3hr,cnt_roll_mean_3hr,cnt_roll_std_3hr,cnt_roll_mean_7hr,cnt_roll_std_7hr,temp_roll_mean_3hr,temp_roll_std_3hr,temp_roll_mean_7hr,temp_roll_std_7hr,cnt_plus_temp,cnt_times_temp,cnt_diff_1hr,temp_diff_1hr,is_high_temp,is_high_hum,is_peak_hour,is_working_hour,is_business_hour
str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,i64,i64,i64,i8,f64,f64,f64,f64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i8,i8,i8,i8,i8
"""2011-01-01 00:00:00""",1,0,1,0,0,6,0,1,0.24,0.81,3,13,16,1,0.0,1.0,-0.781831,0.62349,,,,,,,,,,,,,,,16.24,3.84,,,0,0,0,0,0
"""2011-01-01 01:00:00""",1,0,1,1,0,6,0,1,0.22,0.8,8,32,40,1,0.258819,0.965926,-0.781831,0.62349,16.0,,,0.24,,,,,,,,,,,40.22,8.8,24.0,-0.02,0,0,0,0,0
"""2011-01-01 02:00:00""",1,0,1,2,0,6,0,1,0.22,0.8,5,27,32,1,0.5,0.866025,-0.781831,0.62349,40.0,16.0,,0.22,0.24,,29.333333,12.220202,,,0.226667,0.011547,,,32.22,7.04,-8.0,0.0,0,0,0,0,0
"""2011-01-01 03:00:00""",1,0,1,3,0,6,0,1,0.24,0.75,3,10,13,1,0.707107,0.707107,-0.781831,0.62349,32.0,40.0,16.0,0.22,0.22,0.24,28.333333,13.868429,,,0.226667,0.011547,,,13.24,3.12,-19.0,0.02,0,0,0,0,0
"""2011-01-01 04:00:00""",1,0,1,4,0,6,0,1,0.24,0.75,0,1,1,1,0.866025,0.5,-0.781831,0.62349,13.0,32.0,40.0,0.24,0.22,0.22,15.333333,15.631165,,,0.233333,0.011547,,,1.24,0.24,-12.0,0.0,0,0,0,0,0
