In [None]:
import warnings
from typing import Any

# Standard imports
import numpy as np
import pandas as pd
import polars as pl

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
console = Console(theme=custom_theme)


def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """This is used to up a number of directories.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current directory.

    Returns:
    --------
    None
    """
    import os
    import sys

    CONST: str = "../"
    NUM: str = CONST * go_up

    # Goto the previous directory
    prev_directory = os.path.join(os.path.dirname(__name__), NUM)
    # Get the 'absolute path' of the previous directory
    abs_path_prev_directory = os.path.abspath(prev_directory)

    # Add the path to the System paths
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)

In [3]:
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/End-to-end-Sale-Forecasting


In [None]:
from src.utilities.data_gen import RealisticSalesDataGenerator

gen_data = RealisticSalesDataGenerator(start_date="2022-01-01", end_date="2022-01-05", seed=123)
output_dict: dict[str, Any] = gen_data.generate_sales_data(output_dir="../data/sales_data")
output_dict

{'sales': ['../data/sales_data/sales/year=2022/month=01/day=02/sales_2022-01-02.parquet',
  '../data/sales_data/sales/year=2022/month=01/day=03/sales_2022-01-03.parquet',
  '../data/sales_data/sales/year=2022/month=01/day=04/sales_2022-01-04.parquet'],
 'inventory': ['../data/sales_data/inventory/year=2022/week=52/inventory_2022-01-02.parquet'],
 'customer_traffic': ['../data/sales_data/customer_traffic/year=2022/month=01/day=01/traffic_2022-01-01.parquet',
  '../data/sales_data/customer_traffic/year=2022/month=01/day=02/traffic_2022-01-02.parquet',
  '../data/sales_data/customer_traffic/year=2022/month=01/day=03/traffic_2022-01-03.parquet',
  '../data/sales_data/customer_traffic/year=2022/month=01/day=04/traffic_2022-01-04.parquet',
  '../data/sales_data/customer_traffic/year=2022/month=01/day=05/traffic_2022-01-05.parquet'],
 'promotions': ['../data/sales_data/promotions/promotions.parquet'],
 'store_events': ['../data/sales_data/store_events/events.parquet']}

In [36]:
total_files = sum(len(paths) for paths in output_dict.values())
total_files

11

In [None]:
temp_pandas: pd.DataFrame = pd.read_parquet(output_dict["sales"][:3])
temp_pandas

Unnamed: 0,date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,year,month,day
0,2022-01-02,store_002,HOME_005,Home,1,39,0.2,31.2,23.4,7.8,2022,1,2
1,2022-01-02,store_002,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
2,2022-01-02,store_002,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2
3,2022-01-02,store_003,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
4,2022-01-02,store_006,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
5,2022-01-02,store_009,HOME_005,Home,1,39,0.2,31.2,23.4,7.8,2022,1,2
6,2022-01-02,store_009,SPRT_001,Sports,2,29,0.2,46.4,26.1,20.3,2022,1,2
7,2022-01-02,store_009,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2
8,2022-01-03,store_002,SPRT_002,Sports,1,49,0.2,39.2,26.95,12.25,2022,1,3
9,2022-01-04,store_002,HOME_005,Home,1,39,0.2,31.2,23.4,7.8,2022,1,4


In [57]:
temp_df: pl.DataFrame = pl.read_parquet(output_dict["sales"][:3])
temp_df = temp_df.with_columns(pl.col("date").cast(pl.Date).alias("date"))
temp_df

date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit
date,str,str,str,i64,i64,f64,f64,f64,f64
2022-01-02,"""store_002""","""HOME_005""","""Home""",1,39,0.2,31.2,23.4,7.8
2022-01-02,"""store_002""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15
2022-01-02,"""store_002""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05
2022-01-02,"""store_003""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15
2022-01-02,"""store_006""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15
2022-01-02,"""store_009""","""HOME_005""","""Home""",1,39,0.2,31.2,23.4,7.8
2022-01-02,"""store_009""","""SPRT_001""","""Sports""",2,29,0.2,46.4,26.1,20.3
2022-01-02,"""store_009""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05
2022-01-03,"""store_002""","""SPRT_002""","""Sports""",1,49,0.2,39.2,26.95,12.25
2022-01-04,"""store_002""","""HOME_005""","""Home""",1,39,0.2,31.2,23.4,7.8


In [None]:
import holidays

us_holidays = holidays.UnitedStates()

temp_df = temp_df.with_columns(
    pl.col("date").dt.day().alias("day"),
    pl.col("date").dt.month().alias("month"),
    pl.col("date").dt.year().alias("year"),
    pl.col("date").dt.weekday().alias("day_of_week"),
    pl.col("date").dt.quarter().alias("quarter"),
    pl.col("date").dt.week().alias("week_of_year"),
    (pl.col("date").dt.weekday() > 5).alias("is_weekend").cast(pl.Int8),
    pl.col("date").map_elements(lambda x: x in us_holidays).alias("is_holiday").cast(pl.Int8),
)
temp_df

date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,day,month,year,day_of_week,quarter,week_of_year,is_weekend,is_holiday
date,str,str,str,i64,i64,f64,f64,f64,f64,i8,i8,i32,i8,i8,i8,i8,i8
2022-01-02,"""store_002""","""HOME_005""","""Home""",1,39,0.2,31.2,23.4,7.8,2,1,2022,7,1,52,1,0
2022-01-02,"""store_002""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,2,1,2022,7,1,52,1,0
2022-01-02,"""store_002""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05,2,1,2022,7,1,52,1,0
2022-01-02,"""store_003""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,2,1,2022,7,1,52,1,0
2022-01-02,"""store_006""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,2,1,2022,7,1,52,1,0
2022-01-02,"""store_009""","""HOME_005""","""Home""",1,39,0.2,31.2,23.4,7.8,2,1,2022,7,1,52,1,0
2022-01-02,"""store_009""","""SPRT_001""","""Sports""",2,29,0.2,46.4,26.1,20.3,2,1,2022,7,1,52,1,0
2022-01-02,"""store_009""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05,2,1,2022,7,1,52,1,0
2022-01-03,"""store_002""","""SPRT_002""","""Sports""",1,49,0.2,39.2,26.95,12.25,3,1,2022,1,1,1,0,0
2022-01-04,"""store_002""","""HOME_005""","""Home""",1,39,0.2,31.2,23.4,7.8,4,1,2022,2,1,1,0,0


In [69]:
temp_pandas

Unnamed: 0,date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,year,month,day
0,2022-01-02,store_002,HOME_005,Home,1,39,0.2,31.2,23.4,7.8,2022,1,2
1,2022-01-02,store_002,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
2,2022-01-02,store_002,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2
3,2022-01-02,store_003,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
4,2022-01-02,store_006,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
5,2022-01-02,store_009,HOME_005,Home,1,39,0.2,31.2,23.4,7.8,2022,1,2
6,2022-01-02,store_009,SPRT_001,Sports,2,29,0.2,46.4,26.1,20.3,2022,1,2
7,2022-01-02,store_009,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2
8,2022-01-03,store_002,SPRT_002,Sports,1,49,0.2,39.2,26.95,12.25,2022,1,3
9,2022-01-04,store_002,HOME_005,Home,1,39,0.2,31.2,23.4,7.8,2022,1,4


In [71]:
temp_pandas = temp_pandas.copy()
windows = app_config.features.rolling_features["windows"]
functions = app_config.features.rolling_features["functions"]

if group_cols:
    for window in windows:
        for func in functions:
            col_name = f"{target_col}_rolling_{window}_{func}"
            df[col_name] = df.groupby(group_cols)[target_col].transform(
                lambda x: x.rolling(window, min_periods=1).agg(func)
            )

NameError: name 'group_cols' is not defined

In [7]:
from src.config import app_config

In [67]:
console.print(app_config.features.rolling_features)

In [None]:
class FeatureEngineer:
    def __init__(self) -> None:
        self.feature_config = app_config.features
        self.validation_config = app_config.validation

    def create_date_features(self, df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
        """Create date features from a date column."""
        date_features: list[str] = self.feature_config.date_features
        df = df.clone()

        # Convert to datetime
        df = df.with_columns(pl.col(date_col).str.strptime(pl.Date, fmt="%Y-%m-%d"))
        if "date" in date_features:
            return df.with_columns(
                pl.col("date").dt.day().alias("day"),
                pl.col("date").dt.month().alias("month"),
                pl.col("date").dt.year().alias("year"),
                pl.col("date").dt.weekday().alias("day_of_week"),
                pl.col("date").dt.quarter().alias("quarter"),
                pl.col("date").dt.week().alias("week_of_year"),
                (pl.col("date").dt.weekday() > 5).alias("is_weekend").cast(pl.Int8),
                pl.col("date").map_elements(lambda x: x in us_holidays).alias("is_holiday").cast(pl.Int8),
            )
        return df

    def create_lag_features(self, df: pl.DataFrame, target_col: str, group_cols: list[str] | None = None) -> pl.DataFrame:
        """Create lag features for the sales data."""
        df = df.clone()
        windows = self.feature_config.rolling_features["windows"]
        functions = self.feature_config.rolling_features["functions"]
        if group_cols:
            for window in windows:
                for func in functions:
                    pass
                    



In [61]:
set(app_config.features.date_features).issubset(temp_df.columns)

False

In [62]:
app_config.features.date_features

['year',
 'month',
 'day',
 'dayofweek',
 'quarter',
 'weekofyear',
 'is_weekend',
 'is_holiday']

In [None]:
import holidays

us_holidays = holidays.UnitedStates()

temp_df = temp_df.with_columns(
    pl.col("date").dt.day().alias("day"),
    pl.col("date").dt.month().alias("month"),
    pl.col("date").dt.year().alias("year"),
    pl.col("date").dt.weekday().alias("day_of_week"),
    pl.col("date").dt.quarter().alias("quarter"),
    pl.col("date").dt.week().alias("week_of_year"),
    (pl.col("date").dt.weekday() > 5).alias("is_weekend").cast(pl.Int8),
    pl.col("date").map_elements(lambda x: x in us_holidays).alias("is_holiday").cast(pl.Int8),
)
temp_df