In [1]:
import warnings
from typing import Any

# Standard imports
import numpy as np
import pandas as pd
import polars as pl

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
from rich.console import Console
from rich.theme import Theme

# Style name -> hex colour used by the shared rich console.
theme_styles: dict[str, str] = {
    "white": "#FFFFFF",  # Bright white
    "info": "#00FF00",  # Bright green
    "warning": "#FFD700",  # Bright gold
    "error": "#FF1493",  # Deep pink
    "success": "#00FFFF",  # Cyan
    "highlight": "#FF4500",  # Orange-red
}
custom_theme = Theme(theme_styles)
console = Console(theme=custom_theme)


def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working
        directory. ``0`` selects the working directory itself.

    Returns:
    --------
    None
        The resolved absolute path is printed, not returned.

    Raises:
    -------
    ValueError
        If ``go_up`` is negative.
    """
    import os
    import sys

    if go_up < 0:
        raise ValueError(f"go_up must be non-negative, got {go_up}")

    # Anchor explicitly on the working directory, then climb `go_up` levels.
    # abspath() also normalises the trailing ".." components away.
    parent_steps = [os.pardir] * go_up
    abs_path_prev_directory = os.path.abspath(os.path.join(os.getcwd(), *parent_steps))

    # Re-running the cell must not pile up duplicate entries: drop any existing
    # occurrence before giving the directory top import priority.
    sys.path[:] = [entry for entry in sys.path if entry != abs_path_prev_directory]
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)

In [3]:
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/End-to-end-Sale-Forecasting


In [None]:
from src.utilities.data_gen import RealisticSalesDataGenerator

# Build the generator for 2022-01-01 through 2022-01-05 with a fixed seed
# (presumably for reproducible output — confirm in RealisticSalesDataGenerator).
gen_data = RealisticSalesDataGenerator(start_date="2022-01-01", end_date="2022-01-05", seed=123)
# Files are written under ../data/sales_data; the returned dict maps each
# dataset name to the list of parquet paths that were produced.
# NOTE(review): the recorded output lists sales files only for 01-02..01-04
# while customer_traffic covers 01-01..01-05 — confirm this gap is intended.
output_dict: dict[str, Any] = gen_data.generate_sales_data(output_dir="../data/sales_data")
output_dict

{'sales': ['../data/sales_data/sales/year=2022/month=01/day=02/sales_2022-01-02.parquet',
  '../data/sales_data/sales/year=2022/month=01/day=03/sales_2022-01-03.parquet',
  '../data/sales_data/sales/year=2022/month=01/day=04/sales_2022-01-04.parquet'],
 'inventory': ['../data/sales_data/inventory/year=2022/week=52/inventory_2022-01-02.parquet'],
 'customer_traffic': ['../data/sales_data/customer_traffic/year=2022/month=01/day=01/traffic_2022-01-01.parquet',
  '../data/sales_data/customer_traffic/year=2022/month=01/day=02/traffic_2022-01-02.parquet',
  '../data/sales_data/customer_traffic/year=2022/month=01/day=03/traffic_2022-01-03.parquet',
  '../data/sales_data/customer_traffic/year=2022/month=01/day=04/traffic_2022-01-04.parquet',
  '../data/sales_data/customer_traffic/year=2022/month=01/day=05/traffic_2022-01-05.parquet'],
 'promotions': ['../data/sales_data/promotions/promotions.parquet'],
 'store_events': ['../data/sales_data/store_events/events.parquet']}

In [5]:
# Total number of generated files across every dataset group.
total_files = sum(map(len, output_dict.values()))
total_files

11

In [6]:
# Load (at most) the first three daily sales files into one pandas frame.
sales_sample_paths = output_dict["sales"][:3]
temp_pandas: pd.DataFrame = pd.read_parquet(sales_sample_paths)
temp_pandas

Unnamed: 0,date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,year,month,day
0,2022-01-02,store_001,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
1,2022-01-02,store_001,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2
2,2022-01-02,store_002,HOME_001,Home,1,79,0.2,63.2,55.3,7.9,2022,1,2
3,2022-01-02,store_002,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
4,2022-01-02,store_002,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2
5,2022-01-02,store_003,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
6,2022-01-03,store_001,CLTH_003,Clothing,1,149,0.2,119.2,89.4,29.8,2022,1,3
7,2022-01-03,store_001,HOME_005,Home,1,39,0.2,31.2,23.4,7.8,2022,1,3
8,2022-01-03,store_001,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,3
9,2022-01-03,store_001,SPRT_002,Sports,1,49,0.2,39.2,26.95,12.25,2022,1,3


In [7]:
# Same sample as the pandas frame, read with Polars; `date` is cast to a
# proper Date so the `.dt` accessors can be used on it later.
temp_df: pl.DataFrame = pl.read_parquet(output_dict["sales"][:3]).with_columns(
    pl.col("date").cast(pl.Date)
)
temp_df

date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit
date,str,str,str,i64,i64,f64,f64,f64,f64
2022-01-02,"""store_001""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15
2022-01-02,"""store_001""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05
2022-01-02,"""store_002""","""HOME_001""","""Home""",1,79,0.2,63.2,55.3,7.9
2022-01-02,"""store_002""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15
2022-01-02,"""store_002""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05
2022-01-02,"""store_003""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15
2022-01-03,"""store_001""","""CLTH_003""","""Clothing""",1,149,0.2,119.2,89.4,29.8
2022-01-03,"""store_001""","""HOME_005""","""Home""",1,39,0.2,31.2,23.4,7.8
2022-01-03,"""store_001""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15
2022-01-03,"""store_001""","""SPRT_002""","""Sports""",1,49,0.2,39.2,26.95,12.25


In [None]:
import holidays

# US holiday calendar; supports `date in us_holidays` membership checks.
us_holidays = holidays.UnitedStates()

# Derive calendar features from `date`.
# `dt.weekday()` is ISO-numbered (Monday=1 ... Sunday=7), so `> 5` flags Sat/Sun.
temp_df = temp_df.with_columns(
    pl.col("date").dt.day().alias("day"),
    pl.col("date").dt.month().alias("month"),
    pl.col("date").dt.year().alias("year"),
    pl.col("date").dt.weekday().alias("day_of_week"),
    pl.col("date").dt.quarter().alias("quarter"),
    pl.col("date").dt.week().alias("week_of_year"),
    (pl.col("date").dt.weekday() > 5).alias("is_weekend").cast(pl.Int8),
    # The holiday lookup is a Python-level call, so declare the output dtype
    # explicitly instead of letting Polars infer it from the first values.
    pl.col("date")
    .map_elements(lambda x: x in us_holidays, return_dtype=pl.Boolean)
    .alias("is_holiday")
    .cast(pl.Int8),
)
temp_df

date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,day,month,year,day_of_week,quarter,week_of_year,is_weekend,is_holiday
date,str,str,str,i64,i64,f64,f64,f64,f64,i8,i8,i32,i8,i8,i8,i8,i8
2022-01-02,"""store_001""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,2,1,2022,7,1,52,1,0
2022-01-02,"""store_001""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05,2,1,2022,7,1,52,1,0
2022-01-02,"""store_002""","""HOME_001""","""Home""",1,79,0.2,63.2,55.3,7.9,2,1,2022,7,1,52,1,0
2022-01-02,"""store_002""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,2,1,2022,7,1,52,1,0
2022-01-02,"""store_002""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05,2,1,2022,7,1,52,1,0
2022-01-02,"""store_003""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,2,1,2022,7,1,52,1,0
2022-01-03,"""store_001""","""CLTH_003""","""Clothing""",1,149,0.2,119.2,89.4,29.8,3,1,2022,1,1,1,0,0
2022-01-03,"""store_001""","""HOME_005""","""Home""",1,39,0.2,31.2,23.4,7.8,3,1,2022,1,1,1,0,0
2022-01-03,"""store_001""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,3,1,2022,1,1,1,0,0
2022-01-03,"""store_001""","""SPRT_002""","""Sports""",1,49,0.2,39.2,26.95,12.25,3,1,2022,1,1,1,0,0


In [9]:
temp_pandas

Unnamed: 0,date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,year,month,day
0,2022-01-02,store_001,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
1,2022-01-02,store_001,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2
2,2022-01-02,store_002,HOME_001,Home,1,79,0.2,63.2,55.3,7.9,2022,1,2
3,2022-01-02,store_002,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
4,2022-01-02,store_002,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2
5,2022-01-02,store_003,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
6,2022-01-03,store_001,CLTH_003,Clothing,1,149,0.2,119.2,89.4,29.8,2022,1,3
7,2022-01-03,store_001,HOME_005,Home,1,39,0.2,31.2,23.4,7.8,2022,1,3
8,2022-01-03,store_001,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,3
9,2022-01-03,store_001,SPRT_002,Sports,1,49,0.2,39.2,26.95,12.25,2022,1,3


In [None]:
# Pandas prototype of the rolling features.
# Imported here so the cell also runs on a fresh kernel (the notebook's own
# `app_config` import cell sits below this one).
from src.config import app_config

# Work on a copy bound to `df`, so the `temp_pandas` frame shown in other
# cells is never modified by the column assignments below.
df = temp_pandas.copy()
# Same target/grouping that the lag-feature demo call uses further down.
target_col = "revenue"
group_cols = ["store_id"]

windows = app_config.features.rolling_features["windows"]
functions = app_config.features.rolling_features["functions"]

if group_cols:
    for window in windows:
        for func in functions:
            col_name = f"{target_col}_rolling_{window}_{func}"
            # Per-group rolling aggregate; min_periods=1 keeps the first rows
            # of each group from becoming NaN for most aggregations.
            df[col_name] = df.groupby(group_cols)[target_col].transform(
                lambda x: x.rolling(window, min_periods=1).agg(func)
            )

In [10]:
from src.config import app_config

In [None]:
def create_interaction_features(df: pd.DataFrame, categorical_cols: list[str]) -> pd.DataFrame:
    """Add one string column per unordered pair of categorical columns.

    For every pair ``(a, b)`` taken in list order, a column named
    ``"{a}_{b}_interaction"`` is added that holds both values rendered as
    strings and joined with ``"_"``. The input frame is not modified; a copy
    is returned.

    NOTE: a Polars function with the same name is defined later in this
    notebook and rebinds the name once that cell runs.
    """
    combined = df.copy()

    # All (earlier, later) column pairs, in the order the nested loop would visit them.
    column_pairs = [
        (left, right)
        for position, left in enumerate(categorical_cols)
        for right in categorical_cols[position + 1 :]
    ]

    for left, right in column_pairs:
        left_text = combined[left].astype(str)
        right_text = combined[right].astype(str)
        combined[f"{left}_{right}_interaction"] = left_text + "_" + right_text

    return combined

In [12]:
temp_pandas

Unnamed: 0,date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,year,month,day
0,2022-01-02,store_001,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
1,2022-01-02,store_001,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2
2,2022-01-02,store_002,HOME_001,Home,1,79,0.2,63.2,55.3,7.9,2022,1,2
3,2022-01-02,store_002,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
4,2022-01-02,store_002,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2
5,2022-01-02,store_003,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2
6,2022-01-03,store_001,CLTH_003,Clothing,1,149,0.2,119.2,89.4,29.8,2022,1,3
7,2022-01-03,store_001,HOME_005,Home,1,39,0.2,31.2,23.4,7.8,2022,1,3
8,2022-01-03,store_001,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,3
9,2022-01-03,store_001,SPRT_002,Sports,1,49,0.2,39.2,26.95,12.25,2022,1,3


In [13]:
create_interaction_features(temp_pandas, categorical_cols=["store_id", "product_id"])

Unnamed: 0,date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,year,month,day,store_id_product_id_interaction
0,2022-01-02,store_001,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2,store_001_SPRT_001
1,2022-01-02,store_001,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2,store_001_SPRT_002
2,2022-01-02,store_002,HOME_001,Home,1,79,0.2,63.2,55.3,7.9,2022,1,2,store_002_HOME_001
3,2022-01-02,store_002,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2,store_002_SPRT_001
4,2022-01-02,store_002,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2022,1,2,store_002_SPRT_002
5,2022-01-02,store_003,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,2,store_003_SPRT_001
6,2022-01-03,store_001,CLTH_003,Clothing,1,149,0.2,119.2,89.4,29.8,2022,1,3,store_001_CLTH_003
7,2022-01-03,store_001,HOME_005,Home,1,39,0.2,31.2,23.4,7.8,2022,1,3,store_001_HOME_005
8,2022-01-03,store_001,SPRT_001,Sports,1,29,0.2,23.2,13.05,10.15,2022,1,3,store_001_SPRT_001
9,2022-01-03,store_001,SPRT_002,Sports,1,49,0.2,39.2,26.95,12.25,2022,1,3,store_001_SPRT_002


In [None]:
def create_lag_features(df: pl.DataFrame, target_col: str, group_cols: list[str] | None = None) -> pl.DataFrame:
    """Append one shifted copy of ``target_col`` per configured lag.

    Lags are read from ``app_config.features.lag_features``. Each lag ``k``
    produces a column ``"{target_col}_lag_{k}"``. When ``group_cols`` is
    non-empty the shift is applied within each group; otherwise it runs over
    the whole frame. Rows are shifted in their current order, so the caller
    is responsible for sorting beforehand.
    """
    lagged_frame = df.clone()
    lag_values = app_config.features.lag_features

    for lag in lag_values:
        shifted = pl.col(target_col).shift(lag)
        if group_cols:
            # Restrict the shift to rows sharing the same group key(s).
            shifted = shifted.over(group_cols)
        lagged_frame = lagged_frame.with_columns(shifted.alias(f"{target_col}_lag_{lag}"))

    print(f"Created {len(lag_values)} lag features")
    return lagged_frame


def create_rolling_features(df: pl.DataFrame, target_col: str, group_cols: list[str] | None = None) -> pl.DataFrame:
    """Append rolling-window aggregates of ``target_col``.

    Windows and aggregation names are read from
    ``app_config.features.rolling_features`` (keys ``"windows"`` and
    ``"functions"``). Every (window, function) pair yields a column named
    ``"{target_col}_rolling_{window}_{func}"``.

    Parameters
    ----------
    df : pl.DataFrame
        Input frame; it is not modified.
    target_col : str
        Column to aggregate.
    group_cols : list[str] | None
        When non-empty, the rolling window is evaluated within each group;
        otherwise it runs over the whole frame in its current row order.

    Returns
    -------
    pl.DataFrame
        A new frame with the rolling columns appended.

    Notes
    -----
    Only ``"mean"``, ``"std"``, ``"min"`` and ``"max"`` are handled; any other
    configured function name is skipped, as before.
    """
    # Names that map directly onto a `pl.Expr.rolling_<name>` method.
    supported_functions = ("mean", "std", "min", "max")

    df = df.clone()
    windows = app_config.features.rolling_features["windows"]
    functions = app_config.features.rolling_features["functions"]

    for window in windows:
        for func in functions:
            if func not in supported_functions:
                continue

            # Built once for both the grouped and ungrouped paths, so the
            # output column name is always defined.
            col_name: str = f"{target_col}_rolling_{window}_{func}"
            rolled = getattr(pl.col(target_col), f"rolling_{func}")(window, min_samples=1)
            if group_cols:
                rolled = rolled.over(group_cols)

            df = df.with_columns(rolled.alias(col_name))
    return df


def create_interaction_features(df: pl.DataFrame, categorical_cols: list[str]) -> pl.DataFrame:
    """Add one string column per unordered pair of categorical columns.

    For every pair ``(a, b)`` taken in list order, a column named
    ``"{a}_{b}_interaction"`` is added holding both values cast to strings
    and joined with ``"_"``. A new frame is returned.

    NOTE: this rebinds the name of the pandas function defined earlier in
    the notebook.
    """
    combined = df.clone()

    # All (earlier, later) column pairs, in nested-loop order.
    column_pairs = [
        (left, right)
        for position, left in enumerate(categorical_cols)
        for right in categorical_cols[position + 1 :]
    ]

    for left, right in column_pairs:
        joined = pl.col(left).cast(pl.Utf8) + "_" + pl.col(right).cast(pl.Utf8)
        combined = combined.with_columns(joined.alias(f"{left}_{right}_interaction"))

    return combined

In [15]:
create_interaction_features(temp_df, categorical_cols=["store_id", "product_id"])

date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,day,month,year,day_of_week,quarter,week_of_year,is_weekend,is_holiday,store_id_product_id_interaction
date,str,str,str,i64,i64,f64,f64,f64,f64,i8,i8,i32,i8,i8,i8,i8,i8,str
2022-01-02,"""store_001""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,2,1,2022,7,1,52,1,0,"""store_001_SPRT_001"""
2022-01-02,"""store_001""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05,2,1,2022,7,1,52,1,0,"""store_001_SPRT_002"""
2022-01-02,"""store_002""","""HOME_001""","""Home""",1,79,0.2,63.2,55.3,7.9,2,1,2022,7,1,52,1,0,"""store_002_HOME_001"""
2022-01-02,"""store_002""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,2,1,2022,7,1,52,1,0,"""store_002_SPRT_001"""
2022-01-02,"""store_002""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05,2,1,2022,7,1,52,1,0,"""store_002_SPRT_002"""
2022-01-02,"""store_003""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,2,1,2022,7,1,52,1,0,"""store_003_SPRT_001"""
2022-01-03,"""store_001""","""CLTH_003""","""Clothing""",1,149,0.2,119.2,89.4,29.8,3,1,2022,1,1,1,0,0,"""store_001_CLTH_003"""
2022-01-03,"""store_001""","""HOME_005""","""Home""",1,39,0.2,31.2,23.4,7.8,3,1,2022,1,1,1,0,0,"""store_001_HOME_005"""
2022-01-03,"""store_001""","""SPRT_001""","""Sports""",1,29,0.2,23.2,13.05,10.15,3,1,2022,1,1,1,0,0,"""store_001_SPRT_001"""
2022-01-03,"""store_001""","""SPRT_002""","""Sports""",1,49,0.2,39.2,26.95,12.25,3,1,2022,1,1,1,0,0,"""store_001_SPRT_002"""


In [None]:
result_df = create_lag_features(temp_df, target_col="revenue", group_cols=["store_id"])
result_df

In [None]:
console.print(app_config.features.lag_features)

In [None]:
class FeatureEngineer:
    """Build date, lag, rolling and interaction features on Polars frames.

    Feature settings come from ``app_config.features``. The validation
    settings from ``app_config.validation`` are stored on the instance but
    are not read by any method defined here.
    """

    # Aggregation names that map directly onto a `pl.Expr.rolling_<name>` method.
    _ROLLING_FUNCTIONS: tuple[str, ...] = ("mean", "std", "min", "max")

    def __init__(self) -> None:
        self.feature_config = app_config.features
        self.validation_config = app_config.validation

    def create_date_features(self, df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
        """Create calendar features from a date column.

        ``date_col`` is converted to ``pl.Date`` (parsed as ``%Y-%m-%d`` when
        it is a string column, cast otherwise). When ``"date"`` is listed in
        the configured ``date_features``, the columns ``day``, ``month``,
        ``year``, ``day_of_week``, ``quarter``, ``week_of_year``,
        ``is_weekend`` and ``is_holiday`` are added.

        Relies on the notebook-level ``us_holidays`` calendar being defined
        before this method is called.
        """
        date_features: list[str] = self.feature_config.date_features
        df = df.clone()

        # Convert to a Date column. Only string columns need parsing; a column
        # that is already temporal would make `.str.strptime` fail.
        if df.schema[date_col] == pl.Utf8:
            df = df.with_columns(pl.col(date_col).str.strptime(pl.Date, format="%Y-%m-%d"))
        else:
            df = df.with_columns(pl.col(date_col).cast(pl.Date))

        # NOTE(review): this gates all calendar features on the literal name
        # "date" being present in the config list — confirm that is intended.
        if "date" in date_features:
            date_expr = pl.col(date_col)
            return df.with_columns(
                date_expr.dt.day().alias("day"),
                date_expr.dt.month().alias("month"),
                date_expr.dt.year().alias("year"),
                date_expr.dt.weekday().alias("day_of_week"),
                date_expr.dt.quarter().alias("quarter"),
                date_expr.dt.week().alias("week_of_year"),
                # ISO weekday numbering: 6 and 7 are Saturday and Sunday.
                (date_expr.dt.weekday() > 5).alias("is_weekend").cast(pl.Int8),
                # Python-level lookup, so the output dtype is declared explicitly.
                date_expr.map_elements(lambda x: x in us_holidays, return_dtype=pl.Boolean)
                .alias("is_holiday")
                .cast(pl.Int8),
            )
        return df

    def create_lag_features(self, df: pl.DataFrame, target_col: str, group_cols: list[str] | None = None) -> pl.DataFrame:
        """Append one shifted copy of ``target_col`` per configured lag.

        Each lag ``k`` in ``feature_config.lag_features`` yields a column
        ``"{target_col}_lag_{k}"``. With a non-empty ``group_cols`` the shift
        is applied within each group, otherwise over the whole frame. Rows are
        shifted in their current order.
        """
        df = df.clone()
        lag_values = self.feature_config.lag_features

        for lag in lag_values:
            shifted = pl.col(target_col).shift(lag)
            if group_cols:
                shifted = shifted.over(group_cols)
            df = df.with_columns(shifted.alias(f"{target_col}_lag_{lag}"))

        print(f"Created {len(lag_values)} lag features")
        return df

    def create_rolling_features(
        self, df: pl.DataFrame, target_col: str, group_cols: list[str] | None = None
    ) -> pl.DataFrame:
        """Append rolling-window aggregates of ``target_col``.

        Every (window, function) pair from ``feature_config.rolling_features``
        yields a column ``"{target_col}_rolling_{window}_{func}"``. Only the
        functions in ``_ROLLING_FUNCTIONS`` are handled; other configured
        names are skipped. With a non-empty ``group_cols`` the window is
        evaluated within each group.
        """
        df = df.clone()
        windows = self.feature_config.rolling_features["windows"]
        functions = self.feature_config.rolling_features["functions"]

        for window in windows:
            for func in functions:
                if func not in self._ROLLING_FUNCTIONS:
                    continue

                # Built once for both the grouped and ungrouped paths, so the
                # output column name is always defined.
                col_name: str = f"{target_col}_rolling_{window}_{func}"
                rolled = getattr(pl.col(target_col), f"rolling_{func}")(window, min_samples=1)
                if group_cols:
                    rolled = rolled.over(group_cols)

                df = df.with_columns(rolled.alias(col_name))
        return df

    def create_interaction_features(self, df: pl.DataFrame, categorical_cols: list[str]) -> pl.DataFrame:
        """Add a ``"{a}_{b}_interaction"`` string column for every column pair.

        Pairs are taken in list order (each column with every later one); the
        two values are cast to strings and joined with ``"_"``.
        """
        df = df.clone()

        for i, col1 in enumerate(categorical_cols):
            for col2 in categorical_cols[i + 1 :]:
                df = df.with_columns(
                    (pl.col(col1).cast(pl.Utf8) + "_" + pl.col(col2).cast(pl.Utf8)).alias(f"{col1}_{col2}_interaction")
                )

        return df


In [None]:
set(app_config.features.date_features).issubset(temp_df.columns)

In [None]:
app_config.features.date_features