In [1]:
import warnings
from typing import Any

# Standard imports
import numpy as np
import pandas as pd
import polars as pl

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
console = Console(theme=custom_theme)


def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working directory.
        Values of 0 or less select the working directory itself.

    Returns:
    --------
    None
        The resolved absolute path is printed and inserted at ``sys.path[0]``.
    """
    import os
    import sys

    # Resolve relative to the working directory explicitly. The previous
    # ``os.path.dirname(__name__)`` evaluated to "" (``__name__`` is a module
    # name, not a path), so it only worked by accident.
    levels = [os.pardir] * max(go_up, 0)
    abs_path_prev_directory = os.path.abspath(os.path.join(os.getcwd(), *levels))

    # Notebook cells get re-run: drop any earlier copies so the path appears
    # exactly once, then give it top priority.
    sys.path[:] = [entry for entry in sys.path if entry != abs_path_prev_directory]
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)

In [3]:
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/End-to-end-Sale-Forecasting


In [4]:
import httpx

# Connectivity smoke test: fetch the posts collection from the public
# JSONPlaceholder demo API and show the first three entries.
url: str = "https://jsonplaceholder.typicode.com/posts"

# 10-second timeout so an unreachable network fails fast instead of hanging the cell.
response = httpx.get(url, timeout=10)
response.raise_for_status()  # Raise httpx.HTTPStatusError on 4xx/5xx responses
# Printed through the themed rich console using the "info" style.
console.print(response.json()[:3], style="info")

In [5]:
from src.utilities.data_gen import RealisticSalesDataGenerator

# Generate synthetic data for 2025-08-29 .. 2025-09-04 with a fixed seed.
gen_data = RealisticSalesDataGenerator(start_date="2025-08-29", end_date="2025-09-04", seed=123)
# Files are written under ../data/sales_data; the returned dict maps a dataset
# name ("sales", "inventory", "customer_traffic", ...) to the list of parquet
# paths that were written for it.
output_dict: dict[str, Any] = gen_data.generate_sales_data(output_dir="../data/sales_data")
output_dict

2025-09-02 20:15:17 - data_generation - [INFO] - Generating data for 2025-08-29
2025-09-02 20:15:17 - data_generation - [INFO] - Generating data for 2025-08-30
2025-09-02 20:15:17 - data_generation - [INFO] - Generating data for 2025-08-31
2025-09-02 20:15:17 - data_generation - [INFO] - Generating data for 2025-09-01
2025-09-02 20:15:17 - data_generation - [INFO] - Generating data for 2025-09-02
2025-09-02 20:15:17 - data_generation - [INFO] - Generating data for 2025-09-03
2025-09-02 20:15:17 - data_generation - [INFO] - Generating data for 2025-09-04
2025-09-02 20:15:17 - data_generation - [INFO] - Generated 15 files
2025-09-02 20:15:17 - data_generation - [INFO] - Sales files: 5
2025-09-02 20:15:17 - data_generation - [INFO] - Output directory: ../data/sales_data


{'sales': ['../data/sales_data/sales/year=2025/month=08/day=30/sales_2025-08-30.parquet',
  '../data/sales_data/sales/year=2025/month=08/day=31/sales_2025-08-31.parquet',
  '../data/sales_data/sales/year=2025/month=09/day=01/sales_2025-09-01.parquet',
  '../data/sales_data/sales/year=2025/month=09/day=02/sales_2025-09-02.parquet',
  '../data/sales_data/sales/year=2025/month=09/day=03/sales_2025-09-03.parquet'],
 'inventory': ['../data/sales_data/inventory/year=2025/week=35/inventory_2025-08-31.parquet'],
 'customer_traffic': ['../data/sales_data/customer_traffic/year=2025/month=08/day=29/traffic_2025-08-29.parquet',
  '../data/sales_data/customer_traffic/year=2025/month=08/day=30/traffic_2025-08-30.parquet',
  '../data/sales_data/customer_traffic/year=2025/month=08/day=31/traffic_2025-08-31.parquet',
  '../data/sales_data/customer_traffic/year=2025/month=09/day=01/traffic_2025-09-01.parquet',
  '../data/sales_data/customer_traffic/year=2025/month=09/day=02/traffic_2025-09-02.parquet',


In [6]:
total_files = sum(len(paths) for paths in output_dict.values())
total_files

15

In [7]:
temp_pandas: pd.DataFrame = pd.read_parquet(output_dict["sales"][:3])
temp_pandas

Unnamed: 0,date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,year,month,day
0,2025-08-30,store_001,HOME_005,Home,1,39,0.0,39.0,23.4,15.6,2025,8,30
1,2025-08-30,store_001,SPRT_005,Sports,1,89,0.0,89.0,57.85,31.15,2025,8,30
2,2025-08-30,store_002,CLTH_001,Clothing,1,29,0.0,29.0,14.5,14.5,2025,8,30
3,2025-08-31,store_001,CLTH_001,Clothing,1,29,0.0,29.0,14.5,14.5,2025,8,31
4,2025-08-31,store_001,CLTH_004,Clothing,1,89,0.0,89.0,46.28,42.72,2025,8,31
5,2025-08-31,store_001,HOME_002,Home,1,49,0.0,49.0,31.85,17.15,2025,8,31
6,2025-08-31,store_001,HOME_005,Home,1,39,0.0,39.0,23.4,15.6,2025,8,31
7,2025-08-31,store_001,SPRT_001,Sports,1,29,0.0,29.0,13.05,15.95,2025,8,31
8,2025-08-31,store_001,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2025,8,31
9,2025-09-01,store_002,CLTH_002,Clothing,1,79,0.15,67.15,43.45,23.7,2025,9,1


In [8]:
temp_df: pl.DataFrame = pl.read_parquet(output_dict["sales"][:3])
temp_df = temp_df.with_columns(pl.col("date").cast(pl.Date).alias("date"))
temp_df

date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit
date,str,str,str,i64,i64,f64,f64,f64,f64
2025-08-30,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6
2025-08-30,"""store_001""","""SPRT_005""","""Sports""",1,89,0.0,89.0,57.85,31.15
2025-08-30,"""store_002""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5
2025-08-31,"""store_001""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5
2025-08-31,"""store_001""","""CLTH_004""","""Clothing""",1,89,0.0,89.0,46.28,42.72
2025-08-31,"""store_001""","""HOME_002""","""Home""",1,49,0.0,49.0,31.85,17.15
2025-08-31,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6
2025-08-31,"""store_001""","""SPRT_001""","""Sports""",1,29,0.0,29.0,13.05,15.95
2025-08-31,"""store_001""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05
2025-09-01,"""store_002""","""CLTH_002""","""Clothing""",1,79,0.15,67.15,43.45,23.7


In [9]:
import holidays

# Holiday calendar used for the is_holiday flag below.
us_holidays = holidays.UnitedStates()

# Polars numbers weekdays 1 (Monday) .. 7 (Sunday); subtracting 1 gives the
# zero-based convention (Monday=0 .. Sunday=6) used for day_of_week/is_weekend.
temp_df = temp_df.with_columns(
    pl.col("date").dt.day().alias("day"),
    pl.col("date").dt.month().alias("month"),
    pl.col("date").dt.year().alias("year"),
    (pl.col("date").dt.weekday() - 1).alias("day_of_week"),
    pl.col("date").dt.quarter().alias("quarter"),
    pl.col("date").dt.week().alias("week_of_year"),
    ((pl.col("date").dt.weekday() - 1) >= 5).alias("is_weekend").cast(pl.Int8),
    # The holiday lookup is a Python-level membership test, so it has to go
    # through map_elements; declaring return_dtype avoids dtype inference
    # from the first returned value.
    pl.col("date")
    .map_elements(lambda x: x in us_holidays, return_dtype=pl.Boolean)
    .alias("is_holiday")
    .cast(pl.Int8),
)
temp_df

date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,day,month,year,day_of_week,quarter,week_of_year,is_weekend,is_holiday
date,str,str,str,i64,i64,f64,f64,f64,f64,i8,i8,i32,i8,i8,i8,i8,i8
2025-08-30,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6,30,8,2025,5,3,35,1,0
2025-08-30,"""store_001""","""SPRT_005""","""Sports""",1,89,0.0,89.0,57.85,31.15,30,8,2025,5,3,35,1,0
2025-08-30,"""store_002""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5,30,8,2025,5,3,35,1,0
2025-08-31,"""store_001""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""CLTH_004""","""Clothing""",1,89,0.0,89.0,46.28,42.72,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""HOME_002""","""Home""",1,49,0.0,49.0,31.85,17.15,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""SPRT_001""","""Sports""",1,29,0.0,29.0,13.05,15.95,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05,31,8,2025,6,3,35,1,0
2025-09-01,"""store_002""","""CLTH_002""","""Clothing""",1,79,0.15,67.15,43.45,23.7,1,9,2025,0,3,36,0,1


In [10]:
temp_pandas

Unnamed: 0,date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,year,month,day
0,2025-08-30,store_001,HOME_005,Home,1,39,0.0,39.0,23.4,15.6,2025,8,30
1,2025-08-30,store_001,SPRT_005,Sports,1,89,0.0,89.0,57.85,31.15,2025,8,30
2,2025-08-30,store_002,CLTH_001,Clothing,1,29,0.0,29.0,14.5,14.5,2025,8,30
3,2025-08-31,store_001,CLTH_001,Clothing,1,29,0.0,29.0,14.5,14.5,2025,8,31
4,2025-08-31,store_001,CLTH_004,Clothing,1,89,0.0,89.0,46.28,42.72,2025,8,31
5,2025-08-31,store_001,HOME_002,Home,1,49,0.0,49.0,31.85,17.15,2025,8,31
6,2025-08-31,store_001,HOME_005,Home,1,39,0.0,39.0,23.4,15.6,2025,8,31
7,2025-08-31,store_001,SPRT_001,Sports,1,29,0.0,29.0,13.05,15.95,2025,8,31
8,2025-08-31,store_001,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2025,8,31
9,2025-09-01,store_002,CLTH_002,Clothing,1,79,0.15,67.15,43.45,23.7,2025,9,1


In [12]:
from src.config import app_config

In [14]:
import polars.selectors as cs

n_cols: list[str] = temp_df.select(cs.numeric()).columns
# for col in n_cols:
#     if temp_df[col].is_null().any():
#         console.print(f"Column '{col}' has missing values.", style="warning")
#     else:
#         console.print(f"Column '{col}' has no missing values.", style="success")

In [15]:
import polars.selectors as cs
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder


class FeatureEngineer:
    """Feature-engineering helpers for a Polars sales frame.

    Lag offsets, rolling windows/functions and the date-feature switch are read
    from ``app_config.features`` when the instance is created.
    """

    # Function names accepted in ``rolling_features["functions"]`` mapped to the
    # Polars expression method implementing them. Unknown names are ignored.
    _ROLLING_METHODS = {
        "mean": "rolling_mean",
        "std": "rolling_std",
        "min": "rolling_min",
        "max": "rolling_max",
    }

    def __init__(self) -> None:
        self.feature_config = app_config.features
        self.validation_config = app_config.validation

    def create_date_features(self, df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
        """Create calendar features from a date column.

        Parameters
        ----------
        df : pl.DataFrame
            Input frame; it is not modified.
        date_col : str, default="date"
            Column to cast to ``pl.Date`` and derive the features from.

        Returns
        -------
        pl.DataFrame
            A copy of ``df`` with ``date_col`` cast to ``pl.Date`` (when the cast
            succeeds). If ``"date"`` is listed in the configured
            ``date_features``, the columns ``day``, ``month``, ``year``,
            ``day_of_week``, ``quarter``, ``week_of_year``, ``is_weekend`` and
            ``is_holiday`` are added as well.

        Note
        ----
        Polars by default uses a non-zero-based index for date components
        (weekday is 1=Monday .. 7=Sunday). ``day_of_week`` and ``is_weekend`` are
        derived from the zero-based value (Monday=0 .. Sunday=6).
        """
        date_features: list[str] = self.feature_config.date_features
        df = df.clone()

        # Best-effort cast: a failure is reported and the frame is left as-is.
        try:
            df = df.with_columns(pl.col(date_col).cast(pl.Date))
        except Exception as e:
            print(f"Error converting {date_col} to datetime: {e}")

        if "date" not in date_features:
            return df

        date = pl.col(date_col)
        weekday_zero_based = date.dt.weekday() - 1

        # Each output name may appear only once per with_columns call; the
        # earlier second "day_of_week" alias (1-based) made this call fail.
        return df.with_columns(
            date.dt.day().alias("day"),
            date.dt.month().alias("month"),
            date.dt.year().alias("year"),
            weekday_zero_based.alias("day_of_week"),
            date.dt.quarter().alias("quarter"),
            date.dt.week().alias("week_of_year"),
            (weekday_zero_based >= 5).alias("is_weekend").cast(pl.Int8),
            # NOTE(review): `us_holidays` is a notebook-level global created in
            # an earlier cell; this method fails with NameError if that cell has
            # not been run.
            date.map_elements(lambda x: x in us_holidays, return_dtype=pl.Boolean)
            .alias("is_holiday")
            .cast(pl.Int8),
        )

    def create_lag_features(self, df: pl.DataFrame, target_col: str, group_cols: list[str] | None = None) -> pl.DataFrame:
        """Add one ``{target_col}_lag_{k}`` column per configured lag ``k``.

        Parameters
        ----------
        df : pl.DataFrame
            Input frame, expected to be sorted already; rows are shifted in
            their current order.
        target_col : str
            Column to shift.
        group_cols : list[str] | None
            When given, the shift is applied within each group
            (``.over(group_cols)``); otherwise over the whole frame.

        Returns
        -------
        pl.DataFrame
            Copy of ``df`` with the lag columns added.
        """
        df = df.clone()
        lag_values = self.feature_config.lag_features

        # Keyed by output name so a repeated lag value just overwrites itself.
        lag_exprs: dict[str, pl.Expr] = {}
        for lag in lag_values:
            shifted = pl.col(target_col).shift(lag)
            if group_cols:
                shifted = shifted.over(group_cols)
            lag_exprs[f"{target_col}_lag_{lag}"] = shifted

        if lag_exprs:
            df = df.with_columns([expr.alias(name) for name, expr in lag_exprs.items()])

        print(f"Created {len(lag_values)} lag features")
        return df

    def create_rolling_features(
        self, df: pl.DataFrame, target_col: str, group_cols: list[str] | None = None
    ) -> pl.DataFrame:
        """Add ``{target_col}_rolling_{window}_{func}`` columns.

        Windows and function names come from the configured
        ``rolling_features`` mapping (keys ``"windows"`` and ``"functions"``).
        Supported functions are ``mean``, ``std``, ``min`` and ``max``; any other
        name is skipped. Every window uses ``min_samples=1`` and includes the
        current row.

        Parameters
        ----------
        df : pl.DataFrame
            Input frame, expected to be sorted already.
        target_col : str
            Column the rolling statistics are computed on.
        group_cols : list[str] | None
            When given, statistics are computed within each group.

        Returns
        -------
        pl.DataFrame
            Copy of ``df`` with the rolling columns added.
        """
        df = df.clone()
        windows = self.feature_config.rolling_features["windows"]
        functions = self.feature_config.rolling_features["functions"]

        rolling_exprs: dict[str, pl.Expr] = {}
        for window in windows:
            for func in functions:
                method_name = self._ROLLING_METHODS.get(func)
                if method_name is None:
                    continue
                stat = getattr(pl.col(target_col), method_name)(window, min_samples=1)
                if group_cols:
                    stat = stat.over(group_cols)
                rolling_exprs[f"{target_col}_rolling_{window}_{func}"] = stat

        if not rolling_exprs:
            return df
        return df.with_columns([expr.alias(name) for name, expr in rolling_exprs.items()])

    def create_interaction_features(self, df: pl.DataFrame, categorical_cols: list[str]) -> pl.DataFrame:
        """Add a string column ``{a}_{b}_interaction`` for every pair of columns.

        Each pair ``(a, b)`` with ``a`` listed before ``b`` in
        ``categorical_cols`` yields the two values cast to string and joined
        with ``"_"``.
        """
        df = df.clone()

        for i, col1 in enumerate(categorical_cols):
            for col2 in categorical_cols[i + 1 :]:
                df = df.with_columns(
                    (pl.col(col1).cast(pl.Utf8) + "_" + pl.col(col2).cast(pl.Utf8)).alias(f"{col1}_{col2}_interaction")
                )

        return df

    def create_cyclical_features(self, df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
        """Add sine/cosine encodings of month, day of month and day of week.

        Adds ``month_sin``/``month_cos`` (period 12), ``day_sin``/``day_cos``
        (period 31) and ``day_of_week_sin``/``day_of_week_cos`` (period 7).

        Parameters
        ----------
        df : pl.DataFrame
            Input frame; ``date_col`` must already have a temporal dtype.
        date_col : str, default="date"
            Column the components are extracted from.

        Returns
        -------
        pl.DataFrame
            Copy of ``df`` with the six Float64 columns added.
        """
        df = df.clone()

        # Native expressions instead of per-row Python lambdas; the arithmetic
        # order matches 2 * pi * value / period.
        two_pi = 2 * np.pi
        date = pl.col(date_col)
        # month (convert 1-12 to 0-11 for proper cyclical encoding)
        month_angle = two_pi * (date.dt.month().cast(pl.Float64) - 1) / 12
        # day (Retain original values; 1-31)
        day_angle = two_pi * date.dt.day().cast(pl.Float64) / 31
        # day of week (convert 1-7 to 0-6 for proper cyclical encoding)
        weekday_angle = two_pi * (date.dt.weekday().cast(pl.Float64) - 1) / 7

        return df.with_columns(
            month_angle.sin().alias("month_sin"),
            month_angle.cos().alias("month_cos"),
            day_angle.sin().alias("day_sin"),
            day_angle.cos().alias("day_cos"),
            weekday_angle.sin().alias("day_of_week_sin"),
            weekday_angle.cos().alias("day_of_week_cos"),
        )

    def handle_missing_values(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fill nulls in numeric columns.

        Columns whose name contains ``"lag"`` or ``"rolling"`` are forward-filled
        and then backward-filled; every other numeric column is filled with its
        mean. A column that is entirely null stays null under either strategy.
        """
        for col in df.select(cs.numeric()).columns:
            if df[col].null_count() == 0:
                continue
            if "lag" in col or "rolling" in col:
                # Forward fill then backward fill
                filled = pl.col(col).fill_null(strategy="forward").fill_null(strategy="backward")
            else:
                # For other columns, just use average
                filled = pl.col(col).fill_null(strategy="mean")
            df = df.with_columns(filled)
        return df

    def create_all_features(
        self,
        df: pl.DataFrame,
        target_col: str = "sales",
        date_col: str = "date",
        group_cols: list[str] | None = None,
        categorical_cols: list[str] | None = None,
    ) -> pl.DataFrame:
        """Run the full pipeline: sort, date, lag, rolling, cyclical, interactions, null fill.

        Parameters
        ----------
        df : pl.DataFrame
            Input frame.
        target_col : str, default="sales"
            Column lag and rolling features are derived from.
        date_col : str, default="date"
            Date column used for sorting and date-based features.
        group_cols : list[str] | None
            Sort keys placed before ``date_col`` and grouping keys for lag and
            rolling features. Without them, lag/rolling features run over the
            whole frame in date order.
        categorical_cols : list[str] | None
            Columns to build pairwise interaction features from; skipped when
            empty or ``None``.

        Returns
        -------
        pl.DataFrame
            The sorted frame with all features added.
        """
        print("Starting feature engineering pipeline")

        sort_keys = [*group_cols, date_col] if group_cols else [date_col]
        df = df.sort(by=sort_keys)

        # Create date features
        df = self.create_date_features(df, date_col=date_col)

        # Create lag features
        df = self.create_lag_features(df, target_col=target_col, group_cols=group_cols)

        # Create rolling features
        df = self.create_rolling_features(df, target_col=target_col, group_cols=group_cols)

        # Create cyclical features
        df = self.create_cyclical_features(df, date_col=date_col)

        # Create interaction features
        if categorical_cols:
            df = self.create_interaction_features(df, categorical_cols=categorical_cols)

        # Handle missing values
        df = self.handle_missing_values(df)

        print(f"Feature engineering pipeline completed. {len(df.columns)!r} total features.")

        return df

    def select_features(
        self,
        df: pl.DataFrame,
        target_col: str,
        importance_threshold: float = 0.001,
        date_col: str = "date",
    ) -> list[str]:
        """Rank features with a random forest and keep those above a threshold.

        Parameters
        ----------
        df : pl.DataFrame
            Frame containing the features, ``date_col`` and ``target_col``.
        target_col : str
            Regression target.
        importance_threshold : float, default=0.001
            Features with importance strictly greater than this are kept.
        date_col : str, default="date"
            Date column excluded from the feature matrix.

        Returns
        -------
        list[str]
            Selected feature names, ordered by descending importance.
        """
        X = df.drop([date_col, target_col])
        y = df[target_col]

        # Encode string columns as integer codes. Int32 rather than Int8 so
        # columns with more than 127 distinct values do not overflow.
        for col in X.select(cs.string()).columns:
            codes = LabelEncoder().fit_transform(X[col])
            X = X.with_columns(pl.Series(col, values=codes, dtype=pl.Int32))

        # Train the model
        rf = RandomForestRegressor(n_estimators=50, random_state=42)
        rf.fit(X, y)

        # Get feature importances
        importances = pl.DataFrame({"feature": X.columns, "importance": rf.feature_importances_}).sort(
            "importance", descending=True
        )
        # Select features based on importance
        selected_features: list[str] = importances.filter(pl.col("importance") > importance_threshold)["feature"].to_list()
        print(f"Selected features: {len(selected_features)} out of {len(X.columns)}")

        return selected_features

    def create_target_encoding(
        self,
        df: pl.DataFrame,
        target_col: str,
        categorical_cols: list[str],
        smoothing: float = 1.0,
    ) -> pl.DataFrame:
        """Add a smoothed target-mean encoding for each categorical column.

        For a category with ``n`` rows and target mean ``m`` the encoded value is
        ``(n * m + smoothing * g) / (n + smoothing)``, where ``g`` is the mean of
        ``target_col`` over the whole frame. The result is stored in
        ``{col}_target_encoded``.

        Parameters
        ----------
        df : pl.DataFrame
            Input frame; it is not modified.
        target_col : str
            Numeric target column.
        categorical_cols : list[str]
            Columns to encode.
        smoothing : float, default=1.0
            Weight given to the global mean.

        Returns
        -------
        pl.DataFrame
            Copy of ``df`` with one encoded column per categorical column.
        """
        df = df.clone()

        # NOTE(review): statistics are computed on the frame passed in, so each
        # row's own target contributes to its encoding — confirm this is only
        # fitted on training data.
        target = pl.col(target_col)
        global_mean = target.mean()

        encodings: dict[str, pl.Expr] = {}
        for col_name in categorical_cols:
            # Window expressions give per-category count and mean directly,
            # replacing the per-category filter loop and Python-level lookup.
            n = pl.len().over(col_name).cast(pl.Float64)
            category_mean = target.mean().over(col_name)
            encodings[f"{col_name}_target_encoded"] = (n * category_mean + smoothing * global_mean) / (n + smoothing)

        if encodings:
            df = df.with_columns([expr.alias(name) for name, expr in encodings.items()])
        print(f"Created target encoding for {len(categorical_cols)!r} categorical features")

        return df

In [16]:
df: pl.DataFrame = pl.DataFrame(
    data={
        "id": [1, 2, 3, 4],
        "name": ["Alice", "Bob", "Charlie", "Bob"],
        "role": ["Engineer", "Manager", "Engineer", "Manager"],
        "skill": ["Python", "Leadership", "Python", "Management"],
        "experience": [5, 2, 3, 3],
        "age": [30, 40, 35, 34],
        "target": [1, 0, 1, 1],
    }
)

df

id,name,role,skill,experience,age,target
i64,str,str,str,i64,i64,i64
1,"""Alice""","""Engineer""","""Python""",5,30,1
2,"""Bob""","""Manager""","""Leadership""",2,40,0
3,"""Charlie""","""Engineer""","""Python""",3,35,1
4,"""Bob""","""Manager""","""Management""",3,34,1


In [17]:
counts = df["name"].value_counts()
mean_target = df.group_by("name").agg(pl.col("target").mean())
display(mean_target)
display(counts["name"])
for row in counts["name"]:
    print(counts.filter(pl.col("name").eq(row))["count"].item())

counts.filter(pl.col("name").eq("Alice"))["count"].item()

name,target
str,f64
"""Charlie""",1.0
"""Alice""",1.0
"""Bob""",0.5


name
str
"""Bob"""
"""Alice"""
"""Charlie"""


2
1
1


1

In [None]:
temp_df

In [18]:
# Run the full feature pipeline on the sample frame, then add target encodings.
data_gen = FeatureEngineer()
# Fix this!
# NOTE(review): the marker above is unexplained. One candidate: no group_cols is
# passed, so lag/rolling features are computed over the whole frame in date
# order rather than per store/product — confirm the intended grouping.
temp_all_df = data_gen.create_all_features(temp_df, date_col="date", target_col="revenue")
# The encoded frame is the cell's displayed result; it is not assigned to a name.
data_gen.create_target_encoding(
    temp_all_df,
    target_col="revenue",
    categorical_cols=["store_id", "category"],
    smoothing=1.0,
)

Starting feature engineering pipeline
Created 7 lag features
Feature engineering pipeline completed. 51 total features.
Created target encoding for 2 categorical features


date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,day,month,year,day_of_week,quarter,week_of_year,is_weekend,is_holiday,revenue_lag_1,revenue_lag_2,revenue_lag_3,revenue_lag_7,revenue_lag_14,revenue_lag_21,revenue_lag_30,revenue_rolling_3_mean,revenue_rolling_3_std,revenue_rolling_3_min,revenue_rolling_3_max,revenue_rolling_7_mean,revenue_rolling_7_std,revenue_rolling_7_min,revenue_rolling_7_max,revenue_rolling_14_mean,revenue_rolling_14_std,revenue_rolling_14_min,revenue_rolling_14_max,revenue_rolling_21_mean,revenue_rolling_21_std,revenue_rolling_21_min,revenue_rolling_21_max,revenue_rolling_30_mean,revenue_rolling_30_std,revenue_rolling_30_min,revenue_rolling_30_max,month_sin,month_cos,day_sin,day_cos,day_of_week_sin,day_of_week_cos,store_id_target_encoded,category_target_encoded
date,str,str,str,i64,i64,f64,f64,f64,f64,i8,i8,i32,i8,i8,i8,i8,i8,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2025-08-30,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6,30,8,2025,5,3,35,1,0,39.0,39.0,39.0,39.0,39.0,,,39.0,35.355339,39.0,39.0,39.0,35.355339,39.0,39.0,39.0,35.355339,39.0,39.0,39.0,35.355339,39.0,39.0,39.0,35.355339,39.0,39.0,-0.5,-0.866025,-0.201299,0.97953,-0.974928,-0.222521,51.353086,49.422778
2025-08-30,"""store_001""","""SPRT_005""","""Sports""",1,89,0.0,89.0,57.85,31.15,30,8,2025,5,3,35,1,0,39.0,39.0,39.0,39.0,39.0,,,64.0,35.355339,39.0,89.0,64.0,35.355339,39.0,89.0,64.0,35.355339,39.0,89.0,64.0,35.355339,39.0,89.0,64.0,35.355339,39.0,89.0,-0.5,-0.866025,-0.201299,0.97953,-0.974928,-0.222521,51.353086,53.235556
2025-08-30,"""store_002""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5,30,8,2025,5,3,35,1,0,89.0,39.0,39.0,39.0,39.0,,,52.333333,32.145503,29.0,89.0,52.333333,32.145503,29.0,89.0,52.333333,32.145503,29.0,89.0,52.333333,32.145503,29.0,89.0,52.333333,32.145503,29.0,89.0,-0.5,-0.866025,-0.201299,0.97953,-0.974928,-0.222521,52.918254,48.887963
2025-08-31,"""store_001""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5,31,8,2025,6,3,35,1,0,29.0,89.0,39.0,39.0,39.0,,,49.0,34.641016,29.0,89.0,46.5,28.722813,29.0,89.0,46.5,28.722813,29.0,89.0,46.5,28.722813,29.0,89.0,46.5,28.722813,29.0,89.0,-0.5,-0.866025,-2.4493e-16,1.0,-0.781831,0.62349,51.353086,48.887963
2025-08-31,"""store_001""","""CLTH_004""","""Clothing""",1,89,0.0,89.0,46.28,42.72,31,8,2025,6,3,35,1,0,29.0,29.0,89.0,39.0,39.0,,,49.0,34.641016,29.0,89.0,55.0,31.304952,29.0,89.0,55.0,31.304952,29.0,89.0,55.0,31.304952,29.0,89.0,55.0,31.304952,29.0,89.0,-0.5,-0.866025,-2.4493e-16,1.0,-0.781831,0.62349,51.353086,48.887963
2025-08-31,"""store_001""","""HOME_002""","""Home""",1,49,0.0,49.0,31.85,17.15,31,8,2025,6,3,35,1,0,89.0,29.0,29.0,39.0,39.0,,,55.666667,30.550505,29.0,89.0,54.0,28.106939,29.0,89.0,54.0,28.106939,29.0,89.0,54.0,28.106939,29.0,89.0,54.0,28.106939,29.0,89.0,-0.5,-0.866025,-2.4493e-16,1.0,-0.781831,0.62349,51.353086,49.422778
2025-08-31,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6,31,8,2025,6,3,35,1,0,49.0,89.0,29.0,39.0,39.0,,,59.0,26.457513,39.0,89.0,51.857143,26.276914,29.0,89.0,51.857143,26.276914,29.0,89.0,51.857143,26.276914,29.0,89.0,51.857143,26.276914,29.0,89.0,-0.5,-0.866025,-2.4493e-16,1.0,-0.781831,0.62349,51.353086,49.422778
2025-08-31,"""store_001""","""SPRT_001""","""Sports""",1,29,0.0,29.0,13.05,15.95,31,8,2025,6,3,35,1,0,39.0,49.0,89.0,39.0,39.0,,,39.0,10.0,29.0,49.0,50.428571,27.342623,29.0,89.0,49.0,25.634798,29.0,89.0,49.0,25.634798,29.0,89.0,49.0,25.634798,29.0,89.0,-0.5,-0.866025,-2.4493e-16,1.0,-0.781831,0.62349,51.353086,53.235556
2025-08-31,"""store_001""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05,31,8,2025,6,3,35,1,0,29.0,39.0,49.0,89.0,39.0,,,39.0,10.0,29.0,49.0,44.714286,21.49197,29.0,89.0,49.0,23.979158,29.0,89.0,49.0,23.979158,29.0,89.0,49.0,23.979158,29.0,89.0,-0.5,-0.866025,-2.4493e-16,1.0,-0.781831,0.62349,51.353086,53.235556
2025-09-01,"""store_002""","""CLTH_002""","""Clothing""",1,79,0.15,67.15,43.45,23.7,1,9,2025,0,3,36,0,1,49.0,29.0,39.0,29.0,39.0,,,48.383333,19.082475,29.0,67.15,50.164286,21.679228,29.0,89.0,50.815,23.324951,29.0,89.0,50.815,23.324951,29.0,89.0,50.815,23.324951,29.0,89.0,-0.866025,-0.5,0.201299,0.97953,0.0,1.0,52.918254,48.887963


In [24]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

X = df.select(["experience", "age"])
scaled_features = scaler.fit_transform(X)
display(scaled_features)

scaled_df = pl.DataFrame(scaled_features, schema=X.columns)
scaled_df

array([[ 1.6059, -1.3335],
       [-1.1471,  1.4739],
       [-0.2294,  0.0702],
       [-0.2294, -0.2106]])

experience,age
f64,f64
1.60591,-1.333539
-1.147079,1.473911
-0.229416,0.070186
-0.229416,-0.210559


In [36]:
# Reference only (YAML, not Python): `environment:` section of the MLflow
# service in docker-compose, kept here as a note for the cells below.
#
# environment:
#       - MLFLOW_S3_ENDPOINT_URL=http://minio:9000
#       - POSTGRES_HOST=mlflow-db
#       - POSTGRES_USER=mlflow
#       - POSTGRES_PASSWORD=mlflow
#       - POSTGRES_DB=mlflow
#       - POSTGRES_PORT=5432
#       - AWS_S3_BUCKET=mlflow-artifacts
#       - MLFLOW_HOST=0.0.0.0
#       - MLFLOW_PORT=5001
#       - MLFLOW_DB_URI=postgresql://mlflow:mlflow@mlflow-db:5432/mlflow
#       - MLFLOW_ARTIFACT_ROOT=s3://mlflow-artifacts

age
f64
1.473911


In [None]:
import os

In [50]:
# Force localhost configuration and debug
# Imported here so the cell runs on a fresh kernel without relying on a
# separate import cell having been executed first.
import os

RUNNING_IN_DOCKER = False
DEFAULT_MINIO_HOST = "localhost"  # Force localhost
DEFAULT_MINIO_PORT = "9000"
MINIO_ENDPOINT = f"http://{DEFAULT_MINIO_HOST}:{DEFAULT_MINIO_PORT}"
MLFLOW_URI = "http://localhost:5001"  # This connects to the MLflow server with PostgreSQL backend

# MinIO credentials: taken from MinIO's own environment variables when they are
# set, otherwise the local-development defaults. Real AWS credentials that may
# be exported in the shell are deliberately not reused for the local MinIO.
AWS_KEY = os.environ.get("MINIO_ROOT_USER", "minioadmin")
AWS_SECRET = os.environ.get("MINIO_ROOT_PASSWORD", "minioadmin")
AWS_REGION = "us-east-1"
BUCKET = "mlflow-artifacts"

# Set environment variables (read by boto3 and by MLflow's S3 artifact client)
os.environ["AWS_ACCESS_KEY_ID"] = AWS_KEY
os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET
os.environ["AWS_DEFAULT_REGION"] = AWS_REGION
os.environ["MLFLOW_S3_ENDPOINT_URL"] = MINIO_ENDPOINT

# The secret key is intentionally never printed.
print("=== CONFIGURATION DEBUG ===")
print(f"RUNNING_IN_DOCKER: {RUNNING_IN_DOCKER}")
print(f"DEFAULT_MINIO_HOST: {DEFAULT_MINIO_HOST}")
print(f"MINIO_ENDPOINT: {MINIO_ENDPOINT}")
print(f"MLFLOW_URI: {MLFLOW_URI}")
print(f"AWS_ACCESS_KEY_ID: {AWS_KEY}")
print(f"BUCKET: {BUCKET}")
print(
    f"Environment MLFLOW_S3_ENDPOINT_URL: {os.environ.get('MLFLOW_S3_ENDPOINT_URL', 'NOT SET')}"
)
print("=== END DEBUG ===\n")

=== CONFIGURATION DEBUG ===
RUNNING_IN_DOCKER: False
DEFAULT_MINIO_HOST: localhost
MINIO_ENDPOINT: http://localhost:9000
MLFLOW_URI: http://localhost:5001
AWS_ACCESS_KEY_ID: minioadmin
BUCKET: mlflow-artifacts
Environment MLFLOW_S3_ENDPOINT_URL: http://localhost:9000
=== END DEBUG ===



In [51]:
# Test MLflow server connection and S3 storage
import boto3
from botocore.exceptions import ClientError
import mlflow
import mlflow.sklearn
from sklearn.linear_model import ElasticNet
from sklearn import datasets
import tempfile
import traceback

# 1) Test S3/MinIO connection
from botocore.exceptions import BotoCoreError

print("Testing S3/MinIO connection...")
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=AWS_KEY,
    aws_secret_access_key=AWS_SECRET,
    region_name=AWS_REGION,
)

try:
    s3.head_bucket(Bucket=BUCKET)
    print(f"✅ Bucket '{BUCKET}' is reachable")
except (ClientError, BotoCoreError) as e:
    # ClientError: the endpoint answered with an error (missing bucket, rejected
    # credentials). BotoCoreError: the request never completed, e.g. the
    # endpoint is unreachable. Both are reported instead of aborting the cell.
    print(f"❌ S3/MinIO connection failed: {e}")

# 2) Test MLflow server connection
print(f"\nTesting MLflow server connection to {MLFLOW_URI}...")
# Setting the URI does not contact the server; step 3 makes the first request.
mlflow.set_tracking_uri(MLFLOW_URI)
print(f"✅ MLflow tracking URI set to: {mlflow.get_tracking_uri()}")

# 3) Test that the MLflow tracking server answers API calls
try:
    # A successful call shows the client is talking to the remote server rather
    # than a local file store. Which database backs the server is decided by
    # the server's own configuration and cannot be verified from here.
    experiments = mlflow.search_experiments()
    print(f"✅ Connected to MLflow server. Found {len(experiments)} experiments.")
    print("✅ Tracking calls go through the server; its backend store (PostgreSQL) is configured server-side.")
except Exception as e:
    print(f"❌ Failed to connect to MLflow server: {e}")

print("\n" + "=" * 50)
print("IMPORTANT: If MLflow server is using PostgreSQL correctly,")
print("experiments and runs will be stored in the database,")
print("and artifacts will be stored in MinIO/S3.")
print("Local 'mlruns' folders should NOT be created.")
print("=" * 50)

Testing S3/MinIO connection...
✅ Bucket 'mlflow-artifacts' is reachable

Testing MLflow server connection to http://localhost:5001...
✅ MLflow tracking URI set to: http://localhost:5001
✅ Connected to MLflow server. Found 1 experiments.
✅ This confirms MLflow is using the PostgreSQL backend, not local files.

IMPORTANT: If MLflow server is using PostgreSQL correctly,
experiments and runs will be stored in the database,
and artifacts will be stored in MinIO/S3.
Local 'mlruns' folders should NOT be created.


In [52]:
import os
import tempfile
import traceback

import boto3
from botocore.exceptions import ClientError

import mlflow
import mlflow.sklearn
from sklearn.linear_model import ElasticNet
from sklearn import datasets

# Use the configuration from the previous cells
print("Using configuration from previous cells...")
print(f"MINIO_ENDPOINT: {MINIO_ENDPOINT}")
print(f"MLFLOW_URI: {MLFLOW_URI}")
print(f"BUCKET: {BUCKET}")

# 1) Quick check: can we talk to MinIO with these credentials?
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=AWS_KEY,
    aws_secret_access_key=AWS_SECRET,
    region_name=AWS_REGION,
)

try:
    # head_bucket will confirm bucket exists and credentials are valid for it
    s3.head_bucket(Bucket=BUCKET)
    print(f"✅ Bucket '{BUCKET}' reachable — credentials & endpoint look OK.")
except ClientError as e:
    code = getattr(e, "response", {}).get("Error", {}).get("Code", "")
    print("❌ S3/MinIO access test failed:", code)
    print("ClientError details:", str(e))
    raise

# 2) Point MLflow client at the tracking server (which uses PostgreSQL backend)
mlflow.set_tracking_uri(MLFLOW_URI)
print("✅ Set MLflow tracking URI ->", mlflow.get_tracking_uri())
print("✅ MLflow will use PostgreSQL backend via the server, NOT local files")

# 3) Run a quick experiment and log an artifact + model
try:
    mlflow.set_experiment("notebook_quick_test")
    X, y = datasets.load_diabetes(return_X_y=True)
    model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
    model.fit(X, y)

    with mlflow.start_run() as run:
        mlflow.log_param("alpha", 0.1)
        mlflow.log_param("l1_ratio", 0.5)
        # Score on the training data itself: only a placeholder metric.
        mlflow.log_metric("dummy_score", model.score(X, y))

        # Create a small artifact file and upload it. The temporary directory
        # (and the file inside it) is removed once the upload has finished, so
        # repeated runs no longer leave stray files in the system temp folder.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = os.path.join(tmp_dir, "artifact_test.txt")
            with open(tmp_path, "w", encoding="utf-8") as tmp:
                tmp.write("mlflow artifact test")
            mlflow.log_artifact(tmp_path, artifact_path="test_artifacts")

        mlflow.sklearn.log_model(model, "model")

        print("✅ Logged run id:", run.info.run_id)
        print("✅ Experiment id:", run.info.experiment_id)

    print("✅ MLflow logging complete — check the UI and MinIO for artifact/model.")
    print("✅ Data stored in PostgreSQL database, artifacts in MinIO S3")
except ClientError as e:
    # boto3 ClientError can surface during artifact upload
    print("❌ Boto3 ClientError during MLflow operations:", e)
    print(traceback.format_exc())
    raise
except Exception:
    print("❌ Unexpected error during MLflow logging:")
    print(traceback.format_exc())
    raise

Using configuration from previous cells...
MINIO_ENDPOINT: http://localhost:9000
MLFLOW_URI: http://localhost:5001
BUCKET: mlflow-artifacts
✅ Bucket 'mlflow-artifacts' reachable — credentials & endpoint look OK.
✅ Set MLflow tracking URI -> http://localhost:5001
✅ MLflow will use PostgreSQL backend via the server, NOT local files




✅ Logged run id: 29a9a2cc39f04e90a2937865e6cdcfe6
✅ Experiment id: 2
🏃 View run unruly-hare-268 at: http://localhost:5001/#/experiments/2/runs/29a9a2cc39f04e90a2937865e6cdcfe6
🧪 View experiment at: http://localhost:5001/#/experiments/2
✅ MLflow logging complete — check the UI and MinIO for artifact/model.
✅ Data stored in PostgreSQL database, artifacts in MinIO S3


In [None]:
def create_cyclical_features(df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
    """Append sine/cosine encodings of the month, day of month and weekday.

    Each calendar component is mapped to an angle ``2π * (value - offset) / period``
    and stored as a ``*_sin`` / ``*_cos`` column pair.

    Params:
    -------
    df: pl.DataFrame
        Input frame containing ``date_col``.
    date_col: str, default="date"
        Name of the temporal column the components are extracted from.

    Returns:
    --------
    pl.DataFrame
        A new frame with ``month_sin``, ``month_cos``, ``day_sin``, ``day_cos``,
        ``day_of_week_sin`` and ``day_of_week_cos`` added. ``with_columns`` returns
        a new frame, so no defensive copy of the input is needed.
    """
    two_pi = 2 * np.pi

    def _sin_cos(component: pl.Expr, offset: int, period: int, prefix: str):
        # Native Polars expressions instead of per-row Python lambdas via
        # map_elements: same formula, evaluated inside the Polars engine.
        angle = two_pi * (component.cast(pl.Float64) - offset) / period
        return angle.sin().alias(f"{prefix}_sin"), angle.cos().alias(f"{prefix}_cos")

    date = pl.col(date_col)

    return df.with_columns(
        # month (convert 1-12 to 0-11 for proper cyclical encoding)
        *_sin_cos(date.dt.month(), 1, 12, "month"),
        # day (Retain original values; 1-31)
        *_sin_cos(date.dt.day(), 0, 31, "day"),
        # day of week (convert 1-7 to 0-6 for proper cyclical encoding)
        *_sin_cos(date.dt.weekday(), 1, 7, "day_of_week"),
    )


# Quick check: as the last expression of the cell, the returned frame is rendered as the cell output.
# NOTE(review): temp_df is not defined in this cell — presumably created in an earlier cell; confirm.
create_cyclical_features(temp_df, date_col="date")

In [None]:
# Inspect the engineered columns to sanity-check the cyclical encoding.
# NOTE(review): the "Issue Analysis" text printed at the end describes the formula
# sin(2π × day_of_week / 7), whereas create_cyclical_features as defined in this
# notebook subtracts 1 from the weekday first — the message looks stale; confirm.
cyclical_result = create_cyclical_features(temp_df, date_col="date")

# Distinct rows of the date, its weekday and the weekday/month encodings
weekday_columns = ["date", "day_of_week", "day_of_week_sin", "day_of_week_cos"]
cyclical_sample = cyclical_result.select([*weekday_columns, "month_sin", "month_cos"]).unique()

print("Cyclical features sample:")
print(cyclical_sample)

# Same idea restricted to the weekday columns, ordered Monday → Sunday
print("\nLet's check the day_of_week values and corresponding sin/cos:")
day_check = cyclical_result.select(weekday_columns).unique().sort("day_of_week")
print(day_check)

# One print call, one message per line (output is identical to separate prints)
print(
    "\nIssue Analysis:",
    "day_of_week ranges from 1-7 in Polars (Monday=1, Sunday=7)",
    "But for cyclical encoding, we want values from 0 to 2π",
    "Current formula: sin(2π × day_of_week / 7)",
    "This means day 7 gives: sin(2π × 7 / 7) = sin(2π) = 0",
    "And day 1 gives: sin(2π × 1 / 7) = sin(2π/7)",
    "This creates a discontinuity between Sunday (7) and Monday (1)!",
    sep="\n",
)

In [None]:
# Banner describing the weekday-indexing issue; the trailing empty string
# yields the blank line that separates the banner from what follows.
print(
    "PROBLEM IDENTIFIED:",
    "Polars weekday() returns 1-7 (Monday=1, Sunday=7)",
    "For proper cyclical encoding, we need 0-based indexing",
    "",
    sep="\n",
)


def create_cyclical_features_corrected(df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
    """Append zero-based sine/cosine encodings of month, day of month and weekday.

    Every component is shifted to start at 0 before being mapped to the angle
    ``2π * (value - 1) / period``, so the first value of each cycle
    (January, day 1, Monday) lands on ``sin = 0, cos = 1``.

    Params:
    -------
    df: pl.DataFrame
        Input frame containing ``date_col``.
    date_col: str, default="date"
        Name of the temporal column the components are extracted from.

    Returns:
    --------
    pl.DataFrame
        A new frame with ``month_sin``, ``month_cos``, ``day_sin``, ``day_cos``,
        ``day_of_week_sin`` and ``day_of_week_cos`` added. ``with_columns`` returns
        a new frame, so no defensive copy of the input is needed.
    """
    two_pi = 2 * np.pi

    def _sin_cos(component: pl.Expr, period: int, prefix: str):
        # Native Polars expressions instead of per-row Python lambdas via
        # map_elements: same formula, evaluated inside the Polars engine.
        angle = two_pi * (component.cast(pl.Float64) - 1) / period
        return angle.sin().alias(f"{prefix}_sin"), angle.cos().alias(f"{prefix}_cos")

    date = pl.col(date_col)

    return df.with_columns(
        # Month cyclical features (months 1-12, convert to 0-11)
        *_sin_cos(date.dt.month(), 12, "month"),
        # Day cyclical features (days 1-31, convert to 0-30)
        *_sin_cos(date.dt.day(), 31, "day"),
        # Day of week cyclical features (weekday 1-7, convert to 0-6)
        *_sin_cos(date.dt.weekday(), 7, "day_of_week"),
    )


# Exercise the corrected encoder on the same frame
print("Testing corrected cyclical features:")
corrected_result = create_cyclical_features_corrected(temp_df, date_col="date")

# One row per distinct (date, weekday, encoding) combination, ordered by weekday
weekday_view = corrected_result.select(["date", "day_of_week", "day_of_week_sin", "day_of_week_cos"])
comparison = weekday_view.unique().sort("day_of_week")

print("\nCorrected day_of_week cyclical encoding:")
print(comparison)

# Summary of what changed, one message per line
for message in (
    "\nKey differences:",
    "- Original: sin(2π × day_of_week / 7) where day_of_week ∈ [1,7]",
    "- Corrected: sin(2π × (day_of_week - 1) / 7) where (day_of_week - 1) ∈ [0,6]",
    "- This ensures Monday(1) → 0, Tuesday(2) → 1, ..., Sunday(7) → 6",
    "- Now Sunday(6) and Monday(0) are properly connected in the cycle!",
):
    print(message)

In [None]:
# Re-run create_cyclical_features and tabulate the weekday encoding
print("Testing the corrected create_cyclical_features function:")
corrected_cyclical = create_cyclical_features(temp_df, date_col="date")

# Distinct weekday rows with their sin/cos pair, ordered by weekday
weekday_columns = ["date", "day_of_week", "day_of_week_sin", "day_of_week_cos"]
cyclical_summary = corrected_cyclical.select(weekday_columns).unique().sort("day_of_week")

print("\nCorrected cyclical features by day of week:")
print(cyclical_summary)

# Expected values for the first and last weekday under the (weekday - 1) / 7 encoding
print(
    "\nVerification:",
    "Monday (1) → sin=0.0, cos=1.0 (start of cycle)",
    "Sunday (7) → sin≈-0.78, cos≈0.62 (connects smoothly back to Monday)",
    "The cyclical encoding now properly represents the weekly cycle!",
    sep="\n",
)

In [None]:
# NOTE(review): stray scratch value. It matches float64 sin(2π) (≈ -2.4493e-16, i.e. zero up to
# rounding error) — presumably pasted while checking the day-7 encoding; confirm, then consider
# removing this cell.
-2.4493e-16