In [1]:
import os
import warnings
from typing import Any

import numpy as np
import pandas as pd
import polars as pl

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
console = Console(theme=custom_theme)


def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """This is used to up a number of directories.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current directory.

    Returns:
    --------
    None
    """
    import os
    import sys

    CONST: str = "../"
    NUM: str = CONST * go_up

    # Goto the previous directory
    prev_directory = os.path.join(os.path.dirname(__name__), NUM)
    # Get the 'absolute path' of the previous directory
    abs_path_prev_directory = os.path.abspath(prev_directory)

    # Add the path to the System paths
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)

In [3]:
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/End-to-end-Sale-Forecasting


In [4]:
import httpx

url: str = "https://jsonplaceholder.typicode.com/posts"

response = httpx.get(url, timeout=10)
response.raise_for_status()  # Raise an error for bad responses
console.print(response.json()[:3], style="info")

In [None]:
from src.utilities.data_gen import RealisticSalesDataGenerator

gen_data = RealisticSalesDataGenerator(start_date="2025-08-29", end_date="2025-09-04", seed=123)
output_dict: dict[str, Any] = gen_data.generate_sales_data(output_dir="../data/sales_data")
output_dict

2025-09-03 10:29:47 - data_generation - [INFO] - Generating data for 2025-08-29
2025-09-03 10:29:47 - data_generation - [INFO] - Generating data for 2025-08-30
2025-09-03 10:29:47 - data_generation - [INFO] - Generating data for 2025-08-31
2025-09-03 10:29:47 - data_generation - [INFO] - Generating data for 2025-09-01
2025-09-03 10:29:47 - data_generation - [INFO] - Generating data for 2025-09-02
2025-09-03 10:29:47 - data_generation - [INFO] - Generating data for 2025-09-03
2025-09-03 10:29:47 - data_generation - [INFO] - Generating data for 2025-09-04
2025-09-03 10:29:47 - data_generation - [INFO] - Generated 13 files
2025-09-03 10:29:47 - data_generation - [INFO] - Sales files: 3
2025-09-03 10:29:47 - data_generation - [INFO] - Output directory: ../data/sales_data


{'sales': ['../data/sales_data/sales/year=2025/month=08/day=30/sales_2025-08-30.parquet',
  '../data/sales_data/sales/year=2025/month=08/day=31/sales_2025-08-31.parquet',
  '../data/sales_data/sales/year=2025/month=09/day=02/sales_2025-09-02.parquet'],
 'inventory': ['../data/sales_data/inventory/year=2025/week=35/inventory_2025-08-31.parquet'],
 'customer_traffic': ['../data/sales_data/customer_traffic/year=2025/month=08/day=29/traffic_2025-08-29.parquet',
  '../data/sales_data/customer_traffic/year=2025/month=08/day=30/traffic_2025-08-30.parquet',
  '../data/sales_data/customer_traffic/year=2025/month=08/day=31/traffic_2025-08-31.parquet',
  '../data/sales_data/customer_traffic/year=2025/month=09/day=01/traffic_2025-09-01.parquet',
  '../data/sales_data/customer_traffic/year=2025/month=09/day=02/traffic_2025-09-02.parquet',
  '../data/sales_data/customer_traffic/year=2025/month=09/day=03/traffic_2025-09-03.parquet',
  '../data/sales_data/customer_traffic/year=2025/month=09/day=04/tra

In [6]:
total_files = sum(len(paths) for paths in output_dict.values())
total_files

13

In [7]:
temp_pandas: pd.DataFrame = pd.read_parquet(output_dict["sales"][:3])
temp_pandas

Unnamed: 0,date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,year,month,day
0,2025-08-30,store_001,HOME_005,Home,1,39,0.0,39.0,23.4,15.6,2025,8,30
1,2025-08-30,store_001,SPRT_005,Sports,1,89,0.0,89.0,57.85,31.15,2025,8,30
2,2025-08-30,store_002,CLTH_001,Clothing,1,29,0.0,29.0,14.5,14.5,2025,8,30
3,2025-08-31,store_001,CLTH_001,Clothing,1,29,0.0,29.0,14.5,14.5,2025,8,31
4,2025-08-31,store_001,CLTH_004,Clothing,1,89,0.0,89.0,46.28,42.72,2025,8,31
5,2025-08-31,store_001,HOME_002,Home,1,49,0.0,49.0,31.85,17.15,2025,8,31
6,2025-08-31,store_001,HOME_005,Home,1,39,0.0,39.0,23.4,15.6,2025,8,31
7,2025-08-31,store_001,SPRT_001,Sports,1,29,0.0,29.0,13.05,15.95,2025,8,31
8,2025-08-31,store_001,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2025,8,31
9,2025-08-31,store_009,CLTH_001,Clothing,1,29,0.0,29.0,14.5,14.5,2025,8,31


In [8]:
temp_df: pl.DataFrame = pl.read_parquet(output_dict["sales"][:3])
temp_df = temp_df.with_columns(pl.col("date").cast(pl.Date).alias("date"))
temp_df

date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit
date,str,str,str,i64,i64,f64,f64,f64,f64
2025-08-30,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6
2025-08-30,"""store_001""","""SPRT_005""","""Sports""",1,89,0.0,89.0,57.85,31.15
2025-08-30,"""store_002""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5
2025-08-31,"""store_001""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5
2025-08-31,"""store_001""","""CLTH_004""","""Clothing""",1,89,0.0,89.0,46.28,42.72
2025-08-31,"""store_001""","""HOME_002""","""Home""",1,49,0.0,49.0,31.85,17.15
2025-08-31,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6
2025-08-31,"""store_001""","""SPRT_001""","""Sports""",1,29,0.0,29.0,13.05,15.95
2025-08-31,"""store_001""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05
2025-08-31,"""store_009""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5


In [None]:
import holidays

us_holidays = holidays.UnitedStates()

temp_df = temp_df.with_columns(
    pl.col("date").dt.day().alias("day"),
    pl.col("date").dt.month().alias("month"),
    pl.col("date").dt.year().alias("year"),
    (pl.col("date").dt.weekday() - 1).alias("day_of_week"),
    pl.col("date").dt.quarter().alias("quarter"),
    pl.col("date").dt.week().alias("week_of_year"),
    ((pl.col("date").dt.weekday() - 1) >= 5).alias("is_weekend").cast(pl.Int8),
    pl.col("date").map_elements(lambda x: x in us_holidays).alias("is_holiday").cast(pl.Int8),
)
temp_df

date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,day,month,year,day_of_week,quarter,week_of_year,is_weekend,is_holiday
date,str,str,str,i64,i64,f64,f64,f64,f64,i8,i8,i32,i8,i8,i8,i8,i8
2025-08-30,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6,30,8,2025,5,3,35,1,0
2025-08-30,"""store_001""","""SPRT_005""","""Sports""",1,89,0.0,89.0,57.85,31.15,30,8,2025,5,3,35,1,0
2025-08-30,"""store_002""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5,30,8,2025,5,3,35,1,0
2025-08-31,"""store_001""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""CLTH_004""","""Clothing""",1,89,0.0,89.0,46.28,42.72,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""HOME_002""","""Home""",1,49,0.0,49.0,31.85,17.15,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""SPRT_001""","""Sports""",1,29,0.0,29.0,13.05,15.95,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05,31,8,2025,6,3,35,1,0
2025-08-31,"""store_009""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5,31,8,2025,6,3,35,1,0


In [10]:
temp_pandas

Unnamed: 0,date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,year,month,day
0,2025-08-30,store_001,HOME_005,Home,1,39,0.0,39.0,23.4,15.6,2025,8,30
1,2025-08-30,store_001,SPRT_005,Sports,1,89,0.0,89.0,57.85,31.15,2025,8,30
2,2025-08-30,store_002,CLTH_001,Clothing,1,29,0.0,29.0,14.5,14.5,2025,8,30
3,2025-08-31,store_001,CLTH_001,Clothing,1,29,0.0,29.0,14.5,14.5,2025,8,31
4,2025-08-31,store_001,CLTH_004,Clothing,1,89,0.0,89.0,46.28,42.72,2025,8,31
5,2025-08-31,store_001,HOME_002,Home,1,49,0.0,49.0,31.85,17.15,2025,8,31
6,2025-08-31,store_001,HOME_005,Home,1,39,0.0,39.0,23.4,15.6,2025,8,31
7,2025-08-31,store_001,SPRT_001,Sports,1,29,0.0,29.0,13.05,15.95,2025,8,31
8,2025-08-31,store_001,SPRT_002,Sports,1,49,0.0,49.0,26.95,22.05,2025,8,31
9,2025-08-31,store_009,CLTH_001,Clothing,1,29,0.0,29.0,14.5,14.5,2025,8,31


In [None]:
from src.config import app_settings

In [12]:
df: pl.DataFrame = pl.DataFrame(
    data={
        "id": [1, 2, 3, 4],
        "name": ["Alice", "Bob", "Charlie", "Bob"],
        "role": ["Engineer", "Manager", "Engineer", "Manager"],
        "skill": ["Python", "Leadership", "Python", "Management"],
        "experience": [5, 2, 3, 3],
        "age": [30, 40, 35, 34],
        "target": [1, 0, 1, 1],
    }
)

df

id,name,role,skill,experience,age,target
i64,str,str,str,i64,i64,i64
1,"""Alice""","""Engineer""","""Python""",5,30,1
2,"""Bob""","""Manager""","""Leadership""",2,40,0
3,"""Charlie""","""Engineer""","""Python""",3,35,1
4,"""Bob""","""Manager""","""Management""",3,34,1


In [13]:
counts = df["name"].value_counts()
mean_target = df.group_by("name").agg(pl.col("target").mean())
display(mean_target)
display(counts["name"])
for row in counts["name"]:
    print(counts.filter(pl.col("name").eq(row))["count"].item())

counts.filter(pl.col("name").eq("Alice"))["count"].item()

name,target
str,f64
"""Alice""",1.0
"""Charlie""",1.0
"""Bob""",0.5


name
str
"""Charlie"""
"""Bob"""
"""Alice"""


1
2
1


1

In [14]:
temp_df

date,store_id,product_id,category,quantity_sold,unit_price,discount_percent,revenue,cost,profit,day,month,year,day_of_week,quarter,week_of_year,is_weekend,is_holiday
date,str,str,str,i64,i64,f64,f64,f64,f64,i8,i8,i32,i8,i8,i8,i8,i8
2025-08-30,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6,30,8,2025,5,3,35,1,0
2025-08-30,"""store_001""","""SPRT_005""","""Sports""",1,89,0.0,89.0,57.85,31.15,30,8,2025,5,3,35,1,0
2025-08-30,"""store_002""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5,30,8,2025,5,3,35,1,0
2025-08-31,"""store_001""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""CLTH_004""","""Clothing""",1,89,0.0,89.0,46.28,42.72,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""HOME_002""","""Home""",1,49,0.0,49.0,31.85,17.15,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""HOME_005""","""Home""",1,39,0.0,39.0,23.4,15.6,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""SPRT_001""","""Sports""",1,29,0.0,29.0,13.05,15.95,31,8,2025,6,3,35,1,0
2025-08-31,"""store_001""","""SPRT_002""","""Sports""",1,49,0.0,49.0,26.95,22.05,31,8,2025,6,3,35,1,0
2025-08-31,"""store_009""","""CLTH_001""","""Clothing""",1,29,0.0,29.0,14.5,14.5,31,8,2025,6,3,35,1,0


### Connect To MLFlow

- Set the `tracking URI` to the MLflow server.
    - Tracking URI requires the MLflow `server address`, `port`, `S3 endpoint URL`, and `S3 credentials`.
    - S3 credentials include `access key`, `secret key`, and `bucket name`.
    - `MinIO` is used as a local S3-compatible storage service.

- Verify the connection by listing experiments.

In [None]:
# Force localhost configuration and debug
RUNNING_IN_DOCKER = False
DEFAULT_MINIO_HOST = app_settings.AWS_S3_HOST if RUNNING_IN_DOCKER else "minio"
DEFAULT_MINIO_PORT = app_settings.AWS_S3_PORT
MINIO_ENDPOINT = app_settings.mlflow_s3_endpoint_url
# This connects to the MLflow server with PostgreSQL backend
MLFLOW_URI = app_settings.mlflow_tracking_uri
AWS_KEY = app_settings.AWS_ACCESS_KEY_ID
AWS_SECRET = app_settings.AWS_SECRET_ACCESS_KEY.get_secret_value()
AWS_REGION = app_settings.AWS_DEFAULT_REGION
BUCKET = app_settings.AWS_S3_BUCKET

# Set environment variables
os.environ["AWS_ACCESS_KEY_ID"] = app_settings.AWS_ACCESS_KEY_ID
os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET
os.environ["AWS_DEFAULT_REGION"] = AWS_REGION
os.environ["MLFLOW_S3_ENDPOINT_URL"] = MINIO_ENDPOINT

print("=== CONFIGURATION DEBUG ===")
print(f"RUNNING_IN_DOCKER: {RUNNING_IN_DOCKER}")
print(f"DEFAULT_MINIO_HOST: {DEFAULT_MINIO_HOST}")
print(f"MINIO_ENDPOINT: {MINIO_ENDPOINT}")
print(f"MLFLOW_URI: {MLFLOW_URI}")
print(f"AWS_ACCESS_KEY_ID: {AWS_KEY}")
print(f"BUCKET: {BUCKET}")
print(f"Environment MLFLOW_S3_ENDPOINT_URL: {MINIO_ENDPOINT}")
print("=== END CONFIGURATION DEBUG ===\n")

=== CONFIGURATION DEBUG ===
RUNNING_IN_DOCKER: False
DEFAULT_MINIO_HOST: minio
MINIO_ENDPOINT: http://localhost:9000
MLFLOW_URI: http://localhost:5001
AWS_ACCESS_KEY_ID: minioadmin
BUCKET: mlflow-artifacts
Environment MLFLOW_S3_ENDPOINT_URL: http://localhost:9000
=== END CONFIGURATION DEBUG ===



In [None]:
# Test MLflow server connection and S3 storage
import tempfile
import traceback

import boto3
import mlflow
from botocore.exceptions import ClientError

# 1) Test S3/MinIO connection
print("Testing S3/MinIO connection...")
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=AWS_KEY,
    aws_secret_access_key=AWS_SECRET,
    region_name=AWS_REGION,
)

try:
    s3.head_bucket(Bucket=BUCKET)
    print(f"✅ Bucket '{BUCKET}' is reachable")
except ClientError as e:
    print(f"❌ S3/MinIO connection failed: {e}")

# 2) Test MLflow server connection
print(f"\nTesting MLflow server connection to {MLFLOW_URI}...")
mlflow.set_tracking_uri(MLFLOW_URI)
print(f"✅ MLflow tracking URI set to: {mlflow.get_tracking_uri()}")

# 3) Test that MLflow uses PostgreSQL backend (not local files)
try:
    # This should connect to the MLflow server which uses PostgreSQL
    experiments = mlflow.search_experiments()
    print(f"✅ Connected to MLflow server. Found {len(experiments)} experiments.")
    print("✅ This confirms MLflow is using the PostgreSQL backend, not local files.")
except Exception as e:
    print(f"❌ Failed to connect to MLflow server: {e}")

print("\n" + "=" * 50)
print("IMPORTANT: If MLflow server is using PostgreSQL correctly,")
print("experiments and runs will be stored in the database,")
print("and artifacts will be stored in MinIO/S3.")
print("Local 'mlruns' folders should NOT be created.")
print("=" * 50)

Testing S3/MinIO connection...
✅ Bucket 'mlflow-artifacts' is reachable

Testing MLflow server connection to http://localhost:5001...
✅ MLflow tracking URI set to: http://localhost:5001
✅ Connected to MLflow server. Found 1 experiments.
✅ This confirms MLflow is using the PostgreSQL backend, not local files.

IMPORTANT: If MLflow server is using PostgreSQL correctly,
experiments and runs will be stored in the database,
and artifacts will be stored in MinIO/S3.
Local 'mlruns' folders should NOT be created.


In [23]:
import mlflow
import mlflow.sklearn
from botocore.exceptions import ClientError
from sklearn import datasets
from sklearn.linear_model import ElasticNet

try:
    mlflow.set_experiment("notebook_quick_test")
    X, y = datasets.load_diabetes(return_X_y=True)
    model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
    model.fit(X, y)

    with mlflow.start_run() as run:
        mlflow.log_param("alpha", 0.1)
        mlflow.log_param("l1_ratio", 0.5)
        mlflow.log_metric("dummy_score", model.score(X, y))

        # Create a small artifact file and upload
        with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
            tmp.write("mlflow artifact test")
            tmp_path = tmp.name

        mlflow.log_artifact(tmp_path, artifact_path="test_artifacts")
        mlflow.sklearn.log_model(model, "model", input_example=X[:2].tolist())

        # Remove temp file after logging
        os.remove(tmp_path)

        print("✅ Logged run id:", run.info.run_id)
        print("✅ Experiment id:", run.info.experiment_id)

    print("✅ MLflow logging complete — check the UI and MinIO for artifact/model.")
    print("✅ Data stored in PostgreSQL database, artifacts in MinIO S3")

except ClientError as e:
    # boto3 ClientError can surface during artifact upload
    print("❌ Boto3 ClientError during MLflow operations:", e)
    print(traceback.format_exc())
    raise
except Exception:
    print("❌ Unexpected error during MLflow logging:")
    print(traceback.format_exc())
    raise



✅ Logged run id: bac3bc6b46f04d8e9f59d351c6239239
✅ Experiment id: 1
🏃 View run aged-goat-596 at: http://localhost:5001/#/experiments/1/runs/bac3bc6b46f04d8e9f59d351c6239239
🧪 View experiment at: http://localhost:5001/#/experiments/1
✅ MLflow logging complete — check the UI and MinIO for artifact/model.
✅ Data stored in PostgreSQL database, artifacts in MinIO S3


In [None]:
def create_cyclical_features(df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
    df = df.clone()

    return df.with_columns(
        # month (convert 1-12 to 0-11 for proper cyclical encoding)
        pl.col(date_col).dt.month().map_elements(lambda x: np.sin(2 * np.pi * (x - 1) / 12)).alias("month_sin"),
        pl.col(date_col).dt.month().map_elements(lambda x: np.cos(2 * np.pi * (x - 1) / 12)).alias("month_cos"),
        # day (Retain original values; 1-31)
        pl.col(date_col).dt.day().map_elements(lambda x: np.sin(2 * np.pi * x / 31)).alias("day_sin"),
        pl.col(date_col).dt.day().map_elements(lambda x: np.cos(2 * np.pi * x / 31)).alias("day_cos"),
        # day of week (convert 1-7 to 0-6 for proper cyclical encoding)
        pl.col(date_col).dt.weekday().map_elements(lambda x: np.sin(2 * np.pi * (x - 1) / 7)).alias("day_of_week_sin"),
        pl.col(date_col).dt.weekday().map_elements(lambda x: np.cos(2 * np.pi * (x - 1) / 7)).alias("day_of_week_cos"),
    )


create_cyclical_features(temp_df, date_col="date")

In [None]:
# Let's examine the cyclical features to see what's wrong
cyclical_result = create_cyclical_features(temp_df, date_col="date")

# Check the cyclical features
cyclical_sample = cyclical_result.select(
    [
        "date",
        "day_of_week",
        "day_of_week_sin",
        "day_of_week_cos",
        "month_sin",
        "month_cos",
    ]
).unique()

print("Cyclical features sample:")
print(cyclical_sample)

print("\nLet's check the day_of_week values and corresponding sin/cos:")
day_check = (
    cyclical_result.select(["date", "day_of_week", "day_of_week_sin", "day_of_week_cos"]).unique().sort("day_of_week")
)
print(day_check)

print("\nIssue Analysis:")
print("day_of_week ranges from 1-7 in Polars (Monday=1, Sunday=7)")
print("But for cyclical encoding, we want values from 0 to 2π")
print("Current formula: sin(2π × day_of_week / 7)")
print("This means day 7 gives: sin(2π × 7 / 7) = sin(2π) = 0")
print("And day 1 gives: sin(2π × 1 / 7) = sin(2π/7)")
print("This creates a discontinuity between Sunday (7) and Monday (1)!")

In [None]:
# The issue with day_of_week cyclical encoding:
print("PROBLEM IDENTIFIED:")
print("Polars weekday() returns 1-7 (Monday=1, Sunday=7)")
print("For proper cyclical encoding, we need 0-based indexing")
print()


def create_cyclical_features_corrected(df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
    df = df.clone()

    return df.with_columns(
        # Month cyclical features (months 1-12, convert to 0-11)
        pl.col(date_col).dt.month().map_elements(lambda x: np.sin(2 * np.pi * (x - 1) / 12)).alias("month_sin"),
        pl.col(date_col).dt.month().map_elements(lambda x: np.cos(2 * np.pi * (x - 1) / 12)).alias("month_cos"),
        # Day cyclical features (days 1-31, convert to 0-30)
        pl.col(date_col).dt.day().map_elements(lambda x: np.sin(2 * np.pi * (x - 1) / 31)).alias("day_sin"),
        pl.col(date_col).dt.day().map_elements(lambda x: np.cos(2 * np.pi * (x - 1) / 31)).alias("day_cos"),
        # Day of week cyclical features (weekday 1-7, convert to 0-6)
        pl.col(date_col)
        .dt.weekday()
        .map_elements(lambda x: np.sin(2 * np.pi * (x - 1) / 7))  # CORRECTED: (x-1)
        .alias("day_of_week_sin"),
        pl.col(date_col)
        .dt.weekday()
        .map_elements(lambda x: np.cos(2 * np.pi * (x - 1) / 7))  # CORRECTED: (x-1)
        .alias("day_of_week_cos"),
    )


# Test the corrected function
print("Testing corrected cyclical features:")
corrected_result = create_cyclical_features_corrected(temp_df, date_col="date")

# Compare day_of_week encoding
comparison = (
    corrected_result.select(["date", "day_of_week", "day_of_week_sin", "day_of_week_cos"]).unique().sort("day_of_week")
)

print("\nCorrected day_of_week cyclical encoding:")
print(comparison)

print("\nKey differences:")
print("- Original: sin(2π × day_of_week / 7) where day_of_week ∈ [1,7]")
print("- Corrected: sin(2π × (day_of_week - 1) / 7) where (day_of_week - 1) ∈ [0,6]")
print("- This ensures Monday(1) → 0, Tuesday(2) → 1, ..., Sunday(7) → 6")
print("- Now Sunday(6) and Monday(0) are properly connected in the cycle!")

In [None]:
# Test the corrected function
print("Testing the corrected create_cyclical_features function:")
corrected_cyclical = create_cyclical_features(temp_df, date_col="date")

# Show the cyclical features for each day of week
cyclical_summary = (
    corrected_cyclical.select(["date", "day_of_week", "day_of_week_sin", "day_of_week_cos"]).unique().sort("day_of_week")
)

print("\nCorrected cyclical features by day of week:")
print(cyclical_summary)

print("\nVerification:")
print("Monday (1) → sin=0.0, cos=1.0 (start of cycle)")
print("Sunday (7) → sin≈-0.78, cos≈0.62 (connects smoothly back to Monday)")
print("The cyclical encoding now properly represents the weekly cycle!")

In [None]:
-2.4493e-16