In [1]:
import os
import warnings
from typing import Any

import numpy as np
import pandas as pd
import polars as pl

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings: print floats with 4 digits of precision
np.set_printoptions(precision=4)

# Pandas settings: raise display limits so long/wide frames are not truncated
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings: longer string cells, up to 1,000 columns and 200 rows per table
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices from the libraries configured above.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# Auto-reload imported modules before each cell is executed
%load_ext autoreload
%autoreload 2

In [2]:
from rich.console import Console
from rich.theme import Theme

# Named colour styles; pass one as ``style="<name>"`` to ``console.print``.
custom_theme = Theme(
    styles=dict(
        white="#FFFFFF",  # Bright white
        info="#00FF00",  # Bright green
        warning="#FFD700",  # Bright gold
        error="#FF1493",  # Deep pink
        success="#00FFFF",  # Cyan
        highlight="#FF4500",  # Orange-red
    )
)
# Shared console used by the cells below for styled output.
console = Console(theme=custom_theme)


def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    The resolved absolute path is printed so the cell output shows which
    directory became importable.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working
        directory. ``0`` selects the working directory itself.

    Returns:
    --------
    None

    Raises:
    -------
    ValueError
        If ``go_up`` is negative.
    """
    import os
    import sys

    if go_up < 0:
        raise ValueError(f"go_up must be >= 0, got {go_up}")

    # Anchor on the working directory explicitly. ``os.path.dirname(__name__)``
    # always evaluated to "" (``__name__`` is a module name, not a path), so the
    # old code reached the same place only through relative-path resolution.
    target_dir = os.path.abspath(os.path.join(os.getcwd(), *([os.pardir] * go_up)))

    # Drop earlier copies first so re-running the cell does not pile up
    # duplicates, then give the directory top import priority.
    sys.path[:] = [entry for entry in sys.path if entry != target_dir]
    sys.path.insert(0, target_dir)
    print(target_dir)

In [3]:
# Put the directory one level above the working directory on sys.path
# (presumably the project root, so that `include.*` is importable — confirm).
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/End-to-end-Sale-Forecasting


In [4]:
import httpx

# NOTE(review): appears to be a quick HTTP smoke test against a public
# placeholder API; nothing in the visible cells uses the response afterwards.
url: str = "https://jsonplaceholder.typicode.com/posts"

# Give up after 10 seconds instead of letting the cell hang.
response = httpx.get(url, timeout=10)
response.raise_for_status()  # Raise an error for bad responses
# Show only the first three items of the decoded JSON body.
console.print(response.json()[:3], style="info")

In [5]:
from include.config import app_settings
from include.utilities.data_gen import RealisticSalesDataGenerator

# Seeded generator covering 2025-01-31 through 2025-09-30.
gen_data = RealisticSalesDataGenerator(
    seed=123,
    start_date="2025-01-31",
    end_date="2025-09-30",
)
# Output goes under ./data/sales_data, relative to the working directory.
# The result maps a dataset name (e.g. "sales") to a list of file paths.
file_paths: dict[str, Any] = gen_data.generate_sales_data(
    output_dir="./data/sales_data",
)
file_paths

2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-01-31
2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-02-01
2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-02-02
2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-02-03
2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-02-04
2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-02-05
2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-02-06
2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-02-07
2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-02-08
2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-02-09
2025-09-05 15:27:15 - include.utilities.data_gen - [INFO] - Generating data for 2025-02-10

{'sales': ['./data/sales_data/sales/year=2025/month=02/day=01/sales_2025-02-01.parquet',
  './data/sales_data/sales/year=2025/month=02/day=02/sales_2025-02-02.parquet',
  './data/sales_data/sales/year=2025/month=02/day=03/sales_2025-02-03.parquet',
  './data/sales_data/sales/year=2025/month=02/day=08/sales_2025-02-08.parquet',
  './data/sales_data/sales/year=2025/month=02/day=09/sales_2025-02-09.parquet',
  './data/sales_data/sales/year=2025/month=02/day=15/sales_2025-02-15.parquet',
  './data/sales_data/sales/year=2025/month=02/day=16/sales_2025-02-16.parquet',
  './data/sales_data/sales/year=2025/month=02/day=17/sales_2025-02-17.parquet',
  './data/sales_data/sales/year=2025/month=02/day=22/sales_2025-02-22.parquet',
  './data/sales_data/sales/year=2025/month=02/day=23/sales_2025-02-23.parquet',
  './data/sales_data/sales/year=2025/month=03/day=01/sales_2025-03-01.parquet',
  './data/sales_data/sales/year=2025/month=03/day=03/sales_2025-03-03.parquet',
  './data/sales_data/sales/year

In [6]:
# Number of generated files summed over every dataset group.
total_files = sum(map(len, file_paths.values()))
total_files

440

In [None]:
# Convert to Polars
import mlflow
from polars.dataframe.frame import DataFrame

from include.ml.trainer import ModelTrainer

# --- 1) Load a capped number of daily sales files -----------------------------
print("Loading sales data from multiple files...")
sales_dfs: list[pl.DataFrame] = []
max_files: int = 50  # upper bound on how many sales files are read
skipped_sales: int = 0

for sales_file in file_paths["sales"][:max_files]:
    try:
        df = pl.read_parquet(sales_file)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole load.
        skipped_sales += 1
        print(f"  Skipping unreadable sales file {sales_file}: {e}")
        continue
    sales_dfs.append(df)
    # Report progress on files actually loaded, not merely attempted.
    if len(sales_dfs) % 10 == 0:
        print(f"  Loaded {len(sales_dfs)} files...")
if not sales_dfs:
    raise ValueError("No readable sales parquet files were loaded; aborting training")
if skipped_sales:
    print(f"  Skipped {skipped_sales} unreadable sales file(s)")

# --- 2) Aggregate to one row per date/store/product/category -----------------
sales_df = pl.concat(sales_dfs)
print(f"Combined sales data shape: {sales_df.shape}")
daily_sales: DataFrame = (
    sales_df.group_by(["date", "store_id", "product_id", "category"])
    .agg(
        pl.col("quantity_sold").sum(),
        pl.col("revenue").sum().alias("sales"),
        pl.col("cost").sum(),
        pl.col("profit").sum(),
        pl.col("discount_percent").mean(),
        pl.col("unit_price").mean(),
    )
    .sort("date", "store_id")
)

# --- 3) Optional: flag date/product pairs that appear in the promotions file --
if file_paths.get("promotions"):
    try:
        # Only the first promotions file is read.
        promo_df = pl.read_parquet(file_paths["promotions"][0])
        promo_summary = (
            promo_df.group_by(["date", "product_id"])
            .agg(pl.col("discount_percent").max())
            .with_columns(pl.lit(1).cast(pl.Int8).alias("has_promotion"))
        )
        # Left join keeps every sales row; rows without a match get 0.
        daily_sales = daily_sales.join(
            promo_summary.select(["date", "product_id", "has_promotion"]),
            on=["date", "product_id"],
            how="left",
        ).with_columns(pl.col("has_promotion").fill_null(0))
    except Exception as e:
        print(f"Skipping promotions merge due to error: {e}")

# --- 4) Optional: attach per-store traffic and holiday flags ------------------
if file_paths.get("customer_traffic"):
    traffic_dfs: list[pl.DataFrame] = []
    skipped_traffic: int = 0

    # NOTE(review): only the first 10 traffic files are read while up to
    # `max_files` sales files are loaded, so later dates may have null traffic
    # after the left join — confirm this is intended.
    for traffic_file in file_paths["customer_traffic"][:10]:
        try:
            traffic_dfs.append(pl.read_parquet(traffic_file))
        except Exception as e:
            skipped_traffic += 1
            print(f"  Skipping unreadable traffic file {traffic_file}: {e}")
    if skipped_traffic:
        print(f"  Skipped {skipped_traffic} unreadable traffic file(s)")

    if traffic_dfs:
        traffic_df = pl.concat(traffic_dfs)
        traffic_summary = traffic_df.group_by(["date", "store_id"]).agg(
            pl.col("customer_traffic").sum(), pl.col("is_holiday").max()
        )
        daily_sales = daily_sales.join(
            traffic_summary,
            on=["date", "store_id"],
            how="left",
        )
    else:
        print("No readable traffic files; skipping merge")
print(f"Final training data shape: {daily_sales.shape}")
print(f"Columns: {daily_sales.columns}")

# --- 5) Roll up to store/day level and split ---------------------------------
trainer = ModelTrainer()

# Steps 3 and 4 are optional, so their columns may be missing. Aggregate only
# the optional columns that exist instead of failing on a missing column.
optional_store_aggs: dict[str, pl.Expr] = {
    "has_promotion": pl.col("has_promotion").mean(),
    "customer_traffic": pl.col("customer_traffic").first(),
    "is_holiday": pl.col("is_holiday").first(),
}
missing_optional: list[str] = [name for name in optional_store_aggs if name not in daily_sales.columns]
if missing_optional:
    print(f"Optional columns unavailable, excluded from store-level data: {missing_optional}")

store_daily_sales: DataFrame = (
    daily_sales.group_by(["date", "store_id"])
    .agg(
        pl.col("sales").sum(),
        pl.col("quantity_sold").sum(),
        pl.col("profit").sum(),
        *[expr for name, expr in optional_store_aggs.items() if name not in missing_optional],
    )
    .with_columns(pl.col("date").cast(pl.Date))
)
train_df, val_df, test_df = trainer.prepare_data(
    store_daily_sales,
    target_col="sales",
    group_cols=["store_id"],
    categorical_cols=["store_id"],
)
print(f"Train shape: {train_df.shape}, Val shape: {val_df.shape}, Test shape: {test_df.shape}")

Loading sales data from multiple files...
  Loaded 10 files...
  Loaded 20 files...
  Loaded 30 files...
  Loaded 40 files...
  Loaded 50 files...
Combined sales data shape: (174, 10)
Final training data shape: (174, 13)
Columns: ['date', 'store_id', 'product_id', 'category', 'quantity_sold', 'sales', 'cost', 'profit', 'discount_percent', 'unit_price', 'has_promotion', 'customer_traffic', 'is_holiday']
2025-09-05 15:56:26 - include.utilities.feature_engineering - [INFO] - Starting feature engineering pipeline
2025-09-05 15:56:26 - include.utilities.feature_engineering - [INFO] - Created 7 lag features
2025-09-05 15:56:26 - include.utilities.feature_engineering - [INFO] - Feature engineering pipeline completed. 41 total features.
2025-09-05 15:56:26 - include.ml.trainer - [INFO] - Data split - {"train_size": 60, "validation_size": 8, "test_size": 18}
Train shape: (60, 41), Val shape: (8, 41), Test shape: (18, 41)


In [18]:
# End whichever MLflow run is currently active (e.g. one left open by an
# earlier execution of this cell) before training.
mlflow.end_run()

results = trainer.train_all_models(train_df, val_df, test_df, target_col="sales")
# Print the metrics of every model that reported any.
for model_name, model_results in results.items():
    if "metrics" in model_results:
        print(f"\n{model_name} metrics:")
        for metric, value in model_results["metrics"].items():
            print(f"  {metric}: {value:.4f}")
print("\nVisualization charts have been generated and saved to MLflow/MinIO")
print("Charts include:")
print("  - Model metrics comparison")
print("  - Predictions vs actual values")
print("  - Residuals analysis")
print("  - Error distribution")
print("  - Feature importance comparison")

# Keep only the metrics of each model for the printed summary; models without
# a "metrics" entry get an empty dict.
serializable_results: dict[str, dict[str, Any]] = {
    model_name: {"metrics": model_results.get("metrics", {})} for model_name, model_results in results.items()
}

current_run = trainer.mlflow_manager.get_run_id()
final_results: dict[str, Any] = {
    "training_results": serializable_results,
    "mlflow_run_id": current_run,
}
console.print(final_results, style="info")

2025-09-05 15:56:26 - include.utilities.mlflow_utils - [INFO] - Started MLflow run: ba4fdb57e96742fa8000536ea248c5aa
2025-09-05 15:56:26 - include.ml.trainer - [INFO] - Training XGBoost model
[0]	validation_0-rmse:62.68329
[1]	validation_0-rmse:55.46352
[2]	validation_0-rmse:51.54056
[3]	validation_0-rmse:45.53690
[4]	validation_0-rmse:42.28120
[5]	validation_0-rmse:39.15963
[6]	validation_0-rmse:36.29725
[7]	validation_0-rmse:33.63027
[8]	validation_0-rmse:30.93414
[9]	validation_0-rmse:28.67686
[10]	validation_0-rmse:26.89091
[11]	validation_0-rmse:25.05359
[12]	validation_0-rmse:23.38517
[13]	validation_0-rmse:21.83391
[14]	validation_0-rmse:20.36703
[15]	validation_0-rmse:19.08786
[16]	validation_0-rmse:17.95905
[17]	validation_0-rmse:16.84690
[18]	validation_0-rmse:15.87859
[19]	validation_0-rmse:14.94017
[20]	validation_0-rmse:14.10110
[21]	validation_0-rmse:13.41335
[22]	validation_0-rmse:12.75128
[23]	validation_0-rmse:12.13010
[24]	validation_0-rmse:11.57988
[25]	validation_0-

In [None]:
# NOTE(review): looks like a pasted snapshot of per-model metrics from an earlier
# training run, kept for reference — confirm. The value is not assigned to a name.
{
    "xgboost": {"r2": 0.9846403008371715, "mae": 3.054988497779483, "mape": 4.45330527412093, "rmse": 5.2594461034181315},
    "ensemble": {"r2": 0.9827928058335911, "mae": 3.6552595561744905, "mape": 5.832129982272972, "rmse": 5.566775200803074},
    "lightgbm": {"r2": 0.9781911080583598, "mae": 4.599852955821699, "mape": 8.03958585718776, "rmse": 6.267083116474738},
}

In [21]:
# NOTE(review): `a` is not defined in any cell visible here, so this relies on
# leftover kernel state and would raise NameError on a fresh kernel — confirm
# where `a` comes from or replace it with the intended mapping.
len(a.get("file_paths")["sales"])

194

In [None]:
# Four-row toy frame with a repeated name ("Bob") for trying out
# value counts and per-group aggregations.
df: pl.DataFrame = pl.DataFrame(
    data=dict(
        id=[1, 2, 3, 4],
        name=["Alice", "Bob", "Charlie", "Bob"],
        role=["Engineer", "Manager", "Engineer", "Manager"],
        skill=["Python", "Leadership", "Python", "Management"],
        experience=[5, 2, 3, 3],
        age=[30, 40, 35, 34],
        target=[1, 0, 1, 1],
    )
)

df

In [None]:
# One row per distinct name together with how often it occurs.
counts = df["name"].value_counts()
# Mean of `target` within each name group.
mean_target = df.group_by("name").agg(pl.col("target").mean())
display(mean_target)
display(counts["name"])
# Each name appears exactly once in `counts`, so the "count" column already
# holds the values in the same order as counts["name"]; there is no need to
# re-filter the whole frame once per name.
for name_count in counts["count"]:
    print(name_count)

# Single lookup: occurrences of one specific name.
counts.filter(pl.col("name").eq("Alice"))["count"].item()

In [None]:
# NOTE(review): `temp_df` is not defined in any cell visible here, so this relies
# on leftover kernel state — confirm where it is created.
temp_df

### Connect To MLflow

- Set the `tracking URI` to the MLflow server.
    - The tracking URI is built from the MLflow `server address` and `port`; artifact storage additionally needs the `S3 endpoint URL` and `S3 credentials`.
    - S3 credentials consist of the `access key` and `secret key`; the `bucket name` identifies where artifacts are stored.
    - `MinIO` is used as a local S3-compatible storage service.

- Verify the connection by listing experiments.

In [None]:
# Force localhost configuration and debug
RUNNING_IN_DOCKER = False
# NOTE(review): with RUNNING_IN_DOCKER = False this resolves to "minio", which
# looks inverted relative to "force localhost" — confirm. The value is only
# printed below; MINIO_ENDPOINT comes straight from app_settings.
DEFAULT_MINIO_HOST = app_settings.AWS_S3_HOST if RUNNING_IN_DOCKER else "minio"
DEFAULT_MINIO_PORT = app_settings.AWS_S3_PORT
MINIO_ENDPOINT = app_settings.mlflow_s3_endpoint_url
# This connects to the MLflow server with PostgreSQL backend
MLFLOW_URI = app_settings.mlflow_tracking_uri
AWS_KEY = app_settings.AWS_ACCESS_KEY_ID
AWS_SECRET = app_settings.AWS_SECRET_ACCESS_KEY.get_secret_value()
AWS_REGION = app_settings.AWS_DEFAULT_REGION
BUCKET = app_settings.AWS_S3_BUCKET

# Set environment variables (these hand the S3 settings to MLflow/boto3)
os.environ["AWS_ACCESS_KEY_ID"] = AWS_KEY
os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET
os.environ["AWS_DEFAULT_REGION"] = AWS_REGION
os.environ["MLFLOW_S3_ENDPOINT_URL"] = MINIO_ENDPOINT

# Show only the last four characters of the access key so the saved notebook
# output does not carry the full credential.
masked_aws_key = f"{'*' * max(len(AWS_KEY) - 4, 0)}{AWS_KEY[-4:]}"

print("=== CONFIGURATION DEBUG ===")
print(f"RUNNING_IN_DOCKER: {RUNNING_IN_DOCKER}")
print(f"DEFAULT_MINIO_HOST: {DEFAULT_MINIO_HOST}")
print(f"MINIO_ENDPOINT: {MINIO_ENDPOINT}")
print(f"MLFLOW_URI: {MLFLOW_URI}")
print(f"AWS_ACCESS_KEY_ID: {masked_aws_key}")
print(f"BUCKET: {BUCKET}")
# Read the value back from the environment so this line verifies what was set.
print(f"Environment MLFLOW_S3_ENDPOINT_URL: {os.environ['MLFLOW_S3_ENDPOINT_URL']}")
print("=== END CONFIGURATION DEBUG ===\n")

In [None]:
# Test MLflow server connection and S3 storage
import tempfile
import traceback

import boto3
import mlflow
from botocore.exceptions import BotoCoreError, ClientError

# 1) Test S3/MinIO connection
print("Testing S3/MinIO connection...")
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=AWS_KEY,
    aws_secret_access_key=AWS_SECRET,
    region_name=AWS_REGION,
)

try:
    s3.head_bucket(Bucket=BUCKET)
    print(f"✅ Bucket '{BUCKET}' is reachable")
except (ClientError, BotoCoreError) as e:
    # ClientError: the service answered with an error (missing bucket, bad credentials).
    # BotoCoreError: the request never completed (e.g. the endpoint is unreachable),
    # which previously escaped this check as an unhandled traceback.
    print(f"❌ S3/MinIO connection failed: {e}")

# 2) Test MLflow server connection
print(f"\nTesting MLflow server connection to {MLFLOW_URI}...")
mlflow.set_tracking_uri(MLFLOW_URI)
print(f"✅ MLflow tracking URI set to: {mlflow.get_tracking_uri()}")

# 3) Test that the tracking server answers API calls
try:
    # A successful listing shows the server at MLFLOW_URI is reachable. It does
    # not reveal which backend store the server uses, so the message no longer
    # claims that.
    experiments = mlflow.search_experiments()
    print(f"✅ Connected to MLflow server. Found {len(experiments)} experiments.")
    print("ℹ️ The tracking server is reachable; its backend store type is configured on the server side.")
except Exception as e:
    print(f"❌ Failed to connect to MLflow server: {e}")

print("\n" + "=" * 50)
print("IMPORTANT: If MLflow server is using PostgreSQL correctly,")
print("experiments and runs will be stored in the database,")
print("and artifacts will be stored in MinIO/S3.")
print("Local 'mlruns' folders should NOT be created.")
print("=" * 50)

In [None]:
import mlflow
import mlflow.sklearn
from botocore.exceptions import ClientError
from sklearn import datasets
from sklearn.linear_model import ElasticNet

# End-to-end logging check: params, a metric, a small artifact and a model.
# NOTE: `tempfile` and `traceback` are imported by the connection-test cell above.
try:
    mlflow.set_experiment("notebook_quick_test")
    X, y = datasets.load_diabetes(return_X_y=True)
    model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
    model.fit(X, y)

    with mlflow.start_run() as run:
        mlflow.log_param("alpha", 0.1)
        mlflow.log_param("l1_ratio", 0.5)
        mlflow.log_metric("dummy_score", model.score(X, y))

        # Create a small artifact file and upload
        with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
            tmp.write("mlflow artifact test")
            tmp_path = tmp.name

        try:
            mlflow.log_artifact(tmp_path, artifact_path="test_artifacts")
        finally:
            # delete=False leaves the file on disk, so remove it even when the
            # upload fails.
            os.remove(tmp_path)

        mlflow.sklearn.log_model(model, "model", input_example=X[:2].tolist())

        print("✅ Logged run id:", run.info.run_id)
        print("✅ Experiment id:", run.info.experiment_id)

    print("✅ MLflow logging complete — check the UI and MinIO for artifact/model.")
    print("✅ Data stored in PostgreSQL database, artifacts in MinIO S3")

except ClientError as e:
    # boto3 ClientError can surface during artifact upload
    print("❌ Boto3 ClientError during MLflow operations:", e)
    print(traceback.format_exc())
    raise
except Exception:
    print("❌ Unexpected error during MLflow logging:")
    print(traceback.format_exc())
    raise

In [None]:
def create_cyclical_features(df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
    """Add sine/cosine encodings of month, day of month and weekday.

    Each calendar component is mapped to an angle on the unit circle and
    stored as a sin/cos pair.

    Parameters
    ----------
    df : pl.DataFrame
        Input frame; it is not modified (``with_columns`` returns a new frame).
    date_col : str, default="date"
        Name of the column the ``.dt`` accessors are applied to.

    Returns
    -------
    pl.DataFrame
        ``df`` plus the columns ``month_sin``, ``month_cos``, ``day_sin``,
        ``day_cos``, ``day_of_week_sin`` and ``day_of_week_cos``.
    """
    two_pi = 2 * np.pi
    date = pl.col(date_col)

    # Native expressions replace the per-element Python lambdas that
    # ``map_elements`` ran. Casting to Float64 first keeps the arithmetic in
    # floating point.
    # month (convert 1-12 to 0-11 for proper cyclical encoding)
    month_angle = (date.dt.month().cast(pl.Float64) - 1) * two_pi / 12
    # day (Retain original values; 1-31)
    day_angle = date.dt.day().cast(pl.Float64) * two_pi / 31
    # day of week (convert 1-7 to 0-6 for proper cyclical encoding)
    weekday_angle = (date.dt.weekday().cast(pl.Float64) - 1) * two_pi / 7

    return df.with_columns(
        month_angle.sin().alias("month_sin"),
        month_angle.cos().alias("month_cos"),
        day_angle.sin().alias("day_sin"),
        day_angle.cos().alias("day_cos"),
        weekday_angle.sin().alias("day_of_week_sin"),
        weekday_angle.cos().alias("day_of_week_cos"),
    )


# NOTE(review): `temp_df` is not defined in any cell visible here — confirm it
# exists (with a "date" column) before this cell runs.
create_cyclical_features(temp_df, date_col="date")

In [None]:
# Inspect the cyclical features produced by create_cyclical_features
cyclical_result = create_cyclical_features(temp_df, date_col="date")

# Check the cyclical features
# NOTE(review): "day_of_week" is not created by create_cyclical_features, so it
# must already be a column of temp_df — confirm.
cyclical_sample = cyclical_result.select(
    [
        "date",
        "day_of_week",
        "day_of_week_sin",
        "day_of_week_cos",
        "month_sin",
        "month_cos",
    ]
).unique()

print("Cyclical features sample:")
print(cyclical_sample)

print("\nLet's check the day_of_week values and corresponding sin/cos:")
day_check = (
    cyclical_result.select(["date", "day_of_week", "day_of_week_sin", "day_of_week_cos"]).unique().sort("day_of_week")
)
print(day_check)

# These notes describe the formula create_cyclical_features actually applies,
# (weekday - 1) / 7. The earlier text described weekday / 7 and reported a
# Sunday/Monday discontinuity that does not exist: angle 2π is the same point
# on the circle as angle 0.
print("\nEncoding notes:")
print("day_of_week ranges from 1-7 in Polars (Monday=1, Sunday=7)")
print("Current formula: sin/cos(2π × (day_of_week - 1) / 7)")
print("Monday (1) gives angle 0: sin = 0, cos = 1")
print("Sunday (7) gives angle 2π × 6 / 7, one step (2π/7) before wrapping back to Monday")
print("Consecutive days, including Sunday -> Monday, are all 2π/7 apart: no discontinuity.")

## Docker Container Import Testing

When working with the Airflow containers, imports work correctly when you run Python from the right directory.

### ✅ Correct way to import in Airflow containers:

```bash
# Start container shell from the correct directory
docker compose exec airflow-worker bash

# You'll be in /opt/airflow - this is the correct working directory
pwd  # Should show: /opt/airflow

# Now run Python and import
python
```

```python
# These imports will work correctly:
import pandas as pd
from include.config import app_settings
from include.utilities.data_gen import RealisticSalesDataGenerator

# Test the imports
print("All imports successful!")
print("MLFLOW_HOST:", app_settings.MLFLOW_HOST)
gen = RealisticSalesDataGenerator(start_date="2025-09-01", end_date="2025-09-02", seed=42)
print("Data generator created:", type(gen))
```

### ❌ Common mistake - don't do this:

```bash
# Don't cd into the include directory first
cd include  # This breaks imports!
python      # Imports will fail from here
```

### Why this happens:

1. Our `PYTHONPATH` is set to `/opt/airflow/include`
2. When you run `python` from `/opt/airflow/include`, Python also puts the current directory on `sys.path`
3. From there, neither of those entries contains a package directory named `include`, so `from include...` imports cannot be resolved
4. The solution: always run Python from the `/opt/airflow` directory, where `include/` is a subdirectory of the current directory

In [None]:
# Test imports in Docker container (run this to verify everything works)
import json
import subprocess


def test_docker_imports(
    *,
    service: str = "airflow-worker",
    compose_dir: str = "../",
    timeout: float = 120.0,
) -> bool:
    """Test that imports work correctly in the Airflow container.

    Runs a short Python script inside the container via ``docker compose exec``.
    The script prints a JSON report, which is parsed and printed here.

    Parameters
    ----------
    service : str, default="airflow-worker"
        Compose service in which the script is executed.
    compose_dir : str, default="../"
        Directory from which ``docker compose`` is invoked.
    timeout : float, default=120.0
        Seconds to wait for the command before giving up.

    Returns
    -------
    bool
        ``True`` when the command exits with code 0 and a JSON report is found;
        ``False`` otherwise. Failures are printed, never raised.
    """

    # Test command to run in the container
    test_script = """
import sys
import pandas as pd
from include.config import app_settings
from include.utilities.data_gen import RealisticSalesDataGenerator

# Test results
results = {
    "python_path_includes_include": "/opt/airflow/include" in sys.path,
    "current_working_directory": __import__("os").getcwd(),
    "pandas_version": pd.__version__,
    "mlflow_host": app_settings.MLFLOW_HOST,
    "data_generator_created": str(type(RealisticSalesDataGenerator(start_date="2025-09-01", end_date="2025-09-02", seed=42)))
}

import json
print(json.dumps(results, indent=2))
"""

    # -T disables pseudo-TTY allocation so stdout can be captured.
    cmd = [
        "docker",
        "compose",
        "exec",
        "-T",
        service,
        "python",
        "-c",
        test_script,
    ]

    try:
        # Run the test in the container; the timeout keeps a stuck container
        # from blocking the notebook cell indefinitely.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=compose_dir,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print(f"❌ Failed to run Docker test: timed out after {timeout} seconds")
        return False
    except Exception as e:
        print(f"❌ Failed to run Docker test: {e}")
        return False

    if result.returncode != 0:
        print("❌ Docker container import test FAILED!")
        print("Error output:", result.stderr)
        return False

    # The report is printed with indent=2, so it begins on a line starting with
    # "{". Anything printed to stdout before that line is skipped so it cannot
    # break JSON parsing.
    report_lines = result.stdout.strip().splitlines()
    report_start = next(
        (idx for idx, line in enumerate(report_lines) if line.startswith("{")),
        None,
    )
    if report_start is None:
        print("❌ Failed to run Docker test: no JSON report found in container output")
        return False
    try:
        test_results = json.loads("\n".join(report_lines[report_start:]))
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f"❌ Failed to run Docker test: {e}")
        return False

    print("✅ Docker container import test PASSED!")
    print("\nTest Results:")
    for key, value in test_results.items():
        print(f"  {key}: {value}")
    return True


# Run the test (returns True on success, False on any failure)
test_docker_imports()