# 🤖 Model Training & Evaluation

Hướng dẫn training quantile regression models và đánh giá kết quả.

## Mục tiêu
- Load feature table đã được engineering
- Split data theo time-based (leak-safe)
- Train 7 quantile models (Q05, Q10, Q25, Q50, Q75, Q90, Q95)
- Đánh giá với Pinball Loss và Prediction Interval Coverage


In [3]:
# Setup
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Make the parent of the current working directory importable so that the
# `src` package imported below can be resolved.
# NOTE(review): presumably the notebook runs from a sub-folder of the
# repository root — confirm the working directory.
project_root = Path.cwd().resolve().parent
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.insert(0, str(project_root))

from src.config import setup_project_path, get_dataset_config, OUTPUT_FILES, setup_logging
from src.pipelines._03_model_training import (
    load_data, prepare_data, train_quantile_models, evaluate_quantile_models
)

# Run the project's path and logging initialisers imported from src.config.
setup_project_path()
setup_logging()

# `display` renders rich output when IPython is installed; without it,
# fall back to plain `print` so the later cells still run.
try:
    from IPython.display import display
except ImportError:
    display = print

# Fetch the dataset configuration and echo the dataset name and the
# target column it declares.
config = get_dataset_config()
print(
    f"Dataset: {config['name']}",
    f"Target: {config['target_column']}",
    sep="\n",
)


Dataset: FreshRetailNet-50K
Target: sales_quantity


## 1. Load Feature Table


In [4]:
# Load the master feature table from the path registered in OUTPUT_FILES.
# Per the recorded output, load_data logs an error and exits when the parquet
# file is missing (the log asks to run _02_feature_enrichment.py first).
df = load_data(OUTPUT_FILES['master_feature_table'])
print(f"Feature table shape: {df.shape}")
print(f"Columns: {len(df.columns)}")

# Show the time span covered by the table. The same column-existence guard
# is applied to every temporal unit, so a missing time column is reported
# instead of raising KeyError for weekly data only.
print("\nDate range:")
time_col = config['time_column']
if time_col in df.columns:
    # Only the label differs between weekly and other temporal units.
    span_label = "Weeks" if config['temporal_unit'] == 'week' else "Range"
    print(f"  {span_label}: {df[time_col].min()} to {df[time_col].max()}")
else:
    print(f"  Time column '{time_col}' not found in feature table")

display(df.head())


2025-11-13 01:31:29,953 - src.pipelines._03_model_training - INFO - Loading data from: D:\datastorm\E-Grocery_Forecaster\data\3_processed\master_feature_table.parquet...
2025-11-13 01:31:29,954 - src.pipelines._03_model_training - ERROR - File not found: D:\datastorm\E-Grocery_Forecaster\data\3_processed\master_feature_table.parquet. Run _02_feature_enrichment.py first.


SystemExit: 1

## 2. Prepare Data & Time-Based Split


In [None]:
# Build the train/test matrices; prepare_data also returns the list of
# feature names and the list of categorical feature names.
X_train, X_test, y_train, y_test, features, cat_features = prepare_data(df, config)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nFeatures: {len(features)}")
print(f"Categorical features: {len(cat_features)}")

# Verify leak-safety: the earliest test week must be strictly later than the
# latest training week. The success mark is only printed when that holds;
# otherwise the overlap is reported explicitly.
# NOTE(review): assumes X_train/X_test keep the index labels of `df` so the
# time column can be looked up with df.loc — confirm against prepare_data.
if config['temporal_unit'] == 'week':
    train_weeks = df.loc[X_train.index, config['time_column']].max()
    test_weeks = df.loc[X_test.index, config['time_column']].min()
    if test_weeks > train_weeks:
        print(f"\n✅ Leak-safe check:")
    else:
        # Also reached when either side is empty (NaN comparisons are False).
        print("\n❌ Leak-safe check FAILED: test period does not start after training period")
    print(f"  Train max week: {train_weeks}")
    print(f"  Test min week: {test_weeks}")
    print(f"  Gap: {test_weeks - train_weeks} weeks")


## 3. Train Quantile Models


In [None]:
# Fit the quantile models on the training split; the result is a mapping
# from quantile to fitted model.
quantile_models = train_quantile_models(X_train, y_train, cat_features)

print(f"Trained {len(quantile_models)} quantile models:")
for q, model in quantile_models.items():
    # Keys are presumably fractions such as 0.05, shown here as "Q05".
    quantile_pct = int(q*100)
    model_kind = type(model).__name__
    print(f"  Q{quantile_pct:02d}: {model_kind}")


## 4. Evaluate Models


In [None]:
# Score the trained quantile models on the test split. `metrics` is used
# below as a mapping from metric name to a numeric value.
metrics = evaluate_quantile_models(quantile_models, X_test, y_test)

print("=" * 70)
print("EVALUATION RESULTS")
print("=" * 70)

# Pinball loss per quantile, in ascending quantile order. Metrics that the
# evaluation did not produce are skipped rather than raising KeyError.
print("\nPinball Losses:")
for q in sorted(quantile_models):
    # Compute the two-digit quantile label once; the key format must match
    # the names emitted by evaluate_quantile_models.
    q_label = f"{int(q*100):02d}"
    key = f"q{q_label}_pinball_loss"
    if key in metrics:
        print(f"  Q{q_label}: {metrics[key]:.4f}")

# Empirical coverage of the 90% and 80% prediction intervals, when present.
print("\nPrediction Interval Coverage:")
for interval in ['90', '80']:
    key = f"prediction_interval_coverage_{interval}"
    if key in metrics:
        print(f"  {interval}% interval: {metrics[key]:.4f}")

# Persist the metrics as JSON under <reports_dir>/metrics/, creating the
# folder if needed. The encoding is set explicitly so the file does not
# depend on the platform's default locale encoding.
metrics_file = OUTPUT_FILES['reports_dir'] / 'metrics' / 'quantile_model_metrics.json'
metrics_file.parent.mkdir(parents=True, exist_ok=True)
with open(metrics_file, 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)
print(f"\n✅ Metrics saved to: {metrics_file}")
