# Notebook 18a: Generate All Model Predictions for Validation

This notebook consolidates predictions from all available forecasting models to enable comprehensive comparison.

**Available Models:**
1. **Human Method**: 2024 annual total ÷ 12 (traditional budgeting)
2. **Seasonal Naive**: Use 2024 same-month values
3. **XGBoost**: Gradient boosting with lag features (optional — skipped when its forecast file is missing)
4. **CatBoost**: Gradient boosting with categorical features
5. **LightGBM**: Microsoft's gradient boosting
6. **Ensemble (Best Model)**: Select best performer per metric
7. **Ensemble (Weighted)**: Weight models by inverse MAPE
8. **Ensemble (Hybrid)**: 60% ML + 40% Human

**Output:** Single CSV (`all_model_predictions_2025_validation.csv`) with all predictions for Jan–Sep 2025 (validation period)

**Focus Metrics:** `total_orders` and `revenue_total` (key business metrics)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
# Installs a blanket 'ignore' filter: from here on, warnings of every
# category are suppressed in this kernel.
warnings.filterwarnings('ignore')

# Title line followed by an 80-character rule.
print('Notebook 18a: Generate All Model Predictions', '=' * 80, sep='\n')

## Section 1: Load All Available 2025 Forecasts

In [None]:
# Directory the forecast CSVs are read from and the consolidated CSV is
# written to (resolved relative to the working directory).
data_dir = Path('../data/processed')

# Focus on orders and revenue only
focus_metrics = ['total_orders', 'revenue_total']

# Filter to Jan-Sep 2025 (validation period with actual data).
# Both bounds are inclusive; the loading cells compare them against the
# parsed `date` column of each forecast file.
val_start = '2025-01-01'
val_end = '2025-09-01'

# Derive the month count from the bounds rather than hard-coding it, so the
# printed summary cannot drift if the window is changed.
_val_start_ts = pd.Timestamp(val_start)
_val_end_ts = pd.Timestamp(val_end)
val_months = (
    (_val_end_ts.year - _val_start_ts.year) * 12
    + (_val_end_ts.month - _val_start_ts.month)
    + 1
)

print('Loading 2025 forecasts from all models...')
print(f'Validation period: {val_start} to {val_end} ({val_months} months)\n')

In [None]:
# 1. Human Method (2024 ÷ 12)
# Read the forecast, parse `date`, and keep only rows inside the inclusive
# validation window.
df_human = (
    pd.read_csv(data_dir / 'human_method_2025.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .loc[lambda frame: (frame['date'] >= val_start) & (frame['date'] <= val_end)]
    .copy()
)

print(f'✓ Human Method: {len(df_human)} months')
for metric in focus_metrics:
    # Only the first row is shown; the output labels the value as constant.
    first_value = df_human[metric].iloc[0]
    print(f'  {metric}: {first_value:,.0f} (constant)')

In [None]:
# 2. Seasonal Naive
seasonal_path = data_dir / 'seasonal_naive_2025.csv'
df_seasonal = pd.read_csv(seasonal_path)
df_seasonal['date'] = pd.to_datetime(df_seasonal['date'])

# Inclusive validation-window mask, built from its two halves.
on_or_after_start = df_seasonal['date'] >= val_start
on_or_before_end = df_seasonal['date'] <= val_end
df_seasonal = df_seasonal.loc[on_or_after_start & on_or_before_end].copy()

print(f'\n✓ Seasonal Naive: {len(df_seasonal)} months')
for metric in focus_metrics:
    lowest = df_seasonal[metric].min()
    highest = df_seasonal[metric].max()
    print(f'  {metric}: {lowest:,.0f} - {highest:,.0f}')

In [None]:
# 3. XGBoost
# This forecast is optional: when the file is absent, df_xgboost is set to
# None and later cells check for that.
xgboost_path = data_dir / 'xgboost_forecast_2025.csv'
try:
    df_xgboost = pd.read_csv(xgboost_path)
except FileNotFoundError:
    print('\n⚠️  XGBoost forecast not found')
    df_xgboost = None
else:
    df_xgboost['date'] = pd.to_datetime(df_xgboost['date'])
    within_window = (df_xgboost['date'] >= val_start) & (df_xgboost['date'] <= val_end)
    df_xgboost = df_xgboost[within_window].copy()

    print(f'\n✓ XGBoost: {len(df_xgboost)} months')
    # Report only the focus metrics this file actually contains.
    reported_metrics = [name for name in focus_metrics if name in df_xgboost.columns]
    for metric in reported_metrics:
        lowest = df_xgboost[metric].min()
        highest = df_xgboost[metric].max()
        print(f'  {metric}: {lowest:,.0f} - {highest:,.0f}')

In [None]:
# 4. CatBoost
catboost_path = data_dir / 'catboost_forecast_2025.csv'
df_catboost = pd.read_csv(catboost_path)
df_catboost['date'] = pd.to_datetime(df_catboost['date'])

# `between` is inclusive on both ends by default, matching >= start and <= end.
df_catboost = df_catboost[df_catboost['date'].between(val_start, val_end)].copy()

print(f'\n✓ CatBoost: {len(df_catboost)} months')
for metric in focus_metrics:
    column = df_catboost[metric]
    print(f'  {metric}: {column.min():,.0f} - {column.max():,.0f}')

In [None]:
# 5. LightGBM
# Read, parse `date`, then trim to the inclusive validation window.
df_lightgbm = (
    pd.read_csv(data_dir / 'lightgbm_forecast_2025.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .loc[lambda frame: (frame['date'] >= val_start) & (frame['date'] <= val_end)]
    .copy()
)

print(f'\n✓ LightGBM: {len(df_lightgbm)} months')
for metric in focus_metrics:
    lowest, highest = df_lightgbm[metric].min(), df_lightgbm[metric].max()
    print(f'  {metric}: {lowest:,.0f} - {highest:,.0f}')

In [None]:
# 6-8. Ensemble Methods
def _load_validation_forecast(filename):
    """Read one forecast CSV from data_dir, parse its `date` column, and
    return a copy of the rows whose date lies in [val_start, val_end]."""
    forecast = pd.read_csv(data_dir / filename)
    forecast['date'] = pd.to_datetime(forecast['date'])
    in_window = (forecast['date'] >= val_start) & (forecast['date'] <= val_end)
    return forecast[in_window].copy()


df_ensemble_best = _load_validation_forecast('ensemble_best_model_2025.csv')
df_ensemble_weighted = _load_validation_forecast('ensemble_weighted_2025.csv')
df_ensemble_hybrid = _load_validation_forecast('ensemble_hybrid_2025.csv')

print(f'\n✓ Ensemble (Best Model): {len(df_ensemble_best)} months')
print(f'✓ Ensemble (Weighted): {len(df_ensemble_weighted)} months')
print(f'✓ Ensemble (Hybrid 60/40): {len(df_ensemble_hybrid)} months')

## Section 2: Consolidate All Predictions

Create a single dataframe with all model predictions for easy comparison.

In [None]:
# Reference timeline: the Human frame's validation dates, re-indexed from 0 so
# the consolidated table does not carry over the filtered frame's row labels.
validation_dates = df_human['date'].reset_index(drop=True)

# Initialize consolidated dataframe
df_all_predictions = pd.DataFrame({'date': validation_dates})

# (column suffix, printed label, source frame), in output column order.
# df_xgboost is None when its forecast file was not found.
model_sources = [
    ('human', 'Human', df_human),
    ('seasonal_naive', 'Seasonal Naive', df_seasonal),
    ('xgboost', 'XGBoost', df_xgboost),
    ('catboost', 'CatBoost', df_catboost),
    ('lightgbm', 'LightGBM', df_lightgbm),
    ('ensemble_best', 'Ensemble (Best Model)', df_ensemble_best),
    ('ensemble_weighted', 'Ensemble (Weighted)', df_ensemble_weighted),
    ('ensemble_hybrid', 'Ensemble (Hybrid)', df_ensemble_hybrid),
]


def _values_on_timeline(frame, metric, label):
    """Return ``frame[metric]`` as an array ordered to match ``validation_dates``.

    Rows are matched on the ``date`` column rather than by position, so a
    forecast file with a different row order is still aligned correctly.

    Raises:
        KeyError: if ``frame`` has no ``date`` or ``metric`` column.
        ValueError: if ``frame`` has duplicate dates or lacks a forecast for
            any validation date (previously such frames were either pasted in
            misaligned or failed with an opaque length-mismatch error).
    """
    series_by_date = frame.set_index('date')[metric]
    if series_by_date.index.has_duplicates:
        raise ValueError(f'{label}: duplicate dates in forecast for {metric}')

    absent = validation_dates[~validation_dates.isin(series_by_date.index)]
    if not absent.empty:
        listed = ', '.join(stamp.strftime('%Y-%m-%d') for stamp in absent)
        raise ValueError(f'{label}: no {metric} forecast for {listed}')

    return series_by_date.reindex(pd.Index(validation_dates)).to_numpy()


print('Consolidating predictions...')
print('='*80)

# Add predictions from each model (focus metrics only)
for metric in focus_metrics:
    print(f'\n{metric}:')

    for suffix, label, frame in model_sources:
        # Only XGBoost is optional: skip it when the file was missing or it
        # does not carry this metric. Every other model must provide it.
        if suffix == 'xgboost' and (frame is None or metric not in frame.columns):
            continue

        df_all_predictions[f'{metric}_{suffix}'] = _values_on_timeline(frame, metric, label)
        print(f'  ✓ {label}')

print('\n' + '='*80)
print(f'✓ Consolidated predictions: {len(df_all_predictions)} months × {len(df_all_predictions.columns)-1} forecasts')

## Section 3: Summary Statistics

Display summary statistics for all models.

In [None]:
print('\nSummary Statistics: All Models')
print('='*80)

for metric in focus_metrics:
    print(f'\n{metric.upper()}:')
    print('-'*80)

    column_prefix = f'{metric}_'

    # Walk every consolidated column belonging to this metric, in table order.
    for col in df_all_predictions.columns:
        if not col.startswith(metric):
            continue

        # 'total_orders_seasonal_naive' -> 'Seasonal Naive'
        model_name = col.replace(column_prefix, '').replace('_', ' ').title()
        values = df_all_predictions[col]
        average, lowest, highest = values.mean(), values.min(), values.max()

        print(f'{model_name:25s}: {average:>12,.0f} avg, {lowest:>12,.0f} min, {highest:>12,.0f} max')

## Section 4: Model Variability Check

Identify models with flat predictions (typically a sign of underfitting or insufficient training data). The Human method is constant by construction, so it is expected to be flagged.

In [None]:
print('\nModel Variability Check (Coefficient of Variation):')
print('='*80)
print('Models with CV < 2% may have insufficient monthly variation\n')

for metric in focus_metrics:
    print(f'{metric.upper()}:')

    metric_cols = [col for col in df_all_predictions.columns if col.startswith(metric)]

    for col in metric_cols:
        model_name = col.replace(f'{metric}_', '').replace('_', ' ').title()
        values = df_all_predictions[col]

        # Coefficient of Variation (CV) = std / |mean|.
        # The absolute value keeps CV non-negative, so a negative mean does
        # not automatically fall below the 2% threshold.
        mean_magnitude = abs(values.mean())
        spread = values.std()  # sample std; NaN with fewer than 2 data points

        # CV is undefined for a zero mean or a missing std. Report that
        # explicitly instead of printing inf/nan (inf used to be marked ✓).
        if pd.isna(spread) or pd.isna(mean_magnitude) or mean_magnitude == 0:
            print(f'  ⚠️  {model_name:25s}: CV =   n/a (undefined: zero mean or fewer than 2 points)')
            continue

        cv = (spread / mean_magnitude) * 100

        status = '✓' if cv >= 2 else '⚠️ '
        print(f'  {status} {model_name:25s}: CV = {cv:5.2f}%')

    print()

## Section 5: Save Consolidated Predictions

In [None]:
# Save to CSV (the row index is not written)
output_path = data_dir / 'all_model_predictions_2025_validation.csv'
df_all_predictions.to_csv(output_path, index=False)

row_count, column_count = df_all_predictions.shape
print(f'✓ Saved: {output_path.name}')
print(f'  {row_count} rows × {column_count} columns')

# Display first few rows, one preview table per focus metric
all_columns = list(df_all_predictions.columns)

print('\nFirst 3 months (orders):')
orders_cols = ['date', *(name for name in all_columns if name.startswith('total_orders'))]
display(df_all_predictions[orders_cols].head(3))

print('\nFirst 3 months (revenue):')
revenue_cols = ['date', *(name for name in all_columns if name.startswith('revenue_total'))]
display(df_all_predictions[revenue_cols].head(3))

## Section 6: Model Availability Summary

In [None]:
rule = '=' * 80

print('\n' + rule, 'MODEL AVAILABILITY SUMMARY', rule, sep='\n')

print('\nTraditional Methods:', '  ✓ Human (2024÷12)', '  ✓ Seasonal Naive', sep='\n')

# XGBoost is the only model whose absence is tolerated upstream
# (df_xgboost is None when its file was missing).
xgboost_line = '  ✓ XGBoost' if df_xgboost is not None else '  ✗ XGBoost (not available)'
print('\nMachine Learning Models:', xgboost_line, '  ✓ CatBoost', '  ✓ LightGBM', sep='\n')

print(
    '\nEnsemble Methods:',
    '  ✓ Best Model per Metric',
    '  ✓ Weighted (Inverse MAPE)',
    '  ✓ Hybrid (60% ML / 40% Human)',
    sep='\n',
)

# Count total models: one consolidated `total_orders_*` column per model
total_models = sum(
    column.startswith('total_orders_') for column in df_all_predictions.columns
)
print(f'\nTotal models available: {total_models}')

print('\n' + rule, 'READY FOR VALIDATION', rule, sep='\n')
print('Next: Run Notebook 18 (updated) to compare all models against actual 2025 data')