# Prophet Forecasting Model (Model A - Seasonality Focus)

This notebook implements Facebook Prophet for time series forecasting with custom seasonalities and Swiss holidays.

## Prophet Advantages
- **Interpretable**: Decompose trend, seasonality, holidays
- **Robust**: Handles missing data and outliers
- **Flexible**: Custom seasonalities (quarterly, monthly patterns)
- **Business-friendly**: Easy to explain to stakeholders

## Configuration
- **Seasonality Mode**: Multiplicative (seasonal effects scale as a percentage of the trend)
- **Custom Seasonalities**: Quarterly (91.25 days), Monthly (30.5 days)
- **Swiss Holidays**: New Year's Day, Easter Monday, National Day (Aug 1), Christmas Day, St. Stephen's Day (Dec 26)
- **Changepoint Prior**: 0.05 (moderate flexibility)

In [None]:
import pandas as pd
import numpy as np
from prophet import Prophet
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully")

## Section 1: Load Time Series Data

In [None]:
# Load the company-level monthly time series.
# The Parquet export is preferred; the CSV export is used as a fallback.
processed_dir = Path('../data/processed')
parquet_path = processed_dir / 'monthly_aggregated_full_company.parquet'
csv_path = processed_dir / 'monthly_aggregated_full_company.csv'

if parquet_path.exists():
    data_path = parquet_path
    df = pd.read_parquet(data_path)
else:
    # If the CSV is missing as well, pd.read_csv raises FileNotFoundError
    # naming the path, which is the most useful failure for the user.
    data_path = csv_path
    df = pd.read_csv(data_path)

# Normalise the date column on BOTH paths: the date comparisons and the
# Prophet 'ds' column downstream expect real datetimes. This is a no-op when
# the column is already datetime64.
df['date'] = pd.to_datetime(df['date'])

# Chronological order with a clean 0..n-1 index
df = df.sort_values('date').reset_index(drop=True)

print(f"Loaded: {len(df)} months ({df['date'].min()} to {df['date'].max()})")

## Section 2: Define Swiss Holidays

Add Swiss national holidays that may impact transport demand.

In [None]:
def _easter_monday(year):
    """Return Easter Monday of ``year`` (Gregorian calendar) as a pd.Timestamp."""
    # Anonymous Gregorian (Meeus/Jones/Butcher) algorithm for Easter Sunday.
    golden = year % 19
    century, year_in_century = divmod(year, 100)
    leap_centuries, century_rem = divmod(century, 4)
    moon_corr = (century - (century + 8) // 25 + 1) // 3
    epact = (19 * golden + century - leap_centuries - moon_corr + 15) % 30
    leap_years, year_rem = divmod(year_in_century, 4)
    weekday_offset = (32 + 2 * century_rem + 2 * leap_years - epact - year_rem) % 7
    correction = (golden + 11 * epact + 22 * weekday_offset) // 451
    month, day_index = divmod(epact + weekday_offset - 7 * correction + 114, 31)
    easter_sunday = pd.Timestamp(year=year, month=month, day=day_index + 1)
    return easter_sunday + pd.Timedelta(days=1)


def create_swiss_holidays(start_year=2022, end_year=2026):
    """
    Create Swiss holiday dataframe for Prophet.

    Includes, for every year in ``start_year..end_year`` (inclusive):
    - New Year's Day (Jan 1)            -> 'new_year'
    - Swiss National Day (Aug 1)        -> 'national_day'
    - Christmas Day (Dec 25)            -> 'christmas'
    - St. Stephen's Day (Dec 26)        -> 'boxing_day'
    - Easter Monday (computed per year) -> 'easter_monday'

    All holiday types cover the same year range, so the table stays
    consistent across the training history and the forecast horizon.

    Parameters:
    -----------
    start_year : int
        First calendar year to include (default 2022).
    end_year : int
        Last calendar year to include (default 2026).

    Returns:
    --------
    pd.DataFrame
        Columns 'holiday', 'ds', 'lower_window', 'upper_window'. Fixed-date
        holidays come first (year by year), followed by Easter Mondays.
        Each holiday's window is the day itself plus the following day
        (lower_window=0, upper_window=1).

    Raises:
    -------
    ValueError
        If ``start_year`` is greater than ``end_year``.
    """
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not exceed end_year ({end_year})"
        )

    years = range(start_year, end_year + 1)

    # (holiday name, month, day) for holidays that fall on the same date each year
    fixed_dates = (
        ('new_year', 1, 1),
        ('national_day', 8, 1),
        ('christmas', 12, 25),
        ('boxing_day', 12, 26),
    )

    rows = [
        (name, pd.Timestamp(year=year, month=month, day=day))
        for year in years
        for name, month, day in fixed_dates
    ]
    # Easter Monday is a movable feast, so it is computed rather than tabulated
    rows.extend(('easter_monday', _easter_monday(year)) for year in years)

    holidays = pd.DataFrame(rows, columns=['holiday', 'ds'])
    holidays['ds'] = pd.to_datetime(holidays['ds'])
    holidays['lower_window'] = 0
    holidays['upper_window'] = 1

    return holidays

# Build the holiday table once, then report its size and preview the first rows
swiss_holidays = create_swiss_holidays()
print(
    f"Swiss holidays defined: {len(swiss_holidays)} dates",
    swiss_holidays.head(10),
    sep="\n",
)

## Section 3: Train/Validation Split

In [None]:
# Metrics that each get their own Prophet model
target_metrics = [
    'total_orders',
    'total_km_billed',
    'total_km_actual',
    'total_tours',
    'total_drivers',
    'revenue_total',
    'external_drivers',
    'vehicle_km_cost',      # NEW: KM-based transportation cost
    'vehicle_time_cost',    # NEW: Time-based transportation cost
    'total_vehicle_cost',   # NEW: Total vehicle operational cost
]

# Backward compatibility: when the data only has a single 'total_km' column
# (and no 'total_km_billed'), model 'total_km' in place of 'total_km_billed'
# and drop 'total_km_actual' from the list.
uses_legacy_km_column = (
    'total_km' in df.columns and 'total_km_billed' not in df.columns
)
if uses_legacy_km_column:
    legacy_names = {'total_km_billed': 'total_km'}
    target_metrics = [
        legacy_names.get(name, name)
        for name in target_metrics
        if name != 'total_km_actual'
    ]

In [None]:
# Hold out the most recent months for validation.
# Rows dated before `split_date` are used to fit the Prophet models; rows on or
# after it are held back so forecast accuracy can be measured on data the
# models have not seen.

print("Creating train/validation split...")
print("="*80)

# First month of the validation window
split_date = '2024-07-01'

train_df = df[df['date'] < split_date].copy()
val_df = df[df['date'] >= split_date].copy()

split_parts = (("Training", train_df), ("Validation", val_df))

for part_label, part in split_parts:
    print(f"\n{part_label} data:")
    print(f"  Date range: {part['date'].min()} to {part['date'].max()}")
    print(f"  Months: {len(part)}")

print(f"\n✓ Split complete!")
for part_label, part in split_parts:
    # Report the range actually present in the data instead of a hard-coded
    # label, so the summary stays correct when the input file changes.
    if part.empty:
        span = "no rows"
    else:
        first_month = pd.to_datetime(part['date'].min()).strftime('%b %Y')
        last_month = pd.to_datetime(part['date'].max()).strftime('%b %Y')
        span = f"{first_month} - {last_month}"
    print(f"  {part_label}: {len(part)} months ({span})")

## Section 4: Prophet Model Training

Train one Prophet model per target metric.

In [None]:
def train_prophet_model(train_data, metric, holidays=None):
    """
    Fit a Prophet model to a single metric of the training frame.

    Parameters:
    -----------
    train_data : pd.DataFrame
        Training time series; must contain a 'date' column and the
        column named by ``metric``
    metric : str
        Name of the column to forecast
    holidays : pd.DataFrame
        Holiday dataframe handed to Prophet (optional)

    Returns:
    --------
    Prophet
        The fitted model
    """
    # Prophet expects the timestamp column to be 'ds' and the target to be 'y'
    fit_frame = train_data[['date', metric]].rename(
        columns={'date': 'ds', metric: 'y'}
    )

    prophet_settings = dict(
        yearly_seasonality=True,
        weekly_seasonality=False,           # disabled: the series is monthly
        daily_seasonality=False,            # disabled: the series is monthly
        seasonality_mode='multiplicative',  # seasonal effect scales with the trend
        changepoint_prior_scale=0.05,       # Prophet's default trend flexibility
        holidays=holidays,
        interval_width=0.80,                # 80% uncertainty intervals
    )
    model = Prophet(**prophet_settings)

    # Extra seasonal components as (name, period in days, Fourier order).
    # NOTE(review): the 30.5-day period is about one sampling step when the
    # input is monthly — confirm this component is actually identifiable.
    extra_seasonalities = (
        ('quarterly', 91.25, 5),
        ('monthly', 30.5, 10),
    )
    for seasonality_name, period_days, order in extra_seasonalities:
        model.add_seasonality(
            name=seasonality_name,
            period=period_days,
            fourier_order=order,
        )

    print(f"\nTraining Prophet for {metric}...")
    model.fit(fit_frame)
    print(f"✓ Training complete")

    return model

# Fit one model per target metric, keyed by metric name
prophet_models = {
    metric_name: train_prophet_model(train_df, metric_name, holidays=swiss_holidays)
    for metric_name in target_metrics
}

## Section 5: Generate Forecasts

Generate validation forecasts (Jul–Dec 2024) and future forecasts (through Dec 2025, i.e. 18 months past the end of the training data).

In [None]:
# Generate forecasts
prophet_forecasts = {}   # metric -> yhat values over the validation window
future_forecasts = {}    # metric -> full Prophet forecast frame (history + horizon)

# Forecast horizon: 18 months past the end of training (6 validation + 12 future)
forecast_horizon = 18

# Bounds of the validation window, taken from the held-out frame.
# These were previously referenced without ever being defined.
val_start = pd.to_datetime(val_df['date']).min()
val_end = pd.to_datetime(val_df['date']).max()

for metric, model in prophet_models.items():
    # Extend the training dates by `forecast_horizon` month-start periods
    future = model.make_future_dataframe(periods=forecast_horizon, freq='MS')

    # Predict over history + horizon
    forecast = model.predict(future)

    # Keep only the predictions that fall inside the validation window
    # (both ends inclusive)
    in_validation = forecast['ds'].between(val_start, val_end)
    val_forecast = forecast.loc[in_validation, 'yhat'].values

    if len(val_forecast) != len(val_df):
        # The evaluation step compares these arrays element-wise with val_df,
        # so flag a mismatch here where the cause is easiest to diagnose.
        print(
            f"\n⚠️  {metric}: {len(val_forecast)} forecast rows vs "
            f"{len(val_df)} validation rows - check that validation dates "
            f"are month starts"
        )

    prophet_forecasts[metric] = val_forecast
    future_forecasts[metric] = forecast

    print(f"\n{metric}:")
    print(f"  Validation forecast: {len(val_forecast)} months")
    print(f"  Full forecast: {len(forecast)} months")

## Section 6: Model Evaluation

Calculate performance metrics on validation period.

In [None]:
def calculate_metrics(y_true, y_pred, model_name, metric_name):
    """
    Score a forecast against the actual values.

    Returns a dict with the given ``model_name`` and ``metric_name`` plus
    three error measures: 'MAE', 'RMSE' (square root of the mean squared
    error) and 'MAPE' (scaled by 100, i.e. expressed in percent).
    """
    scores = {'model': model_name, 'metric': metric_name}
    scores['MAE'] = mean_absolute_error(y_true, y_pred)
    scores['RMSE'] = np.sqrt(mean_squared_error(y_true, y_pred))
    scores['MAPE'] = mean_absolute_percentage_error(y_true, y_pred) * 100
    return scores

# Calculate metrics: one row of scores per target metric
results = [
    calculate_metrics(val_df[metric].values, prophet_forecasts[metric], 'Prophet', metric)
    for metric in target_metrics
]

results_df = pd.DataFrame(results)

print("\nProphet Model Performance (Validation Period):")
print("="*80)
print(results_df.to_string(index=False))

# Compare with baseline (load from previous notebook).
# Only a genuinely missing file is reported as "not found"; other problems
# (missing columns, missing rows) get their own message instead of being
# hidden behind a bare `except:`.
try:
    baseline_df = pd.read_csv('../data/processed/baseline_metrics.csv')
except FileNotFoundError:
    baseline_df = None
    print("\n⚠️  Baseline metrics not found. Run notebook 09 first.")

if baseline_df is not None:
    missing_columns = {'metric', 'MAPE'} - set(baseline_df.columns)

    if missing_columns:
        print(
            f"\n⚠️  Baseline metrics file is missing column(s): "
            f"{sorted(missing_columns)}. Skipping comparison."
        )
    else:
        print("\n" + "="*80)
        print("Prophet vs Best Baseline:")
        print("="*80)

        for metric in target_metrics:
            prophet_mape = results_df.loc[results_df['metric'] == metric, 'MAPE'].iloc[0]
            baseline_mapes = baseline_df.loc[baseline_df['metric'] == metric, 'MAPE']

            print(f"\n{metric}:")
            print(f"  Prophet MAPE: {prophet_mape:.2f}%")

            if baseline_mapes.empty:
                # No baseline row for this metric: nothing to compare against
                print(f"  Best Baseline MAPE: n/a (metric not in baseline file)")
                continue

            baseline_best_mape = baseline_mapes.min()

            # Relative improvement of Prophet over the best baseline model
            improvement = ((baseline_best_mape - prophet_mape) / baseline_best_mape) * 100

            print(f"  Best Baseline MAPE: {baseline_best_mape:.2f}%")
            print(f"  Improvement: {improvement:+.1f}%")

## Section 7: Forecast Visualization

Plot Prophet forecasts with confidence intervals.

In [None]:
# Visualize forecasts: one interactive figure per metric, saved as HTML.

# The output directory is the same for every metric, so create it once up
# front instead of on every loop iteration.
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

for metric in target_metrics:
    forecast_df = future_forecasts[metric]
    pretty_name = metric.replace('_', ' ').title()

    # Trace order matters: the lower bound uses fill='tonexty', which shades
    # towards the trace added immediately before it (the upper bound).
    traces = [
        # Historical training data
        go.Scatter(
            x=train_df['date'],
            y=train_df[metric],
            mode='lines+markers',
            name='Historical (Training)',
            line=dict(color='black', width=2)
        ),
        # Actual validation values
        go.Scatter(
            x=val_df['date'],
            y=val_df[metric],
            mode='lines+markers',
            name='Actual (Validation)',
            line=dict(color='green', width=3)
        ),
        # Prophet forecast
        go.Scatter(
            x=forecast_df['ds'],
            y=forecast_df['yhat'],
            mode='lines',
            name='Prophet Forecast',
            line=dict(color='blue', width=2, dash='dash')
        ),
        # Uncertainty interval (80%): upper bound, drawn invisibly
        go.Scatter(
            x=forecast_df['ds'],
            y=forecast_df['yhat_upper'],
            mode='lines',
            name='Upper 80% CI',
            line=dict(width=0),
            showlegend=False
        ),
        # Uncertainty interval (80%): lower bound, shaded up to the upper bound
        go.Scatter(
            x=forecast_df['ds'],
            y=forecast_df['yhat_lower'],
            mode='lines',
            name='Lower 80% CI',
            fill='tonexty',
            fillcolor='rgba(0, 100, 255, 0.2)',
            line=dict(width=0),
            showlegend=False
        ),
    ]

    fig = go.Figure()
    for trace in traces:
        fig.add_trace(trace)

    fig.update_layout(
        title=f"Prophet Forecast - {pretty_name}",
        xaxis_title="Date",
        yaxis_title=pretty_name,
        height=600,
        hovermode='x unified'
    )

    fig.show()

    # Save
    fig.write_html(results_dir / f'prophet_forecast_{metric}.html')
    print(f"\n✓ Saved: results/prophet_forecast_{metric}.html")

## Section 8: Component Analysis

Decompose forecast into trend, seasonality, and holiday effects.

In [None]:
# Plot components for each metric and summarise trend / yearly seasonality.

# Make sure the output directory exists even if the visualisation cell was skipped
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

# Last date of the training window; used to restrict the trend summary to the
# fitted history. (This name was previously referenced without being defined.)
train_end = pd.to_datetime(train_df['date']).max()

for metric in target_metrics:
    model = prophet_models[metric]
    forecast_df = future_forecasts[metric]

    print(f"\nComponent Analysis - {metric}:")
    print("="*50)

    # Create component plot
    fig = model.plot_components(forecast_df)
    fig.savefig(results_dir / f'prophet_components_{metric}.png', dpi=150, bbox_inches='tight')
    print(f"✓ Saved component plot: results/prophet_components_{metric}.png")

    # Analyze trend: relative change of the fitted trend over the training window
    train_forecast = forecast_df[forecast_df['ds'] <= train_end]
    if train_forecast.empty:
        print("  Trend (2022-2024): n/a (no forecast rows in the training window)")
    else:
        trend_start = train_forecast['trend'].iloc[0]
        trend_end = train_forecast['trend'].iloc[-1]
        if trend_start == 0:
            # A percentage change relative to zero is undefined
            print("  Trend (2022-2024): n/a (trend starts at zero)")
        else:
            trend_change = ((trend_end - trend_start) / trend_start) * 100
            print(f"  Trend (2022-2024): {trend_change:+.1f}%")

    # Identify strongest seasonal component
    if 'yearly' in forecast_df.columns:
        yearly_range = forecast_df['yearly'].max() - forecast_df['yearly'].min()
        if model.seasonality_mode == 'multiplicative':
            # Multiplicative components are fractions of the trend, so a
            # '.0f' format would round them to 0; show them as a percentage.
            print(f"  Yearly seasonality range: {yearly_range:.1%} of trend")
        else:
            # Additive components are in the units of the metric itself
            print(f"  Yearly seasonality range: {yearly_range:.0f}")

## Section 9: Save Results

In [None]:
# Save performance metrics
output_dir = Path('../data/processed')
results_df.to_csv(output_dir / 'prophet_metrics.csv', index=False)
print(f"✓ Saved metrics: data/processed/prophet_metrics.csv")

# Save validation forecasts: one row per validation month, one column per metric
forecast_output = pd.DataFrame({
    'date': val_df['date'],
    'year_month': val_df['year_month'].astype(str)
})

for metric in target_metrics:
    # Arrays are assigned by position, so each forecast must have exactly
    # one value per validation row
    forecast_output[metric] = prophet_forecasts[metric]

forecast_output.to_csv(output_dir / 'prophet_forecast_validation.csv', index=False)
print(f"✓ Saved validation forecasts: data/processed/prophet_forecast_validation.csv")

# Save future forecasts: every forecast month after the validation window.
# `val_end` is derived here from the validation frame (it was previously
# referenced without being defined).
val_end = pd.to_datetime(val_df['date']).max()

future_output = None
for metric in target_metrics:
    forecast_df = future_forecasts[metric]
    future_only = forecast_df.loc[
        forecast_df['ds'] > val_end,
        ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
    ].rename(columns={
        'ds': 'date',
        'yhat': metric,
        'yhat_lower': f'{metric}_lower',
        'yhat_upper': f'{metric}_upper',
    })

    # Start with the first metric, then join each further metric on the date
    if future_output is None:
        future_output = future_only
    else:
        future_output = future_output.merge(future_only, on='date')

if future_output is None:
    # Only reachable when target_metrics is empty
    print("⚠️  No target metrics - future forecasts not saved")
else:
    future_output.to_csv(output_dir / 'prophet_forecast_future.csv', index=False)
    print(f"✓ Saved future forecasts: data/processed/prophet_forecast_future.csv")

print(f"\n{'='*80}")
print(f"PROPHET MODEL COMPLETE!")
print(f"{'='*80}")
print(f"\nKey Findings:")
for metric in target_metrics:
    mape = results_df[results_df['metric'] == metric]['MAPE'].values[0]
    print(f"  • {metric}: MAPE = {mape:.2f}%")
print(f"\nNext: Run notebook 11 for SARIMAX model")