# 03 - Price Baseline Models

## Objective
Establish baseline performance using simple forecasting methods.

**Models:**
1. Naive (last observation)
2. Seasonal Naive (same hour yesterday)
3. Moving Average (24h)
4. Drift (linear trend)
5. Mean (historical average)

**Goal:** These baselines set the minimum performance threshold. 
Any advanced model must beat these simple approaches.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
# Suppress every warning for the rest of the session so cell output stays
# readable. NOTE(review): this blanket filter also hides warnings that may
# matter (e.g. numerical or deprecation warnings) — consider narrowing it.
warnings.filterwarnings('ignore')

# Apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet to all figures
# created after this point.
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

## 1. Load Processed Data

In [None]:
# Read the three price splits from the processed-data directory.
# The first CSV column becomes the index and is parsed as dates.
data_dir = Path('../../data/processed')
read_options = dict(index_col=0, parse_dates=True)

train = pd.read_csv(data_dir / 'price_train.csv', **read_options)
val = pd.read_csv(data_dir / 'price_val.csv', **read_options)
test = pd.read_csv(data_dir / 'price_test.csv', **read_options)

# Report (rows, columns) of each split.
print(f"Train: {train.shape}")
print(f"Val:   {val.shape}")
print(f"Test:  {test.shape}")

# Pull the forecasting target out of each split.
target_column = 'price'
y_train, y_val, y_test = (
    train[target_column],
    val[target_column],
    test[target_column],
)

# Show the min/max of the target per split as a quick sanity check.
print(f"\nTarget variable ranges:")
print(f"Train: [{y_train.min():.2f}, {y_train.max():.2f}]")
print(f"Val:   [{y_val.min():.2f}, {y_val.max():.2f}]")
print(f"Test:  [{y_test.min():.2f}, {y_test.max():.2f}]")

## 2. Baseline Models Implementation

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Compute regression error metrics for a single forecast.

    Parameters
    ----------
    y_true : array-like
        Observed values (pandas Series, NumPy array or list).
    y_pred : array-like
        Forecast values, in the same order and of the same length as
        ``y_true``.
    model_name : str
        Label stored under the ``'Model'`` key of the result.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'MSE'``, ``'RMSE'``, ``'MAE'``, ``'R¬≤'`` and
        ``'MAPE'``; MAPE is expressed in percent.
    """
    # Work on plain float arrays so that every metric pairs values by
    # position. Previously MAPE called ``y_true.abs()`` (Series-only) and,
    # when both inputs were Series, aligned them by index label while the
    # scikit-learn metrics paired them positionally.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    mse = mean_squared_error(actual, forecast)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, forecast)
    r2 = r2_score(actual, forecast)

    # The small epsilon keeps the division finite when an observed value is
    # exactly zero; MAPE is still very large for near-zero observations.
    mape = np.mean(np.abs((actual - forecast) / (np.abs(actual) + 1e-8))) * 100

    # NOTE(review): the 'R¬≤' key looks like a mis-encoded 'R²'. It is kept
    # unchanged because the rest of the notebook indexes results by this key.
    return {
        'Model': model_name,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R¬≤': r2,
        'MAPE': mape
    }

### 2.1 Naive Model (Last Observation)

In [None]:
# Naive forecast: repeat the last training observation over the whole test
# horizon (a flat multi-step forecast, not a rolling "t+1 = t" update).
# NOTE(review): if the validation split lies between train and test in time,
# the most recent known price is y_val.iloc[-1] rather than y_train.iloc[-1]
# — confirm the split order.
last_observed_price = y_train.iloc[-1]
naive_test = np.full(len(y_test), last_observed_price, dtype=float)
naive_results = evaluate_model(y_test, naive_test, 'Naive')

# The lookup key keeps the spelling produced by evaluate_model; only the
# printed label is corrected to "R²".
print("Naive Model Results:")
print(f"  R²: {naive_results['R¬≤']:.4f}")
print(f"  RMSE: {naive_results['RMSE']:.2f}")
print(f"  MAE: {naive_results['MAE']:.2f}")

### 2.2 Seasonal Naive (Same Hour Yesterday)

In [None]:
# Seasonal naive: forecast each test step with the value at the same position
# in the final 24 training observations, repeated cyclically over the horizon.
# NOTE(review): assumes hourly rows and that the test set starts on the same
# hour-of-day as this 24-value window — confirm.
season_length = 24
last_24_train = y_train.iloc[-season_length:].values

# Ceiling division: enough whole cycles to cover the test set, then trim.
n_cycles = -(-len(y_test) // season_length)
seasonal_naive_test = np.tile(last_24_train, n_cycles)[:len(y_test)]

seasonal_naive_results = evaluate_model(y_test, seasonal_naive_test, 'Seasonal Naive (24h)')

# The lookup key keeps the spelling produced by evaluate_model; only the
# printed label is corrected to "R²".
print("Seasonal Naive Model Results:")
print(f"  R²: {seasonal_naive_results['R¬≤']:.4f}")
print(f"  RMSE: {seasonal_naive_results['RMSE']:.2f}")
print(f"  MAE: {seasonal_naive_results['MAE']:.2f}")

### 2.3 Moving Average (24-hour window)

In [None]:
# Moving average: the mean of the last 24 training observations, held
# constant across the whole test horizon.
ma_24_value = y_train.iloc[-24:].mean()
ma_test = np.full(len(y_test), ma_24_value, dtype=float)

ma_results = evaluate_model(y_test, ma_test, 'Moving Average (24h)')

# The lookup key keeps the spelling produced by evaluate_model; only the
# printed label is corrected to "R²".
print("Moving Average Model Results:")
print(f"  R²: {ma_results['R¬≤']:.4f}")
print(f"  RMSE: {ma_results['RMSE']:.2f}")
print(f"  MAE: {ma_results['MAE']:.2f}")

### 2.4 Drift Model

In [None]:
# Drift: extend the straight line through the first and last training
# observations, starting from the last training value.
# max(..., 1) avoids a division by zero when the training series has a single
# row; the numerator is zero in that case, so the slope is simply 0.
n_intervals = max(len(y_train) - 1, 1)
drift_slope = (y_train.iloc[-1] - y_train.iloc[0]) / n_intervals

# Step h of the test set is h intervals past the end of the training data.
steps_ahead = np.arange(1, len(y_test) + 1)
drift_test = y_train.iloc[-1] + drift_slope * steps_ahead

drift_results = evaluate_model(y_test, drift_test, 'Drift')

# The lookup key keeps the spelling produced by evaluate_model; only the
# printed label is corrected to "R²".
print("Drift Model Results:")
print(f"  R²: {drift_results['R¬≤']:.4f}")
print(f"  RMSE: {drift_results['RMSE']:.2f}")
print(f"  MAE: {drift_results['MAE']:.2f}")

### 2.5 Mean Model

In [None]:
# Mean: forecast every test step with the average of the full training series.
mean_value = y_train.mean()
mean_test = np.full(len(y_test), mean_value, dtype=float)

mean_results = evaluate_model(y_test, mean_test, 'Mean')

# The lookup key keeps the spelling produced by evaluate_model; only the
# printed label is corrected to "R²".
print("Mean Model Results:")
print(f"  R²: {mean_results['R¬≤']:.4f}")
print(f"  RMSE: {mean_results['RMSE']:.2f}")
print(f"  MAE: {mean_results['MAE']:.2f}")

## 3. Results Comparison

In [None]:
# Collect the metric dictionaries of all five baselines into one table and
# order it from highest to lowest R² so the best model is the first row.
baseline_metrics = [
    naive_results,
    seasonal_naive_results,
    ma_results,
    drift_results,
    mean_results,
]
results_df = pd.DataFrame(baseline_metrics).sort_values('R¬≤', ascending=False)

# Print the table framed by separator rules.
separator = "=" * 80
print("\n" + separator)
print("BASELINE MODELS COMPARISON")
print(separator)
print(results_df.to_string(index=False))
print(separator)

In [None]:
# Visualize metrics: one horizontal bar chart per metric on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (results_df column, x-axis label, title, bar colour), listed in row-major
# panel order. The 'R¬≤' column key keeps the spelling produced by
# evaluate_model; only the text shown on the figure is corrected to "R²".
metric_panels = [
    ('R¬≤', 'R² Score', 'R² Score by Model', 'steelblue'),
    ('RMSE', 'RMSE', 'RMSE by Model', 'coral'),
    ('MAE', 'MAE', 'MAE by Model', 'seagreen'),
    ('MAPE', 'MAPE (%)', 'MAPE by Model', 'purple'),
]
for panel_ax, (column, xlabel, title, color) in zip(axes.flat, metric_panels):
    panel_ax.barh(results_df['Model'], results_df[column], color=color, edgecolor='black')
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_title(title, fontweight='bold')
    panel_ax.grid(alpha=0.3, axis='x')

plt.tight_layout()

# Create the output folder before saving: savefig does not create missing
# directories and would otherwise fail on a fresh checkout.
figures_dir = Path('../../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'price_baseline_metrics.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Forecast Visualization

In [None]:
# Plot forecasts against actual prices for the first 7 days of the test set.
plot_days = 7
plot_hours = plot_days * 24  # NOTE(review): assumes one row per hour — confirm

window_index = y_test.index[:plot_hours]

fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(window_index, y_test.values[:plot_hours],
        linewidth=2, label='Actual', color='black', zorder=5)

# Overlay three of the baselines as dashed lines, in a fixed order so the
# colour cycle stays stable. Drift and Mean are not drawn in this figure.
baseline_forecasts = [
    ('Naive', naive_test),
    ('Seasonal Naive (24h)', seasonal_naive_test),
    ('Moving Average (24h)', ma_test),
]
for forecast_label, forecast_values in baseline_forecasts:
    ax.plot(window_index, forecast_values[:plot_hours],
            linewidth=1.5, label=forecast_label, alpha=0.7, linestyle='--')

# Horizontal reference line at a price of zero.
ax.axhline(0, color='red', linestyle='-', linewidth=1)
ax.set_title(f'Baseline Models Forecast - First {plot_days} Days', fontweight='bold', fontsize=14)
ax.set_xlabel('Date')
ax.set_ylabel('Price (EUR/MWh)')
ax.legend(loc='best')
ax.grid(alpha=0.3)
plt.tight_layout()

# Create the output folder before saving: savefig does not create missing
# directories and would otherwise fail on a fresh checkout.
figures_dir = Path('../../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'price_baseline_forecast.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Error Analysis

In [None]:
# Best baseline model: results_df is sorted by R² descending, so row 0 wins.
best_model_name = results_df.iloc[0]['Model']
best_r2 = results_df.iloc[0]['R¬≤']

# Map each model label to its test-set forecast. A direct lookup raises
# KeyError for an unknown label instead of silently analysing the Mean
# forecast under another model's name.
predictions_by_model = {
    'Naive': naive_test,
    'Seasonal Naive (24h)': seasonal_naive_test,
    'Moving Average (24h)': ma_test,
    'Drift': drift_test,
    'Mean': mean_test,
}
best_pred = predictions_by_model[best_model_name]

# Positive error means the model under-forecast the actual price.
errors = y_test.values - best_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, time_ax = axes

# Error distribution
hist_ax.hist(errors, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(0, color='red', linestyle='--', linewidth=2)
hist_ax.set_title(f'{best_model_name} - Error Distribution', fontweight='bold')
hist_ax.set_xlabel('Error (EUR/MWh)')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(alpha=0.3)

# Error over time
time_ax.plot(y_test.index, errors, linewidth=0.5, alpha=0.7)
time_ax.axhline(0, color='red', linestyle='--', linewidth=2)
time_ax.set_title(f'{best_model_name} - Error Over Time', fontweight='bold')
time_ax.set_xlabel('Date')
time_ax.set_ylabel('Error (EUR/MWh)')
time_ax.grid(alpha=0.3)

plt.tight_layout()

# Create the output folder before saving: savefig does not create missing
# directories and would otherwise fail on a fresh checkout.
figures_dir = Path('../../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'price_baseline_errors.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nError statistics for {best_model_name}:")
print(f"Mean error: {errors.mean():.2f}")
print(f"Std error: {errors.std():.2f}")
print(f"Min error: {errors.min():.2f}")
print(f"Max error: {errors.max():.2f}")

## 6. Save Results

In [None]:
# Save the comparison table as CSV, creating the metrics folder if needed.
results_dir = Path('../../results/metrics')
results_dir.mkdir(parents=True, exist_ok=True)

metrics_path = results_dir / 'price_baseline_metrics.csv'
results_df.to_csv(metrics_path, index=False)

# The confirmation message previously showed a mis-encoded check mark.
print(f"\n✅ Results saved to {metrics_path}")

## 7. Summary

In [None]:
# Print a text summary of the baseline comparison. Emoji and "R²" labels were
# mis-encoded in the printed output and are restored here; the 'R¬≤' column
# key keeps the spelling used by results_df.
print("="*80)
print("📋 PRICE BASELINE MODELS - SUMMARY")
print("="*80)

# results_df is sorted by R² descending, so the first row is the best model.
best_row = results_df.iloc[0]
print(f"\n🏆 BEST BASELINE MODEL: {best_model_name}")
print(f"   R²: {best_r2:.4f}")
print(f"   RMSE: {best_row['RMSE']:.2f} EUR/MWh")
print(f"   MAE: {best_row['MAE']:.2f} EUR/MWh")

print(f"\n📊 ALL MODELS:")
for _, row in results_df.iterrows():
    print(f"   {row['Model']:25s} R²={row['R¬≤']:7.4f}  RMSE={row['RMSE']:6.2f}")

# The best baseline score is the bar that later models have to clear.
print(f"\n🎯 BASELINE THRESHOLD:")
print(f"   Advanced models must achieve R² > {best_r2:.4f}")
print(f"   Expected R² for best models: 0.85 - 0.92 (price is challenging!)")

print("\n" + "="*80)
print("✅ Baseline models complete! Ready for statistical models.")
print("="*80)

## Next Steps

1. ✅ Data exploration
2. ✅ Data preprocessing  
3. ✅ Baseline models
4. ➡️ **Next:** `04_price_statistical_models.ipynb`
   - SARIMA
   - ETS (Exponential Smoothing)
5. 📊 Then: ML tree models and deep learning