# TSA Chapter 10: SARIMA Model Selection

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/QuantLet/TSA/blob/main/TSA_ch10_sarima_model_selection/TSA_ch10_sarima_model_selection.ipynb)

This notebook demonstrates:
- Comparing multiple ARIMA/SARIMA specifications via validation RMSE to select the best model for unemployment forecasting.

In [None]:
!pip install pandas-datareader statsmodels matplotlib numpy pandas -q

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas_datareader as pdr
import os, warnings
warnings.filterwarnings('ignore')

In [None]:
# Shared palette and matplotlib defaults used by every chart in this notebook
COLORS = {
    'blue': '#1A3A6E',
    'red': '#DC3545',
    'green': '#2E7D32',
    'orange': '#E67E22',
    'gray': '#666666',
    'purple': '#8E44AD',
}

plt.rcParams.update({
    # Transparent backgrounds, both on screen and in saved files
    'figure.facecolor': 'none',
    'axes.facecolor': 'none',
    'savefig.facecolor': 'none',
    'savefig.transparent': True,
    # Minimal frame: no top/right spines, no grid
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': False,
    # Font sizes
    'font.size': 10,
    'axes.titlesize': 12,
    'axes.labelsize': 10,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 9,
    # Resolution and stroke widths
    'figure.dpi': 150,
    'lines.linewidth': 1.2,
    'axes.linewidth': 0.6,
    # Frameless, transparent legend
    'legend.facecolor': 'none',
    'legend.framealpha': 0,
    'legend.edgecolor': 'none',
})

def save_chart(fig, name):
    """Save ``fig`` as ``<name>.pdf`` and ``<name>.png``, plus a best-effort mirror copy.

    Parameters
    ----------
    fig
        Object exposing ``savefig(path, **kwargs)`` (a matplotlib figure in
        this notebook).
    name : str
        Base file name without extension.

    The two files written relative to the current working directory are
    mandatory: any error from those calls propagates. The second pair, under
    ``../../../charts``, is optional: an ``OSError`` there (for example a
    missing directory) is reported on stdout and otherwise ignored, so the
    notebook keeps running outside the original repository layout.
    """
    save_kwargs = {'bbox_inches': 'tight', 'transparent': True, 'dpi': 150}
    extensions = ('pdf', 'png')

    for ext in extensions:
        fig.savefig(f'{name}.{ext}', **save_kwargs)

    charts_path = os.path.join('..', '..', '..', 'charts', name)
    try:
        for ext in extensions:
            fig.savefig(f'{charts_path}.{ext}', **save_kwargs)
    except OSError as exc:
        # Only filesystem problems are expected here; say why the mirror copy
        # was skipped instead of hiding it.
        print(f'Skipped charts copy ({charts_path}): {exc}')
    print(f'Saved: {name}.pdf + .png')

def legend_outside(ax, ncol=3, y=-0.18):
    """Attach a frameless legend to ``ax``, anchored by its top centre at (0.5, ``y``).

    With the default negative ``y`` the anchor point lies under the plotting
    area. ``ncol`` is the number of legend columns.
    """
    anchor = (0.5, y)
    ax.legend(
        loc='upper center',
        bbox_to_anchor=anchor,
        ncol=ncol,
        frameon=False,
    )

In [None]:
# Fetch the FRED series 'UNRATE' and keep it as a single pandas Series
unemp = pdr.get_data_fred('UNRATE', start='2010-01-01', end='2025-01-15')
unemp_series = unemp['UNRATE']
print(f'Data: {len(unemp_series)} observations')

# Chronological 70% / 20% / 10% split; boundaries are inclusive date strings
train_end, val_start = '2020-06-01', '2020-07-01'
val_end, test_start = '2023-06-01', '2023-07-01'

obs_dates = unemp_series.index
train_data = unemp_series[obs_dates <= train_end]
val_data = unemp_series[(obs_dates >= val_start) & (obs_dates <= val_end)]
test_data = unemp_series[obs_dates >= test_start]
# Training and validation periods combined (everything up to val_end)
train_val_data = unemp_series[obs_dates <= val_end]

print(f'Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}')

# Compare multiple ARIMA/SARIMA models.
# Each entry is (order, seasonal_order, display name); a seasonal_order of
# (0,0,0,0) means "no seasonal component".
models_to_try = [
    ((1,1,0), (0,0,0,0), 'ARIMA(1,1,0)'),
    ((1,1,1), (0,0,0,0), 'ARIMA(1,1,1)'),
    ((2,1,1), (0,0,0,0), 'ARIMA(2,1,1)'),
    ((2,1,2), (0,0,0,0), 'ARIMA(2,1,2)'),
    ((1,1,1), (1,0,0,12), 'SARIMA(1,1,1)(1,0,0)'),
    ((1,1,1), (1,0,1,12), 'SARIMA(1,1,1)(1,0,1)'),
]

# Fit every candidate on the training window, forecast the whole validation
# window in one shot, and record validation RMSE and in-sample AIC.
results_list = []
for order, seasonal, name in models_to_try:
    try:
        # (0,0,0,0) is SARIMAX's default seasonal_order, so passing it
        # explicitly is equivalent to omitting it; no branching needed.
        m = SARIMAX(train_data, order=order, seasonal_order=seasonal,
                    enforce_stationarity=False, enforce_invertibility=False)
        r = m.fit(disp=False)
        val_fc = r.get_forecast(steps=len(val_data))
        val_rmse = np.sqrt(np.mean((val_data.values - val_fc.predicted_mean.values)**2))
    except Exception as exc:
        # Best-effort per model: report why it failed and move on. Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        print(f'{name}: FAILED ({type(exc).__name__}: {exc})')
        continue

    if not np.isfinite(val_rmse):
        # np.argmin would select a NaN score as the "best" one, so a
        # non-finite RMSE is treated as a failed candidate.
        print(f'{name}: FAILED (non-finite validation RMSE)')
        continue

    results_list.append({'name': name, 'val_rmse': val_rmse, 'aic': r.aic})
    print(f'{name}: Val RMSE = {val_rmse:.4f}, AIC = {r.aic:.2f}')

if not results_list:
    # Same exception type np.argmin raises on an empty list, with a clearer message.
    raise ValueError('Model selection failed: no candidate model produced a usable validation RMSE')

# Position (within results_list) of the model with the lowest validation RMSE
best_idx = int(np.argmin([res['val_rmse'] for res in results_list]))

# Bar chart of validation RMSE per candidate; the winning model is drawn in green
names = [rec['name'] for rec in results_list]
rmse_vals = [rec['val_rmse'] for rec in results_list]
x = np.arange(len(names))

fig, ax = plt.subplots(figsize=(9, 4))
bars = ax.bar(x, rmse_vals, color=COLORS['orange'], alpha=0.8)
bars[best_idx].set_color(COLORS['green'])

ax.set_title(
    'Model Selection: Fit on Training (70%), Evaluate on Validation (20%)',
    fontweight='bold',
    fontsize=12,
)
ax.set_ylabel('Validation RMSE')
ax.set_xticks(x)
ax.set_xticklabels(names, rotation=20, ha='right', fontsize=9)

# Print each RMSE value just above its bar
for i, v in enumerate(rmse_vals):
    label = f'{v:.3f}'
    ax.text(i, v + 0.01, label, ha='center', fontsize=9)

fig.tight_layout()
save_chart(fig, 'sarima_model_selection')
plt.show()