# Imports and configs

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from random import gauss
from random import seed
from pandas import Series
from pandas.plotting import autocorrelation_plot
from matplotlib import pyplot
from plotly.subplots import make_subplots
from statsmodels.graphics.tsaplots import plot_acf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA


In [None]:
# Location of the airline-passengers CSV that the rest of the notebook loads.
airline_passengers_data_path = (
    "https://storage.googleapis.com/edulabs-public-datasets/"
    "airline-passengers.csv"
)

# Load Data

- Load the passengers data as a Series with a datetime index
- Set the frequency of the index
- Check for NaNs and interpolate if needed

# Perform hold-out split

- All the data up to December 1957 will be used for training
- The rest will be used for testing

# Find ARIMA p, d, q orders

- Use plots and statistical tests to find p, d, q

# Find Seasonal P, D, Q orders

### Identify the **seasonal period(s)**

Ask: *how often do patterns repeat*?

| Data Type                        | Likely `s` |
| -------------------------------- | ---------- |
| Daily data, weekly seasonality   | 7          |
| Monthly data, yearly seasonality | 12         |
| Hourly data, daily seasonality   | 24         |

If unsure, use **ACF** to look for strong spikes at lags like 7, 12, etc.

---

### Seasonal **differencing (`D`)**

Check if seasonal differencing is needed:

```python
series.diff(s).plot()  # visually
```

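Or apply the ADF test to the original `series`, and then to `series.diff(s).dropna()`:

* If the original series is non-stationary but the seasonally differenced one is stationary → `D=1`
* If the original series is already stationary → `D=0`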

**Typical choice: `D=1`** if there's a seasonal trend.

### Use **seasonal ACF/PACF** to estimate `P` and `Q`

After differencing (if needed), plot the ACF/PACF of the seasonally differenced data:

```python
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

plot_acf(series.diff(s).dropna(), lags=40)
plot_pacf(series.diff(s).dropna(), lags=40)
```

Look for **spikes at seasonal lags** (e.g., 12, 24):

* A **PACF spike at lag `s`** → seasonal AR(1) → `P=1`
* An **ACF spike at lag `s`** → seasonal MA(1) → `Q=1`


### Try combinations and compare AIC/BIC

- Try a SARIMAX model
- Use `TimeSeriesSplit` cross-validation on the train data together with a grid search to find the best combination of parameters

```python
SARIMAX(series, order=(p,d,q), seasonal_order=(P,D,Q,s))
```


### 📌 Typical seasonal orders:

| Data                            | Common `(P,D,Q,s)` |
| ------------------------------- | ------------------ |
| Monthly with yearly seasonality | (1,1,1,12)         |
| Daily with weekly seasonality   | (1,1,1,7)          |

---

# Create rolling forecast

- Implement rolling forecast on hold-out (test) data
- Plot actual values vs predictions
- Plot residuals
- Calculate MAE and MAPE for the test set

