# ARIMA forecasting (1-step ahead)

- Chuẩn bị chuỗi pm2.5 hourly
- ADF/KPSS quyết định d
- ACF/PACF hỗ trợ chọn p,q (trực quan)
- Grid search p,q nhỏ theo AIC
- Rolling 1-step forecast trên test + MAE/RMSE
- Vẽ Forecast vs Actual + ACF của residual


In [None]:
# Parameters (Papermill)
# Default values for a standalone run; the header above marks this as the
# cell whose values Papermill is expected to override.

# Input dataset, read with load_parquet.
cleaned_path = "data/processed/cleaned.parquet"
# Value matched against the "station" column to select a single series.
station = "Aotizhongxin"
# Date string used to slice the series into train and test parts.
cutoff = "2017-01-01"
# Only recorded in the metrics output; the forecasting call used in this
# notebook is rolling_forecast_1step regardless of this value.
horizon = 1
# Upper bounds handed to grid_search_arima as p_max / q_max.
p_max = 3
q_max = 3
# Output locations for the metrics (save_metrics) and the forecast table
# (DataFrame.to_parquet).
output_metrics_path = "data/processed/arima_metrics.json"
output_predictions_path = "data/processed/arima_predictions.parquet"


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from src.classification_library import load_parquet
from src.timeseries_library import choose_d_from_tests, grid_search_arima, rolling_forecast_1step, eval_forecast, save_metrics


In [None]:
# Load the cleaned data and keep only the rows of the selected station.
df = load_parquet(cleaned_path)
d = df[df["station"] == station].copy()
d["datetime"] = pd.to_datetime(d["datetime"])

# Build a regular hourly pm2.5 series; asfreq inserts NaN for missing hours.
s = d.sort_values("datetime").set_index("datetime")["pm2.5"].asfreq("H")

# Lightly handle missing values so the ARIMA fit is more stable:
# at most 3 consecutive NaNs are filled, longer gaps stay NaN and are dropped.
s = s.interpolate(limit=3)

# Split on an explicit timestamp with a strict boundary.
# Label slicing with a date-only string (s.loc[:cutoff] / s.loc[cutoff:]) is
# inclusive on both ends and expands to the whole day on an hourly index, so
# every observation of the cutoff day would end up in BOTH train and test.
cutoff_ts = pd.Timestamp(cutoff)
if s.index.tz is not None and cutoff_ts.tzinfo is None:
    # Keep the comparison valid if the index happens to be timezone-aware.
    cutoff_ts = cutoff_ts.tz_localize(s.index.tz)

train = s[s.index < cutoff_ts].dropna()
test = s[s.index >= cutoff_ts].dropna()

print("Train points:", len(train), "Test points:", len(test))


In [None]:
# Pick the differencing order d from the training series, trying at most d=2.
# The helper hands back the chosen order together with a log of its decisions
# (presumably the ADF/KPSS results mentioned in the notebook outline).
d_selection = choose_d_from_tests(train, max_d=2)
d_chosen, d_logs = d_selection
(d_chosen, d_logs)


In [None]:
# Correlograms of the training series, used as a visual aid when picking p and q.
# Each plot is shown on its own figure: first the ACF, then the PACF ("ywm" method).
max_lag = 60
for draw_correlogram, extra_kwargs in (
    (plot_acf, {}),
    (plot_pacf, {"method": "ywm"}),
):
    draw_correlogram(train, lags=max_lag, **extra_kwargs)
    plt.show()


In [None]:
# Search ARIMA orders on the training series with d fixed to the value chosen
# above and p / q bounded by the notebook parameters. The result is used below
# through its "order", "aic" and "model" entries.
search_bounds = {"p_max": p_max, "q_max": q_max}
best = grid_search_arima(train, d=d_chosen, **search_bounds)
(best["order"], best["aic"])


In [None]:
# Rolling 1-step forecast over the test period using the selected order.
best_order = best["order"]
pred = rolling_forecast_1step(train, test, order=best_order)

# Run context stored next to the error metrics so the output file is
# self-describing. NOTE(review): `horizon` is only recorded here; it does not
# influence the forecast, which is always produced by rolling_forecast_1step.
run_context = dict(
    station=station,
    cutoff=cutoff,
    horizon=horizon,
    order=best_order,
    aic=best["aic"],
    d_decision_log=d_logs,
)

metrics = eval_forecast(test, pred)
metrics.update(run_context)
save_metrics(metrics, output_metrics_path)
metrics


In [None]:
# Save predictions (optional): one row per forecast timestamp, with the
# matching observed value looked up from the test series.
forecast_times = pred.index
out = pd.DataFrame(
    {
        "datetime": forecast_times,
        "actual": test.loc[forecast_times].values,
        "forecast": pred.values,
    }
)
out.to_parquet(output_predictions_path, index=False)

# Forecast vs Actual, limited to the first 24*14 rows (two weeks of hourly points).
n_points = 24 * 14
first_two_weeks = out.set_index("datetime").sort_index().head(n_points)
ax = first_two_weeks["actual"].plot(
    figsize=(12, 3),
    label="actual",
    title="ARIMA: Forecast vs Actual (first 2 weeks)",
)
first_two_weeks["forecast"].plot(ax=ax, label="forecast")
ax.legend()
plt.show()


In [None]:
# Residual diagnostics: ACF of the in-sample residuals of the selected model
# (the one fitted on the training series), after dropping missing values.
fitted_model = best["model"]
resid = fitted_model.resid.dropna()
plot_acf(resid, lags=60)
plt.show()
