# Regression baseline (forecast PM2.5(t+h))

- Tạo target: `pm25_target = pm2.5(t+horizon)`
- Lag features: [1,3,24] (mặc định)
- Train/test split theo cutoff để tránh leakage
- Train baseline regressor + metrics (MAE/RMSE)


In [None]:
# Parameters (Papermill)
# The names below are the notebook's parameters; the values are the defaults.

# Input data and the station to model.
cleaned_path = "data/processed/cleaned.parquet"
station = "Aotizhongxin"

# Forecast setup: the target offset h in PM2.5(t+h), the lags used as
# features, and the cutoff used for the train/test split.
horizon = 1
lag_hours = [1, 3, 24]
cutoff = "2017-01-01"

# Output artifacts.
output_predictions_path = "data/processed/regression_predictions.parquet"
output_metrics_path = "data/processed/regression_metrics.json"


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from src.classification_library import load_parquet
from src.regression_library import RegressionConfig, prepare_regression_frame, time_split, train_baseline_rf, predict_and_eval, save_metrics


In [None]:
# Load the cleaned dataset and keep only the rows of the selected station.
df = load_parquet(cleaned_path)
df = df[df["station"] == station].copy()
if df.empty:
    # Fail fast with a clear message: a mistyped station name would otherwise
    # only surface downstream, far from the actual cause.
    raise ValueError(
        f"No rows found for station {station!r} in {cleaned_path!r}"
    )

cfg = RegressionConfig(cutoff=cutoff, horizon=horizon, lag_hours=tuple(lag_hours))

# Build the modelling frame and its feature column list, then split at the cutoff.
frame, feat_cols = prepare_regression_frame(df, cfg)
train, test = time_split(frame, cutoff)
if len(train) == 0 or len(test) == 0:
    # A cutoff outside the data's time range leaves one side empty, which
    # makes training or evaluation meaningless.
    raise ValueError(
        f"Cutoff {cutoff!r} produced an empty split "
        f"(train={len(train)} rows, test={len(test)} rows)"
    )

model = train_baseline_rf(train, feat_cols)
metrics, pred = predict_and_eval(model, test, feat_cols)

# Record the run configuration next to the scores so the saved metrics file
# is self-describing.
metrics.update({
    "station": station,
    "cutoff": cutoff,
    "horizon": horizon,
    "lag_hours": lag_hours,
})
save_metrics(metrics, output_metrics_path)
# Bare expression on the last line so the notebook displays the metrics.
metrics


In [None]:
# Save predictions (optional): test-set timestamps and targets with the
# model output attached as a "pred" column.
out = test[["datetime", "pm25_target"]].assign(pred=pred)
out.to_parquet(output_predictions_path, index=False)

# Quick plot (first 2 weeks)
# NOTE(review): this takes the first 24*14 rows in time order, which is two
# weeks only if rows are hourly with no gaps — confirm.
n_rows = 24 * 14
by_time = out.set_index("datetime").sort_index()
tmp = by_time.iloc[:n_rows]
ax = tmp["pm25_target"].plot(
    figsize=(12, 3),
    label="actual",
    title="Regression: Forecast vs Actual (first 2 weeks)",
)
tmp["pred"].plot(ax=ax, label="pred")
ax.legend()
plt.show()
