In [1]:
import os
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import xgboost as xgb

# 1oad & prepare data
DATA_PATH = "../data/agrimonia_daily.csv"
df = pd.read_csv(DATA_PATH, parse_dates=["date"])
df = df.sort_values("date").set_index("date")

# feature engineering
# time features
df["dayofyear"] = df.index.dayofyear
df["month"]     = df.index.month
df["weekday"]   = df.index.weekday

# lag features (1, 2, 3 days)
for lag in [1, 2, 3]:
    df[f"pm25_lag{lag}"] = df["pm25"].shift(lag)

# rolling means (7-day, 14-day)
df["pm25_roll7"]  = df["pm25"].rolling(7).mean()
df["pm25_roll14"] = df["pm25"].rolling(14).mean()

# drop NaNs from lags/rollings
df = df.dropna()

# train/test split
y = df["pm25"]
X = df.drop(columns="pm25")

# use last 20% of time for test
split_idx = int(len(df)*0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# hyperparameter tuning with GridSearchCV
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth":    [3, 5, 7],
    "learning_rate":[0.01, 0.1, 0.2],
    "subsample":    [0.7, 1.0],
}

grid = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1,
    verbose=1
)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print("Best params:", grid.best_params_)

# evaluate on test set
y_pred = best_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2   = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R²:   {r2:.2f}")

# save model & metrics
os.makedirs("../models", exist_ok=True)
MODEL_PATH  = "../models/xgb_agrimonia_daily.joblib"
METRICS_PATH = "../data/metrics_agrimonia_daily.csv"

joblib.dump(best_model, MODEL_PATH)
pd.DataFrame([{"rmse": rmse, "r2": r2}]).to_csv(METRICS_PATH, index=False)

print(f"Model saved to {MODEL_PATH}")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}
Test RMSE: 2.71
Test R²:   0.95
Model saved to ../models/xgb_agrimonia_daily.joblib


