In [None]:
import gc
import glob
import os
import re
from datetime import datetime

import joblib
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# get functions from utils.py
from utils import eval_metrics,plot_train_test,train_data_ml

In [None]:
# Load the ARI dataset with explicit column dtypes; 'truth_date' is parsed as a date.
ari = pd.read_csv(
    "data_ari.csv",
    sep=",",
    dtype=dict(
        location=str,
        year_week=str,
        value=np.float32,
        relative_humidity_2m=np.float64,
        temperature_2m_max=np.float64,
        temperature_2m_min=np.float64,
    ),
    parse_dates=['truth_date'],
)
# NOTE(review): unlike the ILI load, no 'Unnamed: 0' column is dropped here --
# presumably data_ari.csv was saved without an index column; confirm.


In [None]:
# Load the ILI dataset with explicit column dtypes; 'truth_date' is parsed as a date.
ili = pd.read_csv("data_ili.csv",sep=",",dtype={'location':str,'year_week':str,
                                                'value':np.float32,'relative_humidity_2m':np.float64,
                                                'temperature_2m_max':np.float64,'temperature_2m_min':np.float64},
                                                parse_dates=['truth_date'])
# Drop the leftover index column if the CSV contains one. errors='ignore' keeps
# this cell working when the file was exported without an index (a plain drop
# would raise KeyError in that case).
ili = ili.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True)

In [None]:
# Empty accumulators for the per-location error metrics of each disease model.
# Despite the "mape" name, the tables hold MAE and RMSE.
metric_columns = ['location','model','prediction_window','mae','rmse']
mape_ari = pd.DataFrame(columns=metric_columns)
mape_ili = pd.DataFrame(columns=metric_columns)

In [None]:
def xgboost_function(train,test,country=None,model_name=None,mape=None):
    """Fit, tune, persist and evaluate a multi-target XGBoost model.

    Steps:
      1. Fit a default multi-output XGBoost model on every feature and keep the
         features whose importance, averaged over the per-target estimators,
         is at least 0.01 (all features are kept if none reaches the threshold).
      2. Tune hyper-parameters with a randomized search (20 candidates,
         5-split ``TimeSeriesSplit``, scored by negative MAE).
      3. Save the best estimator to
         ``models_xgboost/xgboost_model_{country}_{model_name}.joblib``
         (the directory is created when missing).
      4. Predict on ``test``, drop rows containing NaN and compute the error
         metrics of each prediction column against its target column.

    Args:
        train: DataFrame with the target columns 'value', 'week_mas_1',
            'week_mas_2', 'week_mas_3'; every other column is used as a feature.
            (Presumably 'week_mas_k' is the value k weeks ahead -- confirm
            against ``train_data_ml``.)
        test: DataFrame with the same columns as ``train``.
        country: Label stored on the model, used in the file name and in the
            'location' column of the metrics table.
        model_name: Label stored on the model, used in the file name and in the
            'model' column of the metrics table.
        mape: Metrics table accumulated so far (columns 'location', 'model',
            'prediction_window', 'mae', 'rmse'). ``None`` or an empty frame
            starts a new table. Despite the name it holds MAE and RMSE.

    Returns:
        Tuple ``(model_final, selected_features, test_aux, mape)``: the tuned
        estimator, the list of feature names it was trained on, the test rows
        with 'prediction_{1..4}_weeks' columns added (NaN rows removed), and
        the metrics table extended with four rows for this model.
    """
    target_columns = ['value','week_mas_1','week_mas_2','week_mas_3']
    metric_columns = ['location', 'model', 'prediction_window', 'mae', 'rmse']

    X = train.drop(columns=target_columns)
    y = train[target_columns]
    X_test = test.drop(columns=target_columns)

    # 1. Baseline fit, used only to rank the features.
    base_model =MultiOutputRegressor(XGBRegressor(objective='reg:squarederror', random_state=2332, n_jobs=-1))
    base_model.fit(X, y)

    # One importance vector per target; average them to get a single ranking.
    importances = np.array([est.feature_importances_ for est in base_model.estimators_])
    mean_importances = importances.mean(axis=0)

    selected_mask = mean_importances >= 0.01
    if not selected_mask.any():
        # No feature reaches the threshold (e.g. very many features, or trees
        # without splits): keep everything rather than fitting on zero columns.
        selected_mask = np.ones(len(mean_importances), dtype=bool)
    selected_features = X.columns[selected_mask].tolist()
    X_selected = X[selected_features]

    # 2. Hyper-parameter tuning on the selected features.
    param_grid = {
        "estimator__n_estimators": [50, 100, 200],
        "estimator__max_depth": [3, 4, 5, 10],
        "estimator__learning_rate": np.linspace(0.01, 0.1, 10),
        "estimator__subsample": [0.8, 1.0],
        "estimator__colsample_bytree": [0.7, 0.8, 1.0],       # feature subsampling
        "estimator__min_child_weight": [1, 3, 5],              # min data needed in a child
        "estimator__gamma": [0, 0.1, 0.3, 0.5],                # min loss reduction to split
        "estimator__reg_alpha": np.linspace(0, 1, 5),          # L1 regularization
        "estimator__reg_lambda": np.linspace(0, 1, 5),         # L2 regularization
    }
    # Time-ordered folds: each validation fold comes after its training rows.
    tscv = TimeSeriesSplit(n_splits=5)

    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_grid,
        n_iter=20,
        cv=tscv,
        n_jobs=-1,
        scoring='neg_mean_absolute_error',
        random_state=2332
    )
    random_search.fit(X_selected, y)

    # With the default refit=True the best estimator has already been refitted
    # on the full (X_selected, y), so no second fit is needed.
    model_final = random_search.best_estimator_
    # Metadata read back when the saved models are analysed later.
    model_final.selected_features_ = selected_features
    model_final.country_ = country
    model_final.model_name_ = model_name

    # 3. Persist the model; joblib's dump does not create missing directories.
    os.makedirs("models_xgboost", exist_ok=True)
    dump(model_final, f"models_xgboost/xgboost_model_{country}_{model_name}.joblib")

    # 4. Predict and evaluate.
    test_aux = test.copy()
    prediction_columns = [f"prediction_{h+1}_weeks" for h in range(4)]
    test_aux[prediction_columns] = model_final.predict(X_test[selected_features])
    # Drops every row with a NaN in ANY column (features included), not only
    # the rows whose targets are missing.
    test_aux = test_aux.dropna()

    # eval_metrics is unpacked as (mae, rmse); prediction column k is scored
    # against target column k.
    metric_rows = []
    for horizon, (target_col, pred_col) in enumerate(zip(target_columns, prediction_columns), start=1):
        mae, rmse = eval_metrics(test_aux[target_col], test_aux[pred_col])
        metric_rows.append([country, model_name, f"{horizon}_week", mae, rmse])
    new_metrics = pd.DataFrame(metric_rows, columns=metric_columns)

    if mape is None or len(mape) == 0:
        # Concatenating with an empty frame triggers a pandas FutureWarning and
        # can change the result dtypes; start the table from the new rows.
        mape = new_metrics
    else:
        mape = pd.concat([mape, new_metrics], ignore_index=True)
    return model_final, selected_features, test_aux,mape


In [None]:
# Reset both metric accumulators so the training loops below start from empty
# tables when this part of the notebook is re-run.
mape_ari, mape_ili = (
    pd.DataFrame(columns=['location','model','prediction_window','mae','rmse'])
    for _ in range(2)
)

In [None]:
# Distinct location values found in each dataset; one model is trained per entry.
name_ari = ari["location"].unique()
name_ili = ili["location"].unique()

In [None]:
# Train, evaluate and export one XGBoost model per ARI location.
# DataFrame.to_csv does not create missing directories, so make sure the
# results folder exists before the first export.
os.makedirs('resultados/xgboost', exist_ok=True)
for location in name_ari:
    print(f"Processing location: {location}")
    # Split this location's rows into train / test around the cutoff date.
    train, test = train_data_ml(ari,location, "2023-10-13")
    # 'location' is constant within the split and must not be used as a feature.
    train = train.drop(columns=['location'])
    test = test.drop(columns=['location'])
    # mape_ari is threaded through the call so the metrics accumulate across locations.
    model_final, selected_features, test_aux,mape_ari= xgboost_function(train,test,country =location, model_name='ARI', mape=mape_ari)
    test_aux.to_csv(f'resultados/xgboost/results_xgboost_{location}_ari.csv',index=False,sep=';',decimal=',')
    plot_train_test(train, test_aux,"ARI",location,'XGboost')

In [None]:
# Persist the accumulated ARI error metrics (one row per location and horizon).
mape_ari.to_csv(path_or_buf="mape_ari_xgboost.csv", index=False)

In [None]:
# Train, evaluate and export one XGBoost model per ILI location.
# DataFrame.to_csv does not create missing directories, so make sure the
# results folder exists before the first export.
os.makedirs('resultados/xgboost', exist_ok=True)
for location in name_ili:
    print(f"Processing location: {location}")
    # Split this location's rows into train / test around the cutoff date.
    train, test = train_data_ml(ili,location, "2023-10-13")
    # 'location' is constant within the split and must not be used as a feature.
    train = train.drop(columns=['location'])
    test = test.drop(columns=['location'])
    # mape_ili is threaded through the call so the metrics accumulate across locations.
    model_final, selected_features, test_aux,mape_ili= xgboost_function(train,test,country =location, model_name='ILI', mape=mape_ili)
    test_aux.to_csv(f'resultados/xgboost/results_xgboost_{location}_ili.csv',index=False,sep=';',decimal=',')
    plot_train_test(train, test_aux,"ILI",location,'XGboost')

In [None]:
# Persist the accumulated ILI error metrics (one row per location and horizon).
mape_ili.to_csv(path_or_buf="mape_ili_xgboost.csv", index=False)

In [None]:
# Collect the per-horizon feature importances of every saved XGBoost model
# into one long table (one row per model, horizon and feature).
importance_columns = ["country", "model_name", "horizon", "feature", "importance"]
rows = []

# glob returns paths in arbitrary order; sorting keeps the CSV row order reproducible.
for path in sorted(glob.glob("models_xgboost/xgboost_model_*_*.joblib")):
    # NOTE: joblib.load unpickles arbitrary objects -- only load model files
    # produced by this project.
    model = joblib.load(path)

    # Extract country and model name from filename if not stored in model.
    # The model name is the last '_'-separated token, so the country group is
    # greedy and may itself contain underscores.
    m = re.search(r"xgboost_model_(.+)_([^_]+)\.joblib$", path)
    country = getattr(model, "country_", m.group(1) if m else "UNK")
    model_name = getattr(model, "model_name_", m.group(2) if m else "UNK")

    # Recover selected features
    if hasattr(model, "selected_features_"):
        feats = model.selected_features_
    else:
        try:
            feats = list(model.estimators_[0].feature_names_in_)
        except Exception as exc:
            raise ValueError(f"No feature names available for {path}. "
                             "Please re-save with .selected_features_") from exc

    # Extract feature importances for each horizon (one estimator per target).
    for h_idx, est in enumerate(model.estimators_, start=1):
        imps = est.feature_importances_
        if len(imps) != len(feats):
            # zip() would silently truncate and mislabel the importances.
            raise ValueError(f"{path}: {len(feats)} feature names but "
                             f"{len(imps)} importances for horizon {h_idx}")
        rows.extend([
            {"country": country, "model_name": model_name,
             "horizon": h_idx, "feature": f, "importance": imp}
            for f, imp in zip(feats, imps)
        ])

# Explicit columns keep the table schema stable even when no model was found,
# so downstream cells can still select 'model_name', 'feature', etc.
all_imp = pd.DataFrame(rows, columns=importance_columns)
all_imp.to_csv("xgboost_feature_importances.csv", index=False,sep=';',decimal=',')

In [None]:
# Add disease column
def extract_disease(name):
    """Map a model name to its disease label.

    Args:
        name: Model name; matching is case-insensitive and substring based.

    Returns:
        "ARI" if the name contains "ari", otherwise "ILI" if it contains
        "ili", otherwise "UNK". Non-string values (e.g. ``None`` or NaN from a
        model saved without a name) also map to "UNK" instead of raising.
    """
    if not isinstance(name, str):
        return "UNK"
    lowered = name.lower()
    if "ari" in lowered:
        return "ARI"
    if "ili" in lowered:
        return "ILI"
    return "UNK"

# Label every importance row with the disease derived from its model name.
all_imp["disease"] = all_imp["model_name"].map(extract_disease)

# Mean and standard deviation of the importance of each feature within a
# disease, pooled over all models and horizons of that disease.
importance_by_feature = all_imp.groupby(["disease", "feature"])["importance"]
feat_stats = importance_by_feature.agg(["mean", "std"]).reset_index()

# Rank the features of each disease by descending mean importance and keep
# the first ten rows per disease.
ranked_stats = feat_stats.sort_values(["disease", "mean"], ascending=[True, False])
top10 = ranked_stats.groupby("disease").head(10)

print(top10)


In [None]:

def _plot_top10_importances(disease_stats, title, out_path):
    """Plot and save the ten features with the highest mean importance.

    Args:
        disease_stats: Rows of ``feat_stats`` for a single disease; needs the
            columns 'feature', 'mean' and 'std'.
        title: Figure title.
        out_path: File the figure is saved to (300 dpi).

    Returns:
        The plotted rows, sorted by ascending mean so the largest bar is drawn
        at the top of the horizontal bar chart.
    """
    top_features = disease_stats.sort_values("mean", ascending=True).tail(10)   # top 10

    plt.figure(figsize=(8,6))
    # Error bars show the standard deviation of each feature's importance.
    plt.barh(top_features["feature"], top_features["mean"], xerr=top_features["std"])
    plt.xlabel("Mean Importance")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=300)
    plt.show()
    return top_features


# ---- ARI ----
feat_stats_ari = _plot_top10_importances(
    feat_stats[feat_stats["disease"] == "ARI"],
    "Top 10 Gradient Boosting Features – ARI",
    "xgboost_feature_importance_ARI.png",
)


# ---- ILI ----
feat_stats_ili = _plot_top10_importances(
    feat_stats[feat_stats["disease"] == "ILI"],
    "Top 10 Gradient Boosting Features – ILI",
    "xgboost_feature_importance_ILI.png",
)
