In [None]:
import gc
import glob
import os
import re
from datetime import datetime

import joblib
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor

# get functions from utils.py
from utils import eval_metrics,plot_train_test,train_data_ml

In [None]:
# Load the ARI table: identifier columns stay as strings, the numeric columns
# get explicit float widths, and `truth_date` is parsed as a datetime.
ari_dtypes = {
    'location': str,
    'year_week': str,
    'value': np.float32,
    'relative_humidity_2m': np.float64,
    'temperature_2m_max': np.float64,
    'temperature_2m_min': np.float64,
}
ari = pd.read_csv("data_ari.csv", sep=",", dtype=ari_dtypes, parse_dates=['truth_date'])

In [None]:
# Load the ILI table with the same column typing as the ARI file, then drop the
# 'Unnamed: 0' column (presumably a saved index -- confirm against the CSV) and
# renumber the rows from zero.
ili_dtypes = {
    'location': str,
    'year_week': str,
    'value': np.float32,
    'relative_humidity_2m': np.float64,
    'temperature_2m_max': np.float64,
    'temperature_2m_min': np.float64,
}
ili = (
    pd.read_csv("data_ili.csv", sep=",", dtype=ili_dtypes, parse_dates=['truth_date'])
    .drop(columns=['Unnamed: 0'])
    .reset_index(drop=True)
)

In [None]:
# Two independent, empty metric tables (one per dataset) that the training
# function appends its per-horizon MAE/RMSE rows to.
mape_ari, mape_ili = (
    pd.DataFrame(columns=['location','model','prediction_window','mae','rmse'])
    for _ in range(2)
)

In [None]:
def rf_variable_selection_and_hyperparam_tuning(train,test,country =None, model_name=None, mape=None,
                                                importance_threshold=0.01, model_dir="models_rf"):
    """Select features, tune and fit a multi-horizon Random Forest, then score it.

    Pipeline:
      1. Fit a baseline ``MultiOutputRegressor(RandomForestRegressor)`` on all
         features and average the per-target feature importances.
      2. Keep the features whose mean importance is >= ``importance_threshold``.
      3. Tune a ``RandomForestRegressor`` with ``RandomizedSearchCV`` over
         ``TimeSeriesSplit(n_splits=5)`` folds (scoring: negative MAE).
      4. Refit the best estimator per target via ``MultiOutputRegressor``,
         persist it with joblib, predict on ``test`` and compute MAE/RMSE per
         horizon with ``eval_metrics``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must contain the target columns ``value``, ``week_mas_1``,
        ``week_mas_2`` and ``week_mas_3``; every other column is used as a
        feature. (The ``week_mas_k`` columns are presumably the value k weeks
        ahead -- confirm against ``train_data_ml`` in utils.py.)
    country : optional
        Location label; stored on the model, used in the saved file name and
        in the metric rows.
    model_name : optional
        Model/dataset label; stored on the model, used in the saved file name
        and in the metric rows.
    mape : pandas.DataFrame or None
        Accumulator of metric rows with columns
        ``location, model, prediction_window, mae, rmse``. ``None`` or an
        empty frame starts a fresh table.
    importance_threshold : float, default 0.01
        Minimum mean importance a feature needs to be kept.
    model_dir : str, default "models_rf"
        Directory the fitted model is written to; created if missing.

    Returns
    -------
    tuple
        ``(model_final, selected_features, pred_df, mape)`` where ``pred_df``
        holds the four prediction columns plus the observed ``value`` indexed
        like ``test``, and ``mape`` is the accumulator with four rows appended.
    """
    target_cols = ['value','week_mas_1','week_mas_2','week_mas_3']
    metric_cols = ['location', 'model', 'prediction_window', 'mae', 'rmse']

    X = train.drop(columns=target_cols)
    y = train[target_cols]

    X_test = test.drop(columns=target_cols)
    y_test = test[target_cols]

    # Step 1: baseline forest (one per target) to rank the features.
    base_rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=2332))
    base_rf.fit(X, y)

    # Rows = targets, columns = features; average across targets.
    importances = np.array([est.feature_importances_ for est in base_rf.estimators_])
    mean_importances = importances.mean(axis=0)

    # Step 2: threshold-based feature selection.
    selected_mask = mean_importances >= importance_threshold
    if not selected_mask.any():
        # With many weak features nothing may reach the threshold; fitting on a
        # zero-column frame would raise, so fall back to the full feature set.
        selected_mask = np.ones_like(selected_mask, dtype=bool)
    selected_features = X.columns[selected_mask].tolist()
    X_selected = X[selected_features]

    # Step 3: Hyperparameter tuning with RandomizedSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': [5,10, 20, 30, None],
        'min_samples_split': [2,4, 5, 10],
        'min_samples_leaf': [1, 2, 4,5],
        'max_features': ['sqrt', 'log2']
    }

    tscv = TimeSeriesSplit(n_splits=5)
    rf = RandomForestRegressor(random_state=2332)

    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_grid,
        n_iter=20,
        cv=tscv,
        n_jobs=-1,
        scoring='neg_mean_absolute_error',
        random_state=2332
    )

    random_search.fit(X_selected, y)

    # Step 4: refit the tuned configuration as one forest per target so that
    # per-horizon feature importances are available on `estimators_`.
    best_model = random_search.best_estimator_
    model_final = MultiOutputRegressor(best_model)
    model_final.fit(X_selected, y)
    # Metadata read back later when the saved models are reloaded.
    model_final.selected_features_ = selected_features
    model_final.country_ = country
    model_final.model_name_ = model_name

    # Create the output directory first: dump() does not create it.
    os.makedirs(model_dir, exist_ok=True)
    dump(model_final, os.path.join(model_dir, f"rf_model_{country}_{model_name}.joblib"))

    test_aux = test.copy()
    prediction_columns = [f"prediction_{h+1}_weeks" for h in range(len(target_cols))]
    preds = model_final.predict(X_test[selected_features])
    test_aux[prediction_columns] = preds
    pred_df = pd.DataFrame(preds, index=X_test.index, columns=prediction_columns)
    pred_df['value'] = y_test['value'].values
    print(pred_df.head())

    # Evaluate predictions. dropna() removes every row with a missing value in
    # ANY column, so all four horizons are scored on the same set of rows.
    test_aux = test_aux.dropna()
    metric_rows = []
    for horizon, (target_col, pred_col) in enumerate(zip(target_cols, prediction_columns), start=1):
        mae, rmse = eval_metrics(test_aux[target_col], test_aux[pred_col])
        metric_rows.append([country, model_name, f"{horizon}_week", mae, rmse])
    new_metrics = pd.DataFrame(metric_rows, columns=metric_cols)

    # Concatenating an empty frame is deprecated in recent pandas (it affects
    # dtype inference), so an empty/None accumulator is simply replaced.
    if mape is None or mape.empty:
        mape = new_metrics
    else:
        mape = pd.concat([mape, new_metrics], ignore_index=True)
    return model_final, selected_features, pred_df,mape


In [None]:
# Reset both metric tables to empty frames so the per-location loops that
# follow always start accumulating from scratch, even when cells are re-run.
metric_columns = ['location', 'model', 'prediction_window', 'mae', 'rmse']
mape_ari = pd.DataFrame(columns=metric_columns)
mape_ili = pd.DataFrame(columns=metric_columns)

In [None]:
# Distinct `location` values of each dataset; one model is trained per entry.
name_ari = ari["location"].unique()
name_ili = ili["location"].unique()

In [None]:
# For every ARI location: split, tune/fit/evaluate the Random Forest, append the
# metrics to `mape_ari`, then write the predictions to CSV and plot them.
for i in name_ari:
    print(f"Processing location: {i}")
    # "2023-10-13" is handed to train_data_ml (presumably the train/test cut-off
    # date -- confirm in utils.py). `location` is constant within a split and
    # is dropped so it is not used as a feature.
    train, test = (
        part.drop(columns=['location'])
        for part in train_data_ml(ari, i, "2023-10-13")
    )
    model_final, selected_features, test_aux, mape_ari = rf_variable_selection_and_hyperparam_tuning(
        train,
        test,
        country=i,
        model_name='ARI',
        mape=mape_ari,
    )
    test_aux.to_csv(f'results_rf_{i}_ari.csv', index=False)
    plot_train_test(train, test_aux, "ARI", i, 'RF')

In [None]:
# Persist the accumulated ARI error metrics (row index omitted).
mape_ari.to_csv(path_or_buf="mape_ari_rf.csv", index=False)

In [None]:
# Show the accumulated ARI metrics table as the cell output.
mape_ari

In [None]:
# For every ILI location: split, tune/fit/evaluate the Random Forest, append the
# metrics to `mape_ili`, then write the predictions to CSV and plot them.
for i in name_ili:
    print(f"Processing location: {i}")
    # "2023-10-13" is handed to train_data_ml (presumably the train/test cut-off
    # date -- confirm in utils.py). `location` is constant within a split and
    # is dropped so it is not used as a feature.
    train, test = (
        part.drop(columns=['location'])
        for part in train_data_ml(ili, i, "2023-10-13")
    )
    model_final, selected_features, test_aux, mape_ili = rf_variable_selection_and_hyperparam_tuning(
        train,
        test,
        country=i,
        model_name='ILI',
        mape=mape_ili,
    )
    test_aux.to_csv(f'results_rf_{i}_ili.csv', index=False)
    plot_train_test(train, test_aux, "ILI", i, 'RF')

In [None]:
# Show the accumulated ILI metrics table as the cell output.
mape_ili

In [None]:
# Persist the accumulated ILI error metrics (row index omitted).
mape_ili.to_csv(path_or_buf="mape_ili_rf.csv", index=False)

In [None]:
# Gather the per-horizon feature importances of every saved RF model into one
# long-format table (one row per model x horizon x feature) and write it to CSV.
# NOTE(security): joblib.load unpickles arbitrary Python objects -- only load
# model files that were produced by this notebook / a trusted source.
importance_columns = ["country", "model_name", "horizon", "feature", "importance"]
rows = []

# glob returns files in filesystem order; sorting keeps the CSV reproducible.
for path in sorted(glob.glob("models_rf/rf_model_*_*.joblib")):
    model = joblib.load(path)

    # Extract country and model name from filename if not stored in model.
    # The first group is greedy and the model name is the last underscore-free
    # token, so a location that itself contains underscores is kept intact
    # (the model names written by this notebook, 'ARI'/'ILI', have none).
    m = re.search(r"rf_model_(.+)_([^_]+)\.joblib$", path)
    country = getattr(model, "country_", m.group(1) if m else "UNK")
    model_name = getattr(model, "model_name_", m.group(2) if m else "UNK")

    # Recover selected features
    if hasattr(model, "selected_features_"):
        feats = model.selected_features_
    else:
        try:
            feats = list(model.estimators_[0].feature_names_in_)
        except Exception as exc:
            # Chain the original error so the root cause stays visible.
            raise ValueError(f"No feature names available for {path}. "
                             "Please re-save with .selected_features_") from exc

    # Extract feature importances for each horizon (1-based: one fitted
    # estimator per target column).
    for h_idx, est in enumerate(model.estimators_, start=1):
        imps = est.feature_importances_
        # zip() would silently truncate on a mismatch and mislabel importances.
        if len(imps) != len(feats):
            raise ValueError(
                f"{path}: {len(feats)} feature names but {len(imps)} importances "
                f"for horizon {h_idx}"
            )
        rows.extend([
            {"country": country, "model_name": model_name,
             "horizon": h_idx, "feature": f, "importance": imp}
            for f, imp in zip(feats, imps)
        ])

# Explicit columns keep the schema (and CSV header) even when no model was found.
all_imp = pd.DataFrame(rows, columns=importance_columns)
all_imp.to_csv("rf_feature_importances.csv", index=False)

In [None]:
def extract_disease(name):
    """Map a model name to its disease label.

    The match is a case-insensitive substring test: names containing "ari"
    give "ARI" (checked first), names containing "ili" give "ILI", anything
    else gives "UNK".

    Non-string input (e.g. ``None`` or NaN, which can occur when a model was
    saved without a model name) also yields "UNK" instead of raising.
    """
    if not isinstance(name, str):
        return "UNK"
    lowered = name.lower()
    if "ari" in lowered:
        return "ARI"
    if "ili" in lowered:
        return "ILI"
    return "UNK"

# Label every importance row with the disease derived from its model name.
all_imp["disease"] = all_imp["model_name"].map(extract_disease)

# Mean and standard deviation of the importance for each (disease, feature)
# pair, pooled over all rows of `all_imp` that share that pair.
importance_by_feature = all_imp.groupby(["disease", "feature"])["importance"]
feat_stats = importance_by_feature.agg(["mean", "std"]).reset_index()

# Order by disease, then by descending mean importance, and keep the first
# ten rows of every disease group.
ranked_stats = feat_stats.sort_values(by=["disease", "mean"], ascending=[True, False])
top10 = ranked_stats.groupby("disease").head(10)

print(top10)


In [None]:
import matplotlib.pyplot as plt


def _plot_top_features(stats, disease, title, filename):
    """Plot the ten highest mean importances of one disease as horizontal bars.

    Filters ``stats`` to ``disease``, sorts ascending by ``mean`` and keeps the
    last ten rows, so the largest bar ends up at the top of the chart. The
    ``std`` column is drawn as the error bar. The figure is saved to
    ``filename`` at 300 dpi and then shown. Returns the filtered frame.
    """
    top_rows = (
        stats[stats["disease"] == disease]
        .sort_values("mean", ascending=True)
        .tail(10)   # top 10
    )

    plt.figure(figsize=(8,6))
    plt.barh(top_rows["feature"], top_rows["mean"], xerr=top_rows["std"])
    plt.xlabel("Mean Importance")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename, dpi=300)
    plt.show()
    return top_rows


# ---- ARI ----
feat_stats_ari = _plot_top_features(
    feat_stats, "ARI",
    "Top 10 Random Forest Features – ARI",
    "rf_feature_importance_ARI.png",
)

# ---- ILI ----
feat_stats_ili = _plot_top_features(
    feat_stats, "ILI",
    "Top 10 Random Forest Features – ILI",
    "rf_feature_importance_ILI.png",
)
