In [48]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
import pmdarima as pm
from pmdarima import auto_arima,arima
import warnings
# get functions from utils.py
from utils import train_data,eval_metrics,plot_train_test
from statsmodels.tsa.statespace.sarimax import SARIMAX
import joblib
from joblib import dump, load
import gc
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.multioutput import MultiOutputRegressor

In [49]:
ari = pd.read_csv("data_ari.csv",sep=",",dtype={'location':str,'year_week':str,
                                                'value':np.float32,'relative_humidity_2m':np.float64,
                                                'temperature_2m_max':np.float64,'temperature_2m_min':np.float64},
                                                parse_dates=['truth_date'])


In [50]:
ili = pd.read_csv("data_ili.csv",sep=",",dtype={'location':str,'year_week':str,
                                                'value':np.float32,'relative_humidity_2m':np.float64,
                                                'temperature_2m_max':np.float64,'temperature_2m_min':np.float64},
                                                parse_dates=['truth_date'])
ili = ili.drop(columns=['Unnamed: 0']).reset_index(drop=True)

In [51]:
mape_ari = pd.DataFrame(columns=['location','model','prediction_window','mae','rmse'])
mape_ili = pd.DataFrame(columns=['location','model','prediction_window','mae','rmse'])

In [52]:
def forecast_arima_sarima_model(train, test, mape,model,order_model,seasonal_order_model, model_name="no_model_def", country="no_country_def",exogenous_var=None):
    test_aux = test.copy()

    # Prepare prediction columns
    for h in range(4):
        test_aux[f"prediction_{h+1}_weeks"] = np.nan

    # Rolling forecast
    for i in range(len(test_aux)):
        # Combine train and observed test values so far
        train_series = pd.concat([train["value"], test_aux.iloc[:i]["value"]])
        if exogenous_var is not None:
            exog_train = pd.concat([train[exogenous_var], test_aux.iloc[:i][exogenous_var]])
            exog_forecast = test_aux.iloc[i:i+4][exogenous_var]
        else:
            exog_train = None
            exog_forecast = None        

        # Fit model
        model = SARIMAX(train_series, order=order_model, seasonal_order=seasonal_order_model,exog=exog_train)
        model_fit = model.fit(disp=False)
        
        # Forecast 1 to 4 weeks ahead, or less at the end
        forecast_steps = min(4, len(test_aux) - i)
        if exogenous_var is not None:
            forecast = model_fit.forecast(steps=forecast_steps, exog=exog_forecast.iloc[:forecast_steps])
        else:
            forecast = model_fit.forecast(steps=forecast_steps)
        # Save forecasted values
        #for h, pred in enumerate(forecast):
        #    test_aux.loc[test_aux.index[i + h], f"prediction_{h+1}_weeks"] = pred
        
        for h in range(forecast_steps):
            test_aux.loc[test_aux.index[i + h], f"prediction_{h+1}_weeks"] = forecast.iloc[h]

    # Evaluate predictions
    for h in range(4):
        shifted = test_aux["value"].shift(-h)
        preds = test_aux[f"prediction_{h+1}_weeks"]
        valid_idx = ~shifted.isna()
        y_true = shifted[valid_idx]
        y_pred = preds[valid_idx]
        resid = y_true - y_pred
        residual = y_pred - y_true
        test_aux.loc[valid_idx, f"week_{h+1}_res"] = resid
        mae, rmse = eval_metrics(y_true, y_pred)
        mape = pd.concat([
            mape,
            pd.DataFrame([[country, model_name, f"{h+1}_week", mae, rmse]],
                         columns=['location', 'model', 'prediction_window', 'mae', 'rmse'])
        ], ignore_index=True)

    return mape, test_aux,residual


In [53]:

loaded_model = joblib.load(f'models/arima_model_RO_ILI.joblib')
order_model = loaded_model.order
seasonal_order_model = loaded_model.seasonal_order

In [54]:
data = ili[ili['location']=='RO'].copy()
train, test = train_data(ili,'RO', "2023-10-13")

In [55]:
mape, train,res_1 = forecast_arima_sarima_model(train, train, mape_ili, loaded_model,order_model=order_model,seasonal_order_model=seasonal_order_model, model_name="ARIMA", country="AT")
mape2, test_predictions,res_2 = forecast_arima_sarima_model(train, test, mape_ili, loaded_model,order_model=order_model,seasonal_order_model=seasonal_order_model, model_name="ARIMA", country="AT")

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates

In [56]:
train.columns

Index(['location', 'year_week', 'value', 'relative_humidity_2m',
       'temperature_2m_max', 'temperature_2m_min', 'covid',
       'prediction_1_weeks', 'prediction_2_weeks', 'prediction_3_weeks',
       'prediction_4_weeks', 'week_1_res', 'week_2_res', 'week_3_res',
       'week_4_res'],
      dtype='object')

In [57]:
train_2 = train[[ 'year_week', 'relative_humidity_2m',
       'temperature_2m_max', 'temperature_2m_min', 'covid',
       'week_1_res', 'week_2_res', 'week_3_res',
       'week_4_res']]
test_predictions_2 = test_predictions[[ 'year_week', 'relative_humidity_2m',
       'temperature_2m_max', 'temperature_2m_min', 'covid',
       'week_1_res', 'week_2_res', 'week_3_res',
       'week_4_res']]

In [58]:
train_2

Unnamed: 0_level_0,year_week,relative_humidity_2m,temperature_2m_max,temperature_2m_min,covid,week_1_res,week_2_res,week_3_res,week_4_res
truth_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-10-05,2014-W40,60.699980,17.712000,6.162000,0,6.000000e+00,,,
2014-10-12,2014-W41,81.365990,17.758429,9.937001,0,-1.127745e+00,3.200000e+00,,
2014-10-19,2014-W42,81.203125,20.058428,9.922714,0,-5.279594e-01,-1.308115e-01,5.400000e+00,
2014-10-26,2014-W43,80.649730,11.051286,4.651286,0,1.760006e+00,2.091028e+00,1.199240e+00,3.400000e+00
2014-11-02,2014-W44,84.446920,9.065572,2.672714,0,1.316152e+00,2.879510e+00,4.629253e-01,1.869904e+00
...,...,...,...,...,...,...,...,...,...
2023-09-03,2023-W35,59.220367,29.127287,18.948715,1,1.846063e-99,3.728959e-84,2.281421e-67,8.500000e+00
2023-09-17,2023-W37,61.393562,28.041570,16.370144,1,-3.411039e-117,1.642576e-99,8.500000e+00,2.029940e-67
2023-09-24,2023-W38,64.696790,28.627287,16.270144,1,-1.120530e-132,8.500000e+00,1.461518e-99,
2023-10-01,2023-W39,44.371025,26.305859,16.320143,1,8.500000e+00,-9.970186e-133,,


In [59]:
def train_data_ml(df,date):
    df = create_features(df)
    train = df[df.index<=date]
    test = df[df.index>date]
    return train,test

In [60]:
def create_features(data):
    """
    Create additional features for the non sequential dataset.
    """
    data = data.copy()

    # Extract year, month, day, weekday, and week from 'truth_date'
    data['year'] = data.index.year
    data['month'] = data.index.month

    week = data['year_week'].str.split('-W').str[1]
    data['week'] = week.astype(int)
    for h in range(1,5):
        data[f'lag_value_{h}'] = data['week_1_res'].shift(h)
        data[f'lag_humidity_{h}'] = data['relative_humidity_2m'].shift(h)
        data[f'lag_temp_max_{h}'] = data['temperature_2m_max'].shift(h)
        data[f'lag_temp_min_{h}'] = data['temperature_2m_min'].shift(h)
    data = data.dropna()
    # Convert cyclical categorical variables to category type
    data['month_sin'] = np.sin(2 * np.pi * data['month']/12)
    data['month_cos'] = np.cos(2 * np.pi * data['month']/12)
    data['week_sin'] = np.sin(2 * np.pi * data['week']/52)
    data['week_cos'] = np.cos(2 * np.pi * data['week']/52)
    data = data.drop(columns=['month', 'week','year_week'])
    return data

In [61]:
def rf_variable_selection_and_hyperparam_tuning(train,test,country =None, model_name=None, mape=None):
    X = train.drop(columns=['week_1_res','week_2_res','week_3_res','week_4_res'])
    y = train[['week_1_res','week_2_res','week_3_res','week_4_res']]

    X_test = test.drop(columns=['week_1_res','week_2_res','week_3_res','week_4_res'])
    y_test= test[['week_1_res','week_2_res','week_3_res','week_4_res']]
    

    base_rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=2332))
    base_rf.fit(X, y)

    importances = np.array([est.feature_importances_ for est in base_rf.estimators_])

    mean_importances = importances.mean(axis=0)


    selected_mask = mean_importances >= 0.01
    selected_features = X.columns[selected_mask].tolist()
    X_selected = X[selected_features]

    # Step 3: Hyperparameter tuning with RandomizedSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    }

    tscv = TimeSeriesSplit(n_splits=5)
    rf = RandomForestRegressor(random_state=2332)

    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_grid,
        n_iter=20,
        cv=tscv,
        n_jobs=-1,
        scoring='neg_mean_absolute_error',
        random_state=2332
    )

    random_search.fit(X_selected, y)

    best_model = random_search.best_estimator_
    model_final = MultiOutputRegressor(best_model)
    model_final.fit(X_selected, y)
    dump(model_final, f"models_rf/rf_arima_sarima_model_{country}_{model_name}.joblib")
    test_aux = test.copy()
    prediction_columns = [f"prediction_{h+1}_weeks" for h in range(4)]
    preds = model_final.predict(X_test[selected_features])
    test_aux[prediction_columns] = preds

    # Evaluate predictions
    test_aux = test_aux.dropna()
    mae0, rmse0 = eval_metrics(test_aux["week_1_res"], test_aux["prediction_1_weeks"])
    mae1, rmse1  = eval_metrics(test_aux["week_2_res"], test_aux["prediction_2_weeks"])
    mae2, rmse2  = eval_metrics(test_aux["week_3_res"], test_aux["prediction_3_weeks"])
    mae3, rmse3 = eval_metrics(test_aux["week_4_res"], test_aux["prediction_4_weeks"])

    mape = pd.concat([
    mape,
    pd.DataFrame([
        [country, model_name, "1_week", mae0, rmse0],
        [country, model_name, "2_week", mae1, rmse1],
        [country, model_name, "3_week", mae2, rmse2],
        [country, model_name, "4_week", mae3, rmse3]
    ], columns=['location', 'model', 'prediction_window', 'mae', 'rmse'])
], ignore_index=True)
    return model_final, selected_features, test_aux,mape


In [62]:
train = create_features(train_2)
test = create_features(test_predictions_2)

In [63]:
train.columns

Index(['relative_humidity_2m', 'temperature_2m_max', 'temperature_2m_min',
       'covid', 'week_1_res', 'week_2_res', 'week_3_res', 'week_4_res', 'year',
       'lag_value_1', 'lag_humidity_1', 'lag_temp_max_1', 'lag_temp_min_1',
       'lag_value_2', 'lag_humidity_2', 'lag_temp_max_2', 'lag_temp_min_2',
       'lag_value_3', 'lag_humidity_3', 'lag_temp_max_3', 'lag_temp_min_3',
       'lag_value_4', 'lag_humidity_4', 'lag_temp_max_4', 'lag_temp_min_4',
       'month_sin', 'month_cos', 'week_sin', 'week_cos'],
      dtype='object')

In [64]:
model_final, selected_features, test_predictions_rf,mape = rf_variable_selection_and_hyperparam_tuning(train,test,country="AT", model_name="ARIMA")

In [66]:
mape

Unnamed: 0,location,model,prediction_window,mae,rmse
0,AT,ARIMA,1_week,1.847398,3.752878
1,AT,ARIMA,2_week,3.770039,7.769338
2,AT,ARIMA,3_week,4.859133,9.364203
3,AT,ARIMA,4_week,5.189211,9.626172
