In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from os import listdir
from os.path import isfile, join

plt.style.use('seaborn-white')

%matplotlib inline

from scipy.stats import gamma, poisson

import epyestim
import epyestim.covid19 as covid19
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE, r2_score
from xgboost import XGBRegressor, DMatrix, train
from sklearn.multioutput import MultiOutputRegressor

from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
from pykalman import KalmanFilter

import plotly.express as px
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)


# KPI columns that sumRegions adds together by default.
to_sum_KPIs = ['totale_casi_giornalieri', 'terapia_intensiva_giornalieri', 'terapia_intensiva', 'nuovi_positivi', 'tamponi_giornalieri']
covidKPIsPrecompute = ['%pos']+to_sum_KPIs
trafficKPIsPrecompute = ['Handover', 'Download vol.', 'Upload vol.', '#Users']

# sums regions such as trento + bolzano
def sumRegions(df, dateCol = 'Date', regionCol='Regione', cols = to_sum_KPIs, region1 = "P.A. Bolzano", region2 = "P.A. Trento", regionNew = "Trentino-Alto Adige"):
    """Replace two regions by a single region holding the sum of their KPIs.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with one row per (date, region).
    dateCol : str
        Column used to align the rows of the two regions.
    regionCol : str
        Column holding the region name.
    cols : list of str
        Columns to add together; every other column is left empty (NaN)
        on the rows of the merged region.
    region1, region2 : str
        Names of the regions to merge.
    regionNew : str
        Name given to the merged region.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows of ``region1`` / ``region2``, followed by
        one row per date for ``regionNew``. The input frame is not modified.
    """
    cols = list(cols)
    # set_index returns new frames, so the slices of `df` are never written to.
    first = df.loc[df[regionCol] == region1].set_index(dateCol)
    second = df.loc[df[regionCol] == region2].set_index(dateCol)
    # Index-aligned sum: a date present for only one of the two regions yields NaN.
    merged = (first[cols] + second[cols]).reset_index()
    merged[regionCol] = regionNew
    others = df.loc[(df[regionCol] != region1) & (df[regionCol] != region2)]
    # DataFrame.append was removed in pandas 2.0; concat keeps the same row order and index.
    return pd.concat([others, merged])

# adds italy as the per-day sum over all rows
def addItalyData(df, cols):
    """Append national ("Italia") rows obtained by summing `df` per calendar day.

    `df` is resampled daily on its 'Date' column and summed; the resulting
    rows are labelled with Regione == 'Italia', restricted to `cols`, and
    concatenated after the original rows.
    """
    national = df.resample('D', on='Date').sum().reset_index()
    national = national.assign(Regione='Italia')
    national = national[cols]
    return pd.concat([df, national])

def fill_with_areas(dateRange, fig, is_train):
    """Shade the span covered by `dateRange` on `fig` and return `fig`.

    A borderless rectangle is drawn from the first to the last element of
    `dateRange` over the full paper height: translucent red when `is_train`
    is truthy, translucent blue otherwise.
    """
    shade = 'rgba(255, 0, 0, 0.2)' if is_train else 'rgba(0, 0, 255, 0.2)'
    span_start, span_end = dateRange[0], dateRange[-1]
    fig.add_shape(
        type="rect",
        yref="paper",
        x0=span_start,
        y0=0,
        x1=span_end,
        y1=1,
        line={"width": 0},
        fillcolor=shade,
    )
    return fig

# Root of the dataset tree. The historical location is kept as the default,
# but it can be overridden with the TRAFFIC_COVID_DATA_DIR environment
# variable so the notebook also runs on other machines.
# os.path.join(..., "") guarantees the trailing separator the paths below rely on.
data_path = os.path.join(
    os.environ.get("TRAFFIC_COVID_DATA_DIR", "/Users/filipkrasniqi/Documents/Datasets.tmp/traffic-covid/"),
    "",
)
by_region_path = "{}By_Region/".format(data_path)
# Directory holding the cached / pre-computed artefacts.
saved = "{}saved/".format(data_path)
traffic_daily = "{}TS_1800_daily.pkl".format(saved)
region_traffic_daily = "{}all.pkl".format(saved)
covid = "{}covid/".format(data_path)
covid_daily = "{}covid_regioni.csv".format(covid)

# Preprocessing

## Handle temperature data

### Import

In [None]:
meteo_path = "{}meteo/".format(data_path)
dfs_filenames = [f for f in listdir(meteo_path) if isfile(join(meteo_path, f))]
dfs = []
path_temperature_predictions = "{}predictions/temperatures.csv".format(saved)
# Set to True to rebuild the temperature table from the raw per-region files;
# otherwise the previously saved CSV is loaded.
SAVE_TEMPERATURE = False
if SAVE_TEMPERATURE:
    # NOTE(review): `regions_covid` is only assigned by the COVID import cell
    # (when import_covid is True), which comes later in the notebook; on a fresh
    # kernel this branch raises NameError unless that cell has been run first.
    for f in dfs_filenames:
        # Only files named "<region>_<something>.<ext>" are considered.
        splits = f.split("_")
        if len(splits) == 2 and "." in splits[1]:
            filename = "{}{}".format(meteo_path, f)
            current_df = pd.read_csv(filename)
            region_name = splits[0]
            # The month is whatever follows the first four characters of the
            # second name part, up to the first dot.
            month = splits[1][4:].split(".")[0]
            current_df['Regione'] = [r for r in regions_covid if region_name in r.lower()][0]
            current_df['month'] = int(month)
            current_df['year'] = int(2021 if "2021" in filename else 2020)
            dfs.append(current_df)
    # Assemble and save once, after every file has been read. Doing this inside
    # the loop re-did the work per file, and re-reading 'Date' after it had been
    # moved into the index raised KeyError.
    df_temperature = pd.concat(dfs)
    df_temperature['Date'] = df_temperature.apply(lambda x: pd.to_datetime("{}/{}/{}".format(x.year, x.month, int(x.date.split(" ")[1]))), axis=1)
    df_temperature.set_index(['Date', 'Regione'], inplace=True)
    df_temperature.to_csv(path_temperature_predictions)
else:
    df_temperature = pd.read_csv(path_temperature_predictions)
    df_temperature['Date'] = pd.to_datetime(df_temperature['Date'])
    df_temperature.set_index(['Date', 'Regione'], inplace=True)

# The regions present in the temperature table define the working set of regions.
regions_temperature = df_temperature.index.get_level_values(1).unique()
regions = regions_temperature

## Handle COVID data

### Import

In [None]:
# Flags controlling which expensive steps are recomputed instead of loaded from disk.
recompute_rt = False
import_covid = False
path_covid = "{}covid.csv".format(saved)

if import_covid:
    df_covid = pd.read_csv(covid_daily)
    if "Regione" not in df_covid.columns:
        df_covid.rename(columns={'denominazione_regione': 'Regione'}, inplace=True)
        # For each source column, add "<column>_giornalieri": the per-region
        # day-over-day difference. The first row of every region has no
        # predecessor (diff is NaN) and falls back to the source value itself.
        for source_col in ['tamponi', 'deceduti', 'terapia_intensiva', 'totale_casi']:
            daily_col = "{}_giornalieri".format(source_col)
            df_covid[daily_col] = df_covid.groupby(['Regione'])[source_col].diff()
            df_covid.loc[df_covid[daily_col].isna(), daily_col] = df_covid[source_col]
    covid_cols = ['Date', 'Regione', 'terapia_intensiva', 'nuovi_positivi', 'tamponi_giornalieri', 'totale_casi', 'deceduti', 'totale_casi_giornalieri', 'terapia_intensiva_giornalieri']

    df_covid['data'] = pd.to_datetime(df_covid['data'])
    df_covid.rename(columns={'data': 'Date'}, inplace=True)
    # Merge P.A. Bolzano and P.A. Trento into Trentino-Alto Adige (sumRegions defaults).
    df_covid = sumRegions(df_covid)
    regions_covid = df_covid['Regione'].unique()
    df_covid.to_csv(path_covid)
else:
    # Drop a df_covid left over from a previous run. Only the "name does not
    # exist" case is expected here, so only NameError is caught.
    try:
        del df_covid
    except NameError:
        print("No df covid")

### Compute Rt

In [None]:
dfs = []
path_covid_predictions="{}predictions/covid.pkl".format(saved)
if recompute_rt:
    # NOTE(review): this branch reads `df_covid`, which the import cell deletes
    # when import_covid is False; both flags have to be enabled together.
    for r in regions:
        print("REGIONE: {}".format(r))
        # Explicit copy: the frame is modified below and must not be a view of df_covid.
        current_df = df_covid.loc[df_covid['Regione'] == r].copy()
        current_df['Date'] = pd.to_datetime(current_df['Date']).dt.date
        current_df['DateIndex'] = current_df.loc[:, 'Date']
        current_df.set_index('DateIndex', inplace=True)
        current_df = current_df.loc[pd.to_datetime('2020/03/01'):pd.to_datetime('2021/01/31')].copy()
        # Negative daily counts are treated as missing and forward-filled.
        idxs = (current_df['nuovi_positivi'] < 0)
        if idxs.sum() > 0:
            current_df.loc[idxs, 'nuovi_positivi'] = np.nan
        # DataFrame.ffill replaces the deprecated fillna(method='ffill').
        current_df = current_df.ffill()
        # Rows that are still NaN (nothing earlier to fill from) are dropped.
        current_df = current_df.dropna(subset=['nuovi_positivi'])
        current_df = current_df.drop_duplicates(keep='first')
        r_t_series = covid19.r_covid(current_df['nuovi_positivi'])
        # Inner join on the date index: keeps only the days for which r_covid returned a row.
        current_df = pd.merge(current_df, r_t_series, left_index=True, right_index=True)
        dfs.append(current_df)
    df_covid_predictions = pd.concat(dfs)
    del dfs
    df_covid_predictions.set_index(['Date', 'Regione'], inplace=True)
    # Share of positive results among the swabs of the day.
    df_covid_predictions['%pos'] = (df_covid_predictions['nuovi_positivi']/df_covid_predictions['tamponi_giornalieri'])
    df_covid_predictions.to_pickle(path_covid_predictions)
else:
    df_covid_predictions = pd.read_pickle(path_covid_predictions)

## Handle traffic data

### Import

In [None]:
# Flags: rebuild the daily traffic table from the raw pickle / re-run the Kalman smoothing.
import_traffic = False
recompute_kalman = False
path_traffic = "{}traffic.csv".format(saved)

if not import_traffic:
    # Cached daily table.
    df_traffic_predictions = pd.read_csv(path_traffic)
else:
    df_traffic_daily = pd.read_pickle(region_traffic_daily)
    # Align the region spelling with the one used by the other datasets.
    is_emilia = df_traffic_daily['Regione'] == "Emilia Romagna"
    df_traffic_daily.loc[is_emilia, "Regione"] = "Emilia-Romagna"
    in_scope = df_traffic_daily['Regione'].isin(regions)
    # One row per (region, day): daily sums of every column.
    df_traffic_predictions = (
        df_traffic_daily.loc[in_scope]
        .groupby('Regione')
        .resample('D', on='Date')
        .sum()
        .reset_index()
    )

# Both paths end with the same (Date, Regione) index, Date being plain date objects.
df_traffic_predictions['Date'] = pd.to_datetime(df_traffic_predictions['Date']).dt.date
df_traffic_predictions.set_index(['Date', 'Regione'], inplace=True)
if import_traffic:
    df_traffic_predictions.to_csv(path_traffic)

### Smoothen with Kalman filter

In [None]:
# apply Kalman Filter to traffic prediction
# dict_kalman keeps the fitted filter of every "<KPI>_<region>" pair.
dict_kalman = {}
path_traffic_predictions="{}predictions/traffic.pkl".format(saved)
if recompute_kalman:
    for trafficKPI in trafficKPIsPrecompute:
        dfs_current_kpi = []
        for region in regions:
            # One-dimensional filter with identity transition and observation;
            # its parameters are then refined by EM on the series itself.
            kf = KalmanFilter(transition_matrices = [1],
                      observation_matrices = [1],
                      initial_state_mean = 0,
                      initial_state_covariance = 1,
                      observation_covariance=1,
                      transition_covariance=.05)

            series = df_traffic_predictions.xs(region, level=1)[trafficKPI]

            kf = kf.em(series)
            # Only the smoothed state means are used; the covariances are discarded.
            smoothened, _ = kf.smooth(series)
            df_region_kpi = pd.DataFrame({"noisy": series})
            df_region_kpi['smooth'] = smoothened.squeeze()
            df_region_kpi['Regione'] = region
            df_region_kpi.reset_index(inplace=True)
            df_region_kpi.set_index(['Date', 'Regione'], inplace=True)
            dfs_current_kpi.append(df_region_kpi)

            dict_kalman["{}_{}".format(trafficKPI, region)] = kf

        # Aligned on the (Date, Regione) index of df_traffic_predictions.
        df_traffic_predictions["{}_smoothened".format(trafficKPI)] = pd.concat(dfs_current_kpi)['smooth']
    # Persist once, after every KPI has been smoothed (it used to be rewritten
    # after each KPI, with the same final content).
    df_traffic_predictions.to_pickle(path_traffic_predictions)
else:
    df_traffic_predictions = pd.read_pickle(path_traffic_predictions)

### Visualize Rt

In [None]:
plot_scatter = False
if plot_scatter:
    # NOTE(review): needs `df_covid`, which the import cell deletes when
    # import_covid is False.
    num_cols = 4
    # Local name on purpose: the global `regions` is used by later cells and
    # must not be overwritten by a plotting cell.
    regions_plot = df_covid['Regione'].unique()
    fig = make_subplots(rows=6, cols=num_cols, subplot_titles=regions_plot)
    fig.update_layout(height=1200, width=900, title_text="Stacked Subplots", legend = dict(font = dict(family = "Courier", size = 10, color = "black")))

    # Only the first five regions are drawn (the national aggregate is skipped).
    for i, region in enumerate([r for r in regions_plot[0:5] if r != "Italia"]):
        row, col = (i // num_cols)+1, (i % num_cols)+1
        df_region = df_covid.loc[df_covid['Regione'] == region]
        # Skip the first 100 rows of the region, then index by date and forward-fill gaps.
        df_region = df_region[100:].set_index('Date').ffill()
        # TODO: it seems that for some regions values are only available starting from some date X

        time_varying_r = covid19.r_covid(df_region['totale_casi_giornalieri'])
        x = time_varying_r.index
        y = time_varying_r['R_mean']
        # The 97.5% quantile is the upper bound and the 2.5% quantile the lower one
        # (they used to be swapped, which mislabelled the two traces).
        y_upper = time_varying_r['Q0.975']
        y_lower = time_varying_r['Q0.025']

        fig.add_trace(
            go.Scatter(
                    x=x,
                    y=y,
                    line=dict(color='rgb(0,100,80)'),
                    mode='lines',
                    name="Indice Rt"
                ), row=row, col=col
        )

        # Invisible upper-bound line; the lower-bound trace below fills up to it.
        fig.add_trace(
            go.Scatter(
                    name='Upper Bound',
                    x=x,
                    y=y_upper,
                    mode='lines',
                    marker=dict(color="#444"),
                    line=dict(width=0),
                    showlegend=False
                ), row=row, col=col
        )

        fig.add_trace(
            go.Scatter(
                    name='Lower Bound',
                    x=x,
                    y=y_lower,
                    marker=dict(color="#444"),
                    line=dict(width=0),
                    mode='lines',
                    fillcolor='rgba(68, 68, 68, 0.3)',
                    fill='tonexty',
                    showlegend=False
                ), row=row, col=col
        )
        fig.update_layout(
            title=region,
            legend_title="Legenda"
        )
    fig.show()

# Supervised

## Define KPIs

In [None]:
# Features: every smoothed traffic column and every COVID column whose name contains "mean".
trafficKPIs = [
    name
    for name in df_traffic_predictions.columns
    if "smooth" in name
]
covidKPIs = [
    name
    for name in df_covid_predictions.columns
    if "mean" in name
]
# Temperature features are currently disabled (previously: df_temperature columns containing "min").
temperatureKPIs = list()
# Column to forecast.
targetCovid = ['R_mean']

In [None]:
# Discard models left over from a previous run. pop() with a default tolerates
# a fresh kernel, where `models_regions` has not been defined yet
# (a plain `del` raised NameError in that case).
globals().pop('models_regions', None)

In [None]:
# TODO: for each interval, save the features that are used, so that what is going wrong can be checked

# Forecasting

In [None]:
# Legacy pipeline based on incrementally trained xgboost boosters, kept behind a
# flag that is off. When enabled it reads names that are only assigned in later
# cells (regions_to_train, lags_target, features, targets, df_train_prediction,
# df_test_prediction, start_train_po, end_train_so, farsightness), so it cannot
# run on a fresh kernel in notebook order.
do_like_before = False
if do_like_before:
    params = {'objective': 'reg:squarederror'}
    models_regions = {r:{} for r in regions}

    def train_in_interval(interval, current_models_regions = None, init_from_scratch = False):
        """Train one booster per (region, target lag) on rows with interval[0] <= Date < interval[1].

        NOTE: the value passed as `current_models_regions` is never used: it is
        replaced by a fresh dict when `init_from_scratch` is true, and by the
        global `models_regions` otherwise.
        """
        if init_from_scratch:
            current_models_regions = {r:{} for r in regions}
        else:
            current_models_regions = models_regions
        for region in regions_to_train:
            print("{}: {} -> {}".format(region, min(interval), max(interval)))
            df_ts = df_train_prediction.loc[df_train_prediction.index.get_level_values(1)==region]
            df_ts = df_ts.reset_index().set_index('Date').drop(columns='Regione')
            df_ts = df_ts.loc[(df_ts.index>=interval[0])&(df_ts.index < interval[1])]
            print("TRAIN START for interval: {} -> {}".format(interval, df_ts.shape))
            for lag in lags_target:
                target = "target_{}".format(lag)
                targets_vals = df_ts[target].values
                xgtrain = DMatrix(data=df_ts[features].values, label=targets_vals)
                # Continue training the stored booster if there is one; on any
                # failure (presumably the missing key on the first pass, but the
                # bare except catches everything) train a new booster instead.
                try:
                    current_models_regions[region][lag] = train(params, xgtrain, xgb_model=current_models_regions[region][lag])
                except:
                    model = train(params, xgtrain)
                    current_models_regions[region][lag] = model
        return current_models_regions

    # Alternative kept for reference: train on (start_train_po, divider_po) only.
    models_regions = train_in_interval((start_train_po, end_train_so), init_from_scratch=True)
    train_dates_region_every_n = {}
    test_dates_region_every_n = {}
    train_dates_region = {}
    test_dates_region = {}
    df_ts_test_region = {}

    def test_in_interval(interval, current_models_regions, farsightness):
        """Predict every test date and lag, updating the boosters along the way.

        Returns the (mutated) models dict and a frame indexed by
        (date, region, lag) with prediction, target, absolute and squared error.
        `farsightness` is accepted but not used in the body.
        """
        print("TEST START")

        # Placeholder; replaced by the frame built from results_dict below.
        df_results = pd.DataFrame() # lag, region, prediction, target; index = date
        # Despite its name this is a list of per-prediction dicts.
        results_dict = []

        for region in regions_to_train:
            df_ts = df_test_prediction.loc[df_test_prediction.index.get_level_values(1)==region]
            df_ts = df_ts.reset_index().set_index('Date').drop(columns='Regione')
            df_ts = df_ts.loc[(df_ts.index>=interval[0])&(df_ts.index < interval[1])]
            first_date, last_date = df_ts.index.min(), df_ts.index.max()
            current_region_values = []
            current_region_features = []

            test_dates_region[region] = pd.date_range(first_date, last_date)
            df_ts_test_region[region] = df_ts
            test_dates = test_dates_region[region].unique()
            assert test_dates_region[region].shape[0]==len(test_dates), "Something wrong"
            print("{}: {} -> {}, {}".format(region, min(test_dates), max(test_dates), df_ts.shape))
            for i, t in enumerate(test_dates):
                # Unused below.
                features_covid = [col for col in features if "R_mean" in col]
                X_test_ts = df_ts.loc[t, features]
                X_test_ts.sort_index(inplace=True)

                for idx_lag, lag in enumerate(lags_target):
                    target_col = targets[idx_lag]
                    predictions = current_models_regions[region][lag].predict(DMatrix(X_test_ts.to_numpy().reshape(1, -1)))

                    X_test_fit, y_test_fit = X_test_ts, df_ts.loc[t, target_col]

                    # TODO: should the delta be added or not? The result looks too precise.
                    #-> store two dates, one for visualisation and one for insertion into predictions_so_far
                    #-> otherwise we are back to the previous problem

                    # The result is stored under the date being predicted (t + lag days).
                    current_result = {"date": t + datetime.timedelta(days=lag), "lag": lag, "region": region, "prediction": predictions[0], "target": y_test_fit}
                    current_features = X_test_fit.copy()
                    current_features['lag']=lag
                    current_region_features.append(current_features)
                    features_so_far = pd.DataFrame(current_region_features)
                    # TODO: add the features to be inserted into training later
                    results_dict.append(current_result)
                    current_region_values.append(current_result)
                    predictions_so_far = pd.DataFrame(current_region_values)
                    predictions_so_far = predictions_so_far.set_index(['date', 'lag'])
                    # Stored results of this region whose 'date' equals t for this lag.
                    prediction_t = predictions_so_far.loc[(predictions_so_far.index.get_level_values(0) == pd.to_datetime(t))&(predictions_so_far.index.get_level_values(1) == lag)]

                    features_so_far = features_so_far.reset_index().set_index(['index', 'lag'])
                    features_t = features_so_far.loc[(features_so_far.index.get_level_values(0) == pd.to_datetime(t))&(features_so_far.index.get_level_values(1) == lag)]

                    # Update the booster only when a stored result for date t
                    # exists, and never on the last test date.
                    if i < len(test_dates)-1 and prediction_t.shape[0] > 0:
                        xgtrain = DMatrix(data=features_t.to_numpy().reshape(1, -1), label=prediction_t['target'])
                        current_models_regions[region][lag] = train(params, xgtrain, xgb_model=current_models_regions[region][lag])
                        # Ideas:
                        # 1) retrain on entire
                        # 2) retrain on entire and add a window -> 05/12, lag=1 -> (01/10, 4/12); (11/10, 14/12)
                        # 3) LSTM
        df_results = pd.DataFrame(results_dict)
        df_results = df_results.dropna()
        df_results.set_index(['date', 'region', 'lag'], inplace=True)
        df_results['error']=(df_results['prediction']-df_results['target']).abs()
        df_results['error_2'] = df_results['error']**2
        return current_models_regions, df_results

    min_date, max_date = df_test_prediction.index.get_level_values(0).min(), df_test_prediction.index.get_level_values(0).max()
    models_regions, df_results = test_in_interval((min_date, max_date), models_regions, farsightness)


    def build_df_results_groupped(df_results):
        """Return the root of the mean squared error per (region, lag) as a flat frame."""
        return np.sqrt(df_results.groupby(level=['region', 'lag'])['error_2'].mean()).reset_index()

    df_results_mean = build_df_results_groupped(df_results)

In [None]:
# NOTE(review): `df_test_prediction` is first assigned in a later cell (the one
# calling build_df_prediction), so on a fresh kernel run in order this display
# raises NameError.
df_test_prediction

In [None]:
# Boundaries between the two periods ("PO" / "SO"). These two names are used
# below but their definition had been commented out, so the cell only worked
# with values left over in the kernel.
# NOTE(review): values restored from the commented-out line — confirm they are still the intended ones.
divider_po, divider_so = pd.to_datetime('2020-07-01'), pd.to_datetime('2020-10-01')
start_train_po, end_train_so = pd.to_datetime('2020-03-01'), pd.to_datetime('2020-11-01')
start_po = start_train_po
ranges_train_PO = [pd.date_range(start_po, divider_po)]
ranges_train_SO = [pd.date_range(divider_so, end_train_so)]
# Last day for which all three datasets have data.
last_date = min(df_traffic_predictions.index.get_level_values(0).max(), df_covid_predictions.index.get_level_values(0).max(), df_temperature.index.get_level_values(0).max())
ranges_so = [pd.date_range(divider_so, last_date)]
# Restricted to a single region for now; use `regions` to train on all of them.
regions_to_train = ["Lombardia"]

# Forecast horizons: every 4th day from min_farsightness up to (excluding) farsightness.
min_farsightness = 1
farsightness = 60
# Number of past days used as lagged features.
delta_features = 14
lags = range(delta_features)
lags_target = range(min_farsightness, farsightness, 4)

# The test range starts delta_features days before the end of training.
ranges_test_SO = [pd.date_range(end_train_so-pd.Timedelta(days=delta_features), last_date)]

def build_df_prediction(range_dates):
    """Build the supervised-learning table for the regions in `regions_to_train`.

    For every region, the traffic / COVID / temperature series are restricted
    to the dates contained in any of `range_dates` and turned into lagged
    feature columns "<kpi>_<k>" (k = 1..len(lags)). The targets "target_<lag>"
    are the value of `targetCovid[0]` `lag` days ahead, taken from the
    unrestricted COVID series of the region.

    Parameters
    ----------
    range_dates : iterable of date collections
        A date is kept when it is contained in at least one of them.

    Returns
    -------
    (pandas.DataFrame, list of str, list of str)
        The table indexed by (Date, Regione) with targets followed by features,
        the target column names and the feature column names. Rows missing any
        feature are dropped; targets may still be NaN.

    Reads the module-level names regions_to_train, df_traffic_predictions,
    df_covid_predictions, df_temperature, trafficKPIs, covidKPIs,
    temperatureKPIs, targetCovid, lags and lags_target.
    """
    all_dfs = []
    for region in regions_to_train:
        # filter ts by region
        df_traffic_ts = df_traffic_predictions.loc[(df_traffic_predictions.index.get_level_values(1)==region), trafficKPIs].copy()
        df_covid_ts = df_covid_predictions.loc[df_covid_predictions.index.get_level_values(1)==region, list(set(covidKPIs+targetCovid))].copy()
        df_temperature_ts = df_temperature.loc[df_temperature.index.get_level_values(1)==region, temperatureKPIs].copy()

        # Standardise traffic and temperature per region (z-score, sample std).
        df_traffic_ts = df_traffic_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))
        df_temperature_ts = df_temperature_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))

        # Index every series by date only ('Regione' stays as a plain column).
        df_covid_ts = df_covid_ts.reset_index().set_index('Date')
        df_temperature_ts = df_temperature_ts.reset_index().set_index('Date')
        df_traffic_ts = df_traffic_ts.reset_index().set_index('Date')

        df_ts = pd.DataFrame()
        features = []
        targets = []
        target_col = targetCovid[0]
        # Copied before the date filter below: targets may look past the end of the range.
        df_target_ts = df_covid_ts.copy()

        train_dates_intersection = df_traffic_ts.index.intersection(df_covid_ts.index)
        if len(temperatureKPIs) > 0:
            # Index.intersection keeps the dates; Index.isin (used previously)
            # returned a boolean mask that was then iterated as if it were dates.
            train_dates_intersection = train_dates_intersection.intersection(df_temperature_ts.index)

        train_dates = pd.to_datetime([
            date_val
            for date_val in train_dates_intersection
            if any(date_val in x for x in range_dates)
        ])

        df_covid_ts = df_covid_ts.loc[df_covid_ts.index.isin(train_dates)]
        df_traffic_ts = df_traffic_ts.loc[df_traffic_ts.index.isin(train_dates)]
        df_temperature_ts = df_temperature_ts.loc[df_temperature_ts.index.isin(train_dates)]

        # Targets: value `lag` rows ahead (shift by -lag).
        for lag in lags_target:
            target = "target_{}".format(lag)
            targets.append(target)
            df_ts[target] = df_target_ts.shift(-1*lag)[target_col]

        # Features: value 1..len(lags) rows back, so the current day is never used.
        for lag in lags:
            lag_shift = lag+1
            for col in trafficKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_traffic_ts[col].shift(lag_shift)
                features.append(feature)
            for col in covidKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_covid_ts[col].shift(lag_shift)
                features.append(feature)
            for col in temperatureKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_temperature_ts[col].shift(lag_shift)
                features.append(feature)

        # dropna returns a new frame, so adding 'Regione' does not write into a slice.
        df_ts = df_ts[targets+features].dropna(subset=features)
        df_ts['Regione'] = region
        df_ts = df_ts.reset_index().set_index(['Date', 'Regione'])
        all_dfs.append(df_ts.copy())
    # targets / features are identical for every region; the last region's lists are returned.
    return pd.concat(all_dfs), targets, features
# Training tables for the two periods; target / feature names are taken from the first call.
df_train_prediction_PO, targets, features = build_df_prediction(ranges_train_PO)
df_train_prediction_SO, _, _ = build_df_prediction(ranges_train_SO)
df_test_prediction, _, _ = build_df_prediction(ranges_test_SO)
# When True the PO rows are used for training as well, otherwise only the SO rows.
TRAIN_ALSO_PO = True
df_train_prediction = (
    pd.concat([df_train_prediction_PO, df_train_prediction_SO])
    if TRAIN_ALSO_PO
    else df_train_prediction_SO
)

In [None]:
params = {'objective': 'reg:squarederror'}
# One (initially empty) {lag: model} mapping per region.
models_regions = dict((r, {}) for r in regions)

def train_in_interval(interval):
    """Fit a fresh XGBRegressor per (region, target lag) and return `models_regions`.

    Only rows of `df_train_prediction` with interval[0] <= Date < interval[1]
    are used. For each lag, the other target columns are dropped before
    removing rows with missing values, so a row is kept whenever its features
    and this lag's target are present.
    """
    for region in regions_to_train:
        print("{}: {} -> {}".format(region, min(interval), max(interval)))
        region_rows = df_train_prediction.index.get_level_values(1) == region
        region_frame = (
            df_train_prediction.loc[region_rows]
            .reset_index()
            .set_index('Date')
            .drop(columns='Regione')
        )
        in_window = (region_frame.index >= interval[0]) & (region_frame.index < interval[1])
        region_frame = region_frame.loc[in_window]
        for lag in lags_target:
            target = "target_{}".format(lag)
            other_targets = [
                name for name in targets
                if (name not in features) and (name != target)
            ]
            lag_frame = region_frame.drop(columns=other_targets).dropna()
            regressor = XGBRegressor()
            models_regions[region][lag] = regressor
            regressor.fit(lag_frame[features].values, lag_frame[target])
    return models_regions

# Train on the whole span from the start of the first period to the end of training.
models_regions = train_in_interval((start_train_po, end_train_so))

In [None]:
train_dates_region_every_n = {}
test_dates_region_every_n = {}
train_dates_region = {}
test_dates_region = {}
df_ts_test_region = {}

def test_in_interval(interval):
    """Walk-forward evaluation of the models in `models_regions`.

    For every region and target lag, each test day t with
    interval[0] <= t < interval[1] is predicted with the current model, then
    the row of day t is added to the training data and the model is refitted.

    Returns
    -------
    (dict, pandas.DataFrame)
        `models_regions` (its models are refitted in place) and a frame
        indexed by (date, region, lag), where date = t + lag days, holding
        prediction, target, absolute error ('error') and squared error
        ('error_2').

    Side effects: fills the module-level `test_dates_region` and
    `df_ts_test_region` dictionaries.
    """
    print("TEST START")

    results = []

    for region in regions_to_train:
        df_ts = df_test_prediction.loc[df_test_prediction.index.get_level_values(1)==region]
        df_ts = df_ts.reset_index().set_index('Date').drop(columns='Regione')
        df_ts = df_ts.loc[(df_ts.index>=interval[0])&(df_ts.index < interval[1])]
        first_date, last_date = df_ts.index.min(), df_ts.index.max()

        test_dates_region[region] = pd.date_range(first_date, last_date)
        df_ts_test_region[region] = df_ts
        assert test_dates_region[region].shape[0]==len(test_dates_region[region].unique()), "Something wrong"

        # Models are per region, so the walk-forward training data is restricted
        # to this region as well (it used to contain the rows of every region).
        region_train = df_train_prediction.loc[df_train_prediction.index.get_level_values(1)==region]

        for idx_lag, lag in enumerate(lags_target):
            target_col = targets[idx_lag]
            other_targets = [col for col in targets if (col not in features) and (col != target_col)]
            walk_forward_df = region_train.drop(columns=other_targets).dropna()
            # Only days with complete features and a known target for this lag are tested.
            lag_df_ts = df_ts[features+[target_col]].dropna()
            test_dates = lag_df_ts.index
            if len(test_dates) == 0:
                # Nothing to evaluate for this lag (min/max below would raise on an empty index).
                continue
            print("{}, lag = {}: {} -> {}, {}".format(region, lag, min(test_dates), max(test_dates), lag_df_ts.shape))
            for t in test_dates:
                # One-row frame for day t.
                current_df_ts = lag_df_ts.loc[t:t]

                predictions = models_regions[region][lag].predict(current_df_ts[features].values)
                y_test_fit = current_df_ts[target_col].values

                results.append({"date": t + datetime.timedelta(days=lag), "lag": lag, "region": region, "prediction": predictions[0], "target": y_test_fit[0]})

                # NOTE(review): the row of day t is added together with its target,
                # i.e. the value `lag` days ahead, right after predicting — TODO
                # confirm that using it this early is intended.
                # pd.concat replaces DataFrame.append, removed in pandas 2.0.
                walk_forward_df = pd.concat([walk_forward_df, current_df_ts])
                models_regions[region][lag].fit(walk_forward_df[features].values, walk_forward_df[target_col])
                # Ideas:
                # 1) retrain on entire
                # 2) retrain on entire and add a window -> 05/12, lag=1 -> (01/10, 4/12); (11/10, 14/12)
                # 3) LSTM
    # Explicit columns keep the frame well-formed even when nothing was predicted.
    df_results = pd.DataFrame(results, columns=["date", "lag", "region", "prediction", "target"])
    df_results = df_results.dropna()
    df_results.set_index(['date', 'region', 'lag'], inplace=True)
    df_results['error']=(df_results['prediction']-df_results['target']).abs()
    df_results['error_2'] = df_results['error']**2
    return models_regions, df_results

min_date, max_date = df_test_prediction.index.get_level_values(0).min(), df_test_prediction.index.get_level_values(0).max()
models_regions, df_results = test_in_interval((min_date, max_date))


def build_df_results_groupped(df_results):
    """Return the root of the mean of 'error_2' per (region, lag).

    `df_results` must carry 'region' and 'lag' as index levels. The result is
    a flat frame with columns region, lag and error_2.
    """
    mean_squared = df_results.groupby(level=['region', 'lag'])['error_2'].mean()
    root_mean_squared = np.sqrt(mean_squared)
    return root_mean_squared.reset_index()

# Root of the mean squared error per (region, lag), computed from the walk-forward results.
df_results_mean = build_df_results_groupped(df_results)

In [None]:
# Dash application used to browse the forecasting results.
name = "XGBoost"
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app_results = JupyterDash(name, external_stylesheets=external_stylesheets)

# Multi-select of the trained regions (first region pre-selected).
dropdown_regions = dcc.Dropdown(
    id="regions",
    options=[{"label": region, "value": region} for region in regions_to_train],
    value=[regions_to_train[0]],
    multi=True,
    clearable=True,
)

# Single-select of the forecast horizon (largest lag pre-selected).
dropdown_lags = dcc.Dropdown(
    id="lags",
    options=[{"label": lag, "value": lag} for lag in lags_target],
    value=lags_target[-1],
    clearable=True,
)

# Choice between the prediction/target time series and the absolute error.
dropdown_plot_type = dcc.Dropdown(
    id="plot_type",
    options=[{"label": kind, "value": kind} for kind in ["Timeseries", "Errore assoluto"]],
    value="Timeseries",
    clearable=False,
)

# The graph component shares its id with the application name.
app_results.layout = html.Div([
    html.Label(["Regione", dropdown_regions]),
    html.Label(["Lag", dropdown_lags]),
    html.Label(["Tipologia plot", dropdown_plot_type]),
    html.Div(dcc.Graph(id=name)),
])

@app_results.callback(
Output(name, "figure"), 
[Input("regions", "value"), Input("lags", "value"), Input("plot_type", "value")])
def display_rf_results(regions, lag, plot_type):
    """Build the figure shown by the XGBoost results dashboard.

    Parameters
    ----------
    regions : str | list[str] | None
        Region(s) picked in the "regions" dropdown; ``None`` falls back to
        the first entry of ``regions_to_train``.
    lag : hashable | None
        Value picked in the "lags" dropdown; ``None`` (dropdown cleared)
        yields an empty figure.
    plot_type : str
        ``"Errore assoluto"`` plots the ``error`` column; any other value
        plots ``prediction`` and ``target``.

    Returns
    -------
    plotly.graph_objects.Figure
        One or two line traces per region that has rows for ``lag``.
    """
    if isinstance(regions, str):
        regions = [regions]

    if regions is None:
        regions = [regions_to_train[0]]

    is_timeseries_error = plot_type == "Errore assoluto"

    R_colors = px.colors.qualitative.Dark24
    y_hat_colors = px.colors.qualitative.Alphabet
    fig = make_subplots(specs=[[{"secondary_y": False}]])

    # The "Lag" dropdown is clearable: without a lag there is nothing to select.
    if lag is None:
        return fig

    for i, r in enumerate(regions):
        # xs() selects on the named levels and drops them, leaving the dates as index.
        # A (region, lag) pair with no rows is skipped instead of failing the callback.
        try:
            current_df_results = df_results.xs((r, lag), level=('region', 'lag'))
        except KeyError:
            continue

        # Wrap around the palettes so a long selection cannot run past their end.
        target_color = R_colors[i % len(R_colors)]
        prediction_color = y_hat_colors[i % len(y_hat_colors)]

        if is_timeseries_error:
            fig.add_trace(
                go.Scatter(
                    x=current_df_results.index,
                    y=current_df_results['error'],
                    name="Y - {}".format(r),
                    marker=dict(color=target_color),
                    showlegend=False,
                )
            )
        else:
            fig.add_trace(
                go.Scatter(
                    x=current_df_results.index,
                    y=current_df_results['prediction'],
                    name="Prediction - {}".format(r),
                    marker=dict(color=prediction_color),
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=current_df_results.index,
                    y=current_df_results['target'],
                    name="Target - {}".format(r),
                    marker=dict(color=target_color),
                )
            )

    return fig

#app_timeseries = build_app_timeseries(df_traffic_daily_SO, df_covid_SO)
# Start the dashboard in 'inline' mode on port 46004.
app_results.run_server(mode='inline', port=46004) # debug=True, use_reloader=False

In [None]:
# RMSE per (region, lag); the box plot then shows its spread across regions for each lag.
df_results['error_2'] = df_results['error']**2
df_results_mean = np.sqrt(df_results.groupby(level=['region', 'lag'])['error_2'].mean()).reset_index()

col='error_2'
fig = px.box(df_results_mean, x="lag", y=col)

# Aggregate only the error column: df_results_mean also carries the string
# 'region' column, which cannot be averaged.
means = df_results_mean.groupby(by=["lag"])[[col]].mean()
medians = df_results_mean.groupby(by=["lag"])[[col]].median()

fig.add_trace(go.Scatter(x=means.index, y=means[col],
                mode='lines',
                name='Mean'))

# Add figure title; the y axis runs from 0 to the largest RMSE (a scalar,
# not the per-column maxima of the whole frame).
fig.update_layout(
    title_text="Boxplot errors",
    yaxis=dict(
        range=[0, df_results_mean[col].max()]
    )
)

fig.show()

In [None]:

# Train during the first wave ("PO"): one booster per region and per target lag.
# NOTE(review): df_traffic_ts / df_covid_ts / df_temperature_ts are not built in
# this cell and are not filtered by `region` here; they must already exist in the
# kernel. Confirm this is the intended input.
train_also_PO = True
if train_also_PO:
    print("PRIMA ONDATA")
    for region in regions_to_train:
        # NOTE(review): df_train_po is never read below, and its row mask is taken
        # from df_train_prediction_PO while it is applied to df_train_prediction.
        df_train_po = df_train_prediction.loc[df_train_prediction_PO.index.get_level_values(1)==region]
        df_train_po = df_train_po.set_index('Date')

        # create dataframe with last values
        df_ts = pd.DataFrame()
        features = []
        targets = []
        target_col = "R_mean"
        df_target_ts = df_covid_ts.copy()

        # Datasets may cover different dates: keep only the dates present in all
        # of them. intersection() keeps an index of dates (isin() would replace it
        # with a boolean mask and no date would survive the range check below).
        train_dates_intersection = df_traffic_ts.index.intersection(df_covid_ts.index)
        if len(temperatureKPIs) > 0:
            train_dates_intersection = train_dates_intersection.intersection(df_temperature_ts.index)
        train_dates = pd.to_datetime([
            date_val for date_val in train_dates_intersection
            if any(date_val in x for x in ranges_train_PO)
        ])

        # Filter into new names so the source frames are not overwritten: every
        # region (and every re-run of the cell) starts from the same inputs.
        covid_po = df_covid_ts.loc[df_covid_ts.index.isin(train_dates)]
        traffic_po = df_traffic_ts.loc[df_traffic_ts.index.isin(train_dates)]
        temperature_po = df_temperature_ts.loc[df_temperature_ts.index.isin(train_dates)]

        # Targets: R_mean shifted `lag` rows into the future.
        for lag in lags_target:
            target = "target_{}".format(lag)
            targets.append(target)
            df_ts[target] = df_target_ts.shift(-1*lag)[target_col]

        # Features: every KPI shifted `lag + 1` rows into the past.
        for lag in lags:
            lag_shift = lag+1
            for col in trafficKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = traffic_po.shift(lag_shift).loc[:, col]
                features.append(feature)
            for col in covidKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = covid_po.shift(lag_shift)[col]
                features.append(feature)
            for col in temperatureKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = temperature_po.shift(lag_shift)[col]
                features.append(feature)

        # Rows with any missing target or feature are discarded.
        df_ts = df_ts[targets+features].dropna()
        print("Intervalli regione {}: {} -> {}".format(region, df_ts.index.min(), df_ts.index.max()))

        for lag in lags_target:
            target = "target_{}".format(lag)
            targets_vals = df_ts[target].values

            xgtrain = DMatrix(df_ts[features].values, targets_vals)
            model = train(params, xgtrain)
            models_regions[region][lag] = model

In [None]:
# Then train during the second wave ("SO"): per region, build lagged features,
# split them into train/test dates and fit one booster per target lag, continuing
# from an already stored booster for that region/lag when there is one.
train_dates_region_every_n = {}
test_dates_region_every_n = {}
train_dates_region = {}
test_dates_region = {}
df_ts_test_region = {}
all_dfs = []
print("SECONDA ONDATA")

for region in regions_to_train:
    df_traffic_ts = df_traffic_predictions.loc[df_traffic_predictions.index.get_level_values(1)==region, trafficKPIs].copy()
    df_covid_ts = df_covid_predictions.loc[df_covid_predictions.index.get_level_values(1)==region, covidKPIs].copy()
    df_temperature_ts = df_temperature.loc[df_temperature.index.get_level_values(1)==region, temperatureKPIs].copy()

    # z-score the traffic and temperature KPIs within index level 1 (sample std);
    # the covid KPIs are left as they are.
    df_traffic_ts = df_traffic_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))
    df_temperature_ts = df_temperature_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))

    # Re-index every frame by 'Date' only.
    df_covid_ts = df_covid_ts.reset_index().set_index('Date')
    df_temperature_ts = df_temperature_ts.reset_index().set_index('Date')
    df_traffic_ts = df_traffic_ts.reset_index().set_index('Date')

    # create dataframe with last values
    df_ts = pd.DataFrame()
    features = []
    targets = []
    target_col = "R_mean"
    df_target_ts = df_covid_ts.copy()

    # Datasets may cover different dates: keep only the dates present in all of
    # them. intersection() keeps an index of dates, whereas isin() would turn it
    # into a boolean mask and no date would survive the range check below.
    dates_intersection = df_traffic_ts.index.intersection(df_covid_ts.index)
    if len(temperatureKPIs) > 0:
        dates_intersection = dates_intersection.intersection(df_temperature_ts.index)
    so_dates = pd.to_datetime([
        date_val for date_val in dates_intersection
        if any(date_val in x for x in ranges_so)
    ])

    df_covid_ts = df_covid_ts.loc[df_covid_ts.index.isin(so_dates)]
    df_traffic_ts = df_traffic_ts.loc[df_traffic_ts.index.isin(so_dates)]
    df_temperature_ts = df_temperature_ts.loc[df_temperature_ts.index.isin(so_dates)]

    # Targets: R_mean shifted `lag` rows into the future.
    for lag in lags_target:
        target = "target_{}".format(lag)
        targets.append(target)
        df_ts[target] = df_target_ts.shift(-1*lag)[target_col]
    # Features: every KPI shifted `lag + 1` rows into the past.
    for lag in lags:
        lag_shift = lag+1
        for col in trafficKPIs:
            feature = "{}_{}".format(col, lag_shift)
            df_ts[feature] = df_traffic_ts.shift(lag_shift).loc[:, col]
            features.append(feature)
        for col in covidKPIs:
            feature = "{}_{}".format(col, lag_shift)
            df_ts[feature] = df_covid_ts.shift(lag_shift)[col]
            features.append(feature)
        for col in temperatureKPIs:
            feature = "{}_{}".format(col, lag_shift)
            df_ts[feature] = df_temperature_ts.shift(lag_shift)[col]
            features.append(feature)

    df_ts = df_ts[targets+features].dropna()
    all_dfs.append(df_ts.copy())
    print("Intervalli regione {}: {} -> {}".format(region, df_ts.index.min(), df_ts.index.max()))

    # Dates inside a training range are used for fitting, the rest for testing.
    test_dates, train_dates = [], []
    for date_val in df_ts.index.unique():
        if any(date_val in x for x in ranges_train_SO):
            train_dates.append(date_val)
        else:
            test_dates.append(date_val)

    df_ts_train, df_ts_test = df_ts.loc[train_dates], df_ts.loc[test_dates]

    # Every `farsightness`-th day between the first and last test date
    # (empty when the region has no test dates).
    if test_dates:
        test_dates_every_n = pd.date_range(start=test_dates[0], end=test_dates[-1], freq="{}D".format(farsightness)).to_pydatetime()
    else:
        test_dates_every_n = np.array([], dtype=object)
    test_dates_region_every_n[region] = test_dates_every_n
    train_dates_region[region], test_dates_region[region] = train_dates, test_dates
    df_ts_test_region[region] = df_ts_test

    region_models = models_regions.setdefault(region, {})
    for lag in lags_target:
        # Labels come from the training rows and from this lag's own target column,
        # so they line up row by row with the feature matrix.
        targets_vals = df_ts_train["target_{}".format(lag)].values
        xgtrain = DMatrix(df_ts_train[features].values, targets_vals)
        # Continue from the stored booster of this region/lag if there is one;
        # otherwise fit from scratch.
        previous_model = region_models.get(lag)
        if previous_model is not None:
            region_models[lag] = train(params, xgtrain, xgb_model=previous_model)
        else:
            region_models[lag] = train(params, xgtrain)

In [None]:
# Display the traffic frame from which the per-region traffic features are taken.
df_traffic_predictions

In [None]:
# Walk-forward evaluation on the test dates of every region: predict each target
# lag from the row of date `t`, record prediction and target, then update the
# booster with that row before moving to the next date.
results_dict = []

for r in regions_to_train:
    for lag in lags_target:
        test_predictions[r][lag] = np.array([])
        test_targets[r][lag] = np.array([])

for region in regions_to_train:
    test_dates = test_dates_region[region]
    df_ts_test = df_ts_test_region[region]
    print("{}: {}".format(region, df_ts_test.shape))
    for i, t in enumerate(test_dates):
        # Keep the columns in `features` order: the boosters were fitted on
        # df[features].values, so re-sorting the labels would misalign the inputs.
        X_test_ts = df_ts_test.loc[t, features]
        X_test_row = X_test_ts.to_numpy().reshape(1, -1)

        for lag in lags_target:
            predictions = models_regions[region][lag].predict(DMatrix(X_test_row))

            # Look the target column up by name; `targets` is a list and must not
            # be indexed with the lag value itself.
            y_test_fit = df_ts_test.loc[t, "target_{}".format(lag)]
            results_dict.append({"date": t + datetime.timedelta(days=lag), "lag": lag, "region": region, "prediction": predictions[0], "target": y_test_fit})

            # The last test date has no later prediction to benefit from an update.
            if i < len(test_dates)-1:
                xgtrain = DMatrix(X_test_row, np.asarray(y_test_fit).reshape(1, -1))
                models_regions[region][lag] = train(params, xgtrain, xgb_model=models_regions[region][lag])

# Collect the records into the frame read by the dashboard and box-plot cells:
# index = (date, region, lag); columns = prediction, target, error, error_2.
df_results = pd.DataFrame(results_dict, columns=["date", "lag", "region", "prediction", "target"])
df_results = df_results.dropna()
df_results = df_results.set_index(['date', 'region', 'lag'])
df_results['error'] = (df_results['prediction']-df_results['target']).abs()
df_results['error_2'] = df_results['error']**2

In [None]:
name = "XGBoost"
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app_results = JupyterDash(name, external_stylesheets=external_stylesheets)

# Layout: three dropdowns (regions, target lag, plot type) stacked above one graph.
app_results.layout = html.Div([
    html.Label(
        [
            "Regione",
            dcc.Dropdown(id="regions",
                         options=[{"label": x, "value": x} for x in regions_to_train],
                         # Default to Lombardia when it is one of the options,
                         # otherwise to the first trained region.
                         value=["Lombardia" if "Lombardia" in regions_to_train else regions_to_train[0]],
                         multi=True,
                         clearable=True)
        ]),

    html.Label(
        [
            "Lag",
            dcc.Dropdown(id="lags",
                         options=[{"label": x, "value": x} for x in lags_target],
                         value=lags_target[-1],
                         clearable=True)
        ]),

    html.Label(
        [
            "Tipologia plot",
            dcc.Dropdown(id="plot_type",
                         options=[{"label": x, "value": x} for x in ["Timeseries", "Errore assoluto"]],
                         value="Timeseries",
                         clearable=False)
        ]),
    html.Div(dcc.Graph(id=name))])

@app_results.callback(
Output(name, "figure"), 
[Input("regions", "value"), Input("lags", "value"), Input("plot_type", "value")])
def display_rf_results(regions, lag, plot_type):
    """Build the figure shown by the XGBoost results dashboard.

    Parameters
    ----------
    regions : str | list[str] | None
        Region(s) picked in the "regions" dropdown; ``None`` falls back to
        ``'Lombardia'``.
    lag : hashable | None
        Value picked in the "lags" dropdown; ``None`` (dropdown cleared)
        yields an empty figure.
    plot_type : str
        ``"Errore assoluto"`` plots the ``error`` column; any other value
        plots ``prediction`` and ``target``.

    Returns
    -------
    plotly.graph_objects.Figure
        One or two line traces per region that has rows for ``lag``.
    """
    if isinstance(regions, str):
        regions = [regions]

    if regions is None:
        regions = ['Lombardia']

    is_timeseries_error = plot_type == "Errore assoluto"

    R_colors = px.colors.qualitative.Dark24
    y_hat_colors = px.colors.qualitative.Alphabet
    fig = make_subplots(specs=[[{"secondary_y": False}]])

    # The "Lag" dropdown is clearable: without a lag there is nothing to select.
    if lag is None:
        return fig

    for i, r in enumerate(regions):
        # xs() selects on the named levels and drops them, leaving the dates as index.
        # A (region, lag) pair with no rows is skipped instead of failing the callback.
        try:
            current_df_results = df_results.xs((r, lag), level=('region', 'lag'))
        except KeyError:
            continue

        # Wrap around the palettes so a long selection cannot run past their end.
        target_color = R_colors[i % len(R_colors)]
        prediction_color = y_hat_colors[i % len(y_hat_colors)]

        if is_timeseries_error:
            fig.add_trace(
                go.Scatter(
                    x=current_df_results.index,
                    y=current_df_results['error'],
                    name="Y - {}".format(r),
                    marker=dict(color=target_color),
                    showlegend=False,
                )
            )
        else:
            fig.add_trace(
                go.Scatter(
                    x=current_df_results.index,
                    y=current_df_results['prediction'],
                    name="Prediction - {}".format(r),
                    marker=dict(color=prediction_color),
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=current_df_results.index,
                    y=current_df_results['target'],
                    name="Target - {}".format(r),
                    marker=dict(color=target_color),
                )
            )

    return fig

#app_timeseries = build_app_timeseries(df_traffic_daily_SO, df_covid_SO)
# Start the dashboard in 'inline' mode on port 46004.
app_results.run_server(mode='inline', port=46004) # debug=True, use_reloader=False

In [None]:
# RMSE per (region, lag); the box plot then shows its spread across regions for each lag.
df_results['error_2'] = df_results['error']**2
df_results_mean = np.sqrt(df_results.groupby(level=['region', 'lag'])['error_2'].mean()).reset_index()

col='error_2'
fig = px.box(df_results_mean, x="lag", y=col)

# Aggregate only the error column: df_results_mean also carries the string
# 'region' column, which cannot be averaged.
means = df_results_mean.groupby(by=["lag"])[[col]].mean()
medians = df_results_mean.groupby(by=["lag"])[[col]].median()

fig.add_trace(go.Scatter(x=means.index, y=means[col],
                mode='lines',
                name='Mean'))

# Add figure title; the y axis runs from 0 to the largest RMSE (a scalar,
# not the per-column maxima of the whole frame).
fig.update_layout(
    title_text="Boxplot errors",
    yaxis=dict(
        range=[0, df_results_mean[col].max()]
    )
)

fig.show()

# TODO: this still has to be merged back into the supervised part

In [None]:
# Build the supervised feature frame `df`: one column per (KPI, lag) pair, where
# each column is the KPI shifted by `lag` rows within its index-level-1 group.
farsightness = 7
# Lags 7, 9 and 11 for both covid and traffic features; no temperature lags.
lags_range_covid = range(farsightness, 12, 2)#[7, 9, 11]#range(10, 20, 5)
lags_range_traffic = range(farsightness, 12, 2)#range(10, 20, 5)
lags_range_temperature = []
covisPredictionKPIs = ['R_mean']
tempPredictionKPIs = []#['prec', 'tmin']
# Feature selection: 
# - lags (CV)                                -> (tuning) -> first, compare correlation; then select lags range that give best results in random
# - covid: R_mean, terapie_intensive;        -> add columns properly -> ok
# - polynomial                               -> for linear regression model, add features ^2, ^3 ...
# - differentiate lags for type of feature   -> covid: (10, 20); temperature: (10, 16); -> (tuning)
# - rolling avg                              -> target + features; -> not improvement
# Only the traffic KPIs whose name contains "smooth" are used as predictors.
trafficPredictionKPIs = [t for t in trafficKPIs if "smooth" in t]
df = pd.DataFrame()
features_covid, features_traffic, features_temp = [], [], []

# The rolling average is currently disabled: the *_roll names alias the source frames.
rolling_avg_k = 1
df_traffic_predictions_roll = df_traffic_predictions#[trafficPredictionKPIs].rolling(rolling_avg_k).mean()
df_temperature_roll = df_temperature#[tempPredictionKPIs].rolling(rolling_avg_k).mean()
df_covid_roll = df_covid_predictions#[covisPredictionKPIs].rolling(rolling_avg_k).mean()
    
# Covid features: sort by (level 1, level 0) and shift inside each level-1 group.
for lag in lags_range_covid:
    df_covid_shift = df_covid_roll.sort_index(level=[1, 0]).groupby(level=1).shift(lag)
    for covidKPI in covisPredictionKPIs:
        kpi = '{}_{}'.format(covidKPI, lag)
        df[kpi] = df_covid_shift[covidKPI]
        features_covid.append(kpi)
        
# Traffic features, built the same way.
for lag in lags_range_traffic:
    df_traffic_shift = df_traffic_predictions_roll.sort_index(level=[1, 0]).groupby(level=1).shift(lag)
    for trafficKPI in trafficPredictionKPIs:
        kpi = '{}_{}'.format(trafficKPI, lag)
        df[kpi] = df_traffic_shift[trafficKPI]
        features_traffic.append(kpi)
        
# Temperature features, built the same way (no-op while the lag list is empty).
for lag in lags_range_temperature:
    df_temperature_shift = df_temperature_roll.sort_index(level=[1, 0]).groupby(level=1).shift(lag)
    for meteoKPI in tempPredictionKPIs:
        kpi = '{}_{}'.format(meteoKPI, lag)
        df[kpi] = df_temperature_shift[meteoKPI]
        features_temp.append(kpi)

In [None]:
# Join the covid columns onto the lagged features by index, then fix the lists
# of predictor and target columns.
df = df.merge(df_covid_roll, left_index=True, right_index=True)
#df.fillna(method='ffill', inplace=True)
features = features_covid + features_traffic + features_temp
targets = ['R_mean']

In [None]:
# (rows, columns) of df at this point of the notebook.
df.shape

In [None]:
# Keep only predictors and targets, treat +/-inf as missing and drop incomplete
# rows. Done as one chain so no in-place operation runs on a derived slice.
columns_to_select = features+targets
df = df[columns_to_select].replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
# (rows, columns) of df at this point of the notebook.
df.shape

In [None]:
# First 14 rows of the columns whose name contains "over".
df.loc[:, [col for col in df.columns if "over" in col]].head(14)

In [None]:
# First 14 rows of the columns whose name contains "mean".
df.loc[:, [col for col in df.columns if "mean" in col]].head(14)

In [None]:
# Rows dated before divider_po or from divider_so onwards are kept.
divider_po = pd.to_datetime('2020-08-01')
divider_so = pd.to_datetime('2020-10-01')

# Date ranges used for training: March-April 2020 and October-November 2020.
ranges_train = [
    pd.date_range(pd.to_datetime('2020-03-01'), pd.to_datetime('2020-05-01')),
    pd.date_range(pd.to_datetime('2020-10-01'), pd.to_datetime('2020-11-30')),
]

# With only_so the rows before divider_so are discarded as well.
only_so = False
if only_so:
    df = df.loc[df.index.get_level_values(0) >= divider_so]
else:
    df = df.loc[
        (df.index.get_level_values(0) < divider_po)
        | (df.index.get_level_values(0) >= divider_so)
    ]

In [None]:
# (rows, columns) of df at this point of the notebook.
df.shape

In [None]:
X = df[features]
y = df[targets]

# Each distinct date (index level 0) goes to the training list when it falls in
# one of ranges_train, otherwise to the test list.
train_dates, test_dates = [], []
for date_val in X.index.get_level_values(0).unique():
    (train_dates if any(date_val in x for x in ranges_train) else test_dates).append(date_val)

In [None]:
# Fit one model per splitting strategy (random rows, by period, by region) and
# store its test RMSE / R2, the fitted model and the predictions on all rows.
do_splits = True
X, y = df[features], df[targets]

dates, regions_df = X.reset_index()['Date'], X.reset_index()['Regione']
if do_splits:
    # Model choice: the first flag set to True wins; none set -> linear regression.
    is_rf = True
    is_gbr = False
    is_gaussian = False
    is_linear = False
    is_mlp = False

    all_regions_prediction = regions
    dict_regions = {region: i for i, region in enumerate(all_regions_prediction)}

    def _region_codes(frame):
        """Integer code of each row's region (index level 1), in row order."""
        return [dict_regions[r] for r in frame.index.get_level_values(1).to_list()]

    splits = {'Random': {}, 'Per periodo': {}, 'Per regione': {}}

    # Random: 70/30 split of the rows.
    splits['Random']['X_train'], splits['Random']['X_test'], splits['Random']['y_train'], splits['Random']['y_test'] = \
                train_test_split(X, y, test_size = 0.3, random_state = 123)
    splits['Random']['X'] = X.copy()

    # Per periodo: rows are assigned by date; the region is added as an integer
    # feature. Copies are taken so the new column is not written onto a slice of X.
    idxs_train = X.index.get_level_values(0).intersection(pd.Series(train_dates))
    idxs_test = X.index.get_level_values(0).intersection(pd.Series(test_dates))
    splits['Per periodo']['X_train'] = X.loc[idxs_train].copy()
    splits['Per periodo']['X_test'] = X.loc[idxs_test].copy()
    splits['Per periodo']['y_train'] = y.loc[idxs_train]
    splits['Per periodo']['y_test'] = y.loc[idxs_test]
    splits['Per periodo']['X'] = X.copy()
    for part in ('X_train', 'X_test', 'X'):
        splits['Per periodo'][part]['Regione'] = _region_codes(splits['Per periodo'][part])

    # Per regione: 30% of the regions are held out entirely.
    regions_train, regions_test = train_test_split(all_regions_prediction, test_size = 0.3, random_state = 1)
    idx_train, idx_test = X.index.get_level_values(1).isin(regions_train), X.index.get_level_values(1).isin(regions_test)
    splits['Per regione']['X_train'], splits['Per regione']['X_test'], splits['Per regione']['y_train'], splits['Per regione']['y_test'] = \
        X[idx_train], X[idx_test], y[idx_train], y[idx_test]
    splits['Per regione']['X'] = X.copy()

    for key in splits.keys():
        # Stochastic estimators are seeded so a re-run reproduces the same scores.
        if is_rf:
            model = RandomForestRegressor(n_estimators=200, random_state=123)
        elif is_gbr:
            model = XGBRegressor()
        elif is_gaussian:
            model = GaussianProcessRegressor()
        elif is_mlp:
            model = MLPRegressor(hidden_layer_sizes=(64, 32), random_state=123)
        else:
            model = LinearRegression()

        X_train, X_test, y_train, y_test = splits[key]['X_train'], splits[key]['X_test'], splits[key]['y_train'], splits[key]['y_test']
        # Separate name: the global X must not be rebound while iterating.
        X_all = splits[key]['X']
        if key == 'Per periodo':
            print("X: {}".format(X_train.head()))
            print("y: {}".format(y_train.head()))
        # y_train is a one-column frame; the estimators expect a 1-D target.
        model.fit(X_train, y_train.values.ravel())
        predictions = model.predict(X_all).squeeze()
        predictions_test = model.predict(X_test)
        rmse = np.sqrt(MSE(y_test, predictions_test))
        splits[key]['rmse'] = rmse
        splits[key]['r2'] = r2_score(y_test, predictions_test)
        splits[key]['model'] = model

        # Predictions on every row, re-indexed by (Date, Regione) and joined with y.
        df_preds = pd.DataFrame({'y_hat': predictions, 'Date': dates, 'Regione': regions_df})
        df_preds = df_preds.set_index(['Date', 'Regione'])

        splits[key]['results'] = pd.merge(y, df_preds, left_index=True, right_index=True)

In [None]:
# Regions held out by the 'Per regione' split.
print(regions_test)

In [None]:
if do_splits:
    name = "Random Forest"
    external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
    app_timeseries = JupyterDash(name, external_stylesheets=external_stylesheets)

    # "Tutti" is an extra option that selects every region at once.
    regions_select = list(all_regions_prediction)
    regions_select.append("Tutti")

    app_timeseries.layout = html.Div([
    html.Label(
        [
            "Regione",
            dcc.Dropdown(id="regions",
                         options=[{"label": x, "value": x} for x in regions_select],
                        # A multi-select dropdown takes a list as its value.
                        value=["Lombardia"],
                        multi=True,
                        clearable=True)
        ],
    ),
    html.Label(
        [
            "Tipologia plot",
            dcc.Dropdown(id="plot_type",
                         options=[{"label": x, "value": x} for x in ["Timeseries", "Errore assoluto", "RMSE", "R2"]],
                        value="Timeseries",
                        clearable=False)
        ]),
    html.Label(
        [
            "Tipologia split",
            dcc.Dropdown(id="split_type",
                         options=[{"label": x, "value": x} for x in list(splits.keys())],
                        value=list(splits.keys())[1],
                        clearable=False)
        ]),

        html.Label(
                    ["Rolling average window ",
                    html.Br(),
                    dcc.Input(
                        id='rolling_avg',
                        type='number',
                        value=1
                    )]
        ),
    html.Div(dcc.Graph(id=name))])

    @app_timeseries.callback(
    Output(name, "figure"), 
    [Input("regions", "value"), Input("plot_type", "value"), Input("split_type", "value"), Input("rolling_avg", "value")])
    def display_rf_results(regions, plot_type, split_type, roll_avg):
        """Build the figure of the split-comparison dashboard.

        Parameters
        ----------
        regions : str | list[str] | None
            Selected region(s); ``None`` or a selection containing "Tutti"
            means every region in ``all_regions_prediction``.
        plot_type : str | None
            "Timeseries" (default), "Errore assoluto", "RMSE" or "R2".
        split_type : str
            Key of ``splits`` whose stored results are plotted.
        roll_avg : int | float | None
            Rolling-mean window for the "Timeseries" plot; missing or
            smaller than 1 means 1, fractional values are truncated.

        Returns
        -------
        plotly.graph_objects.Figure
            Traces restricted to the dates from ``divider_so`` onwards.
        """
        if isinstance(regions, str):
            regions = [regions]

        if regions is None:
            regions = ['Tutti']

        if plot_type is None:
            plot_type = "Timeseries"

        # The numeric input can deliver a float; rolling() needs an integer >= 1.
        if roll_avg is None or roll_avg < 1:
            roll_avg = 1
        roll_avg = int(roll_avg)

        if "Tutti" in regions:
            regions = all_regions_prediction

        split_results = splits[split_type]['results']
        df_results = split_results['mean'] if isinstance(split_results, dict) else split_results

        is_r2, is_rmse, is_timeseries_error = plot_type == "R2", plot_type == "RMSE", plot_type == "Errore assoluto"

        R_colors = px.colors.qualitative.Dark24
        y_hat_colors = px.colors.qualitative.Alphabet
        fig = make_subplots(specs=[[{"secondary_y": False}]])

        def add_line(x_vals, y_vals, label, color):
            """Append one legend-less line trace to ``fig``."""
            fig.add_trace(
                go.Scatter(
                    x=x_vals,
                    y=y_vals,
                    name=label,
                    marker=dict(color=color),
                    showlegend=False,
                )
            )

        for i, r in enumerate(regions):

            current_df_results = df_results[df_results.index.get_level_values(1) == r]
            current_df_results_so = current_df_results[current_df_results.index.get_level_values(0) >= divider_so]
            if current_df_results_so.shape[0] == 0:
                continue

            x_vals_so = current_df_results_so.index.get_level_values(0)
            target_so, y_hat_so = current_df_results_so['R_mean'], current_df_results_so['y_hat']
            # Wrap around the palettes so selecting many regions cannot overrun them.
            target_color = R_colors[i % len(R_colors)]
            y_hat_color = y_hat_colors[i % len(y_hat_colors)]

            if is_timeseries_error:
                add_line(x_vals_so, np.abs(target_so-y_hat_so), "Y - {}".format(r), target_color)
            elif is_rmse or is_r2:
                if is_rmse:
                    constant_so = np.sqrt(MSE(target_so, y_hat_so))
                else:
                    constant_so = r2_score(target_so, y_hat_so)
                # The score is a single number: draw it as a horizontal line.
                add_line(x_vals_so, np.repeat(constant_so, len(x_vals_so)), "Y - {}".format(r), y_hat_color)
            else:
                add_line(x_vals_so, target_so.rolling(roll_avg).mean(), "Y - {}".format(r), target_color)
                add_line(x_vals_so, y_hat_so.rolling(roll_avg).mean(), "Y_hat - {}".format(r), y_hat_color)

        # Add figure title
        # NOTE(review): trafficKPI / covidKPI are notebook globals, not inputs of
        # this callback; confirm they hold the values the title should show.
        fig.update_layout(
            title_text="{} vs {}".format(trafficKPI, covidKPI)
        )

        for dateRange in ranges_train:
            fig = fill_with_areas(dateRange, fig, True)

        return fig

    #app_timeseries = build_app_timeseries(df_traffic_daily_SO, df_covid_SO)
    app_timeseries.run_server(mode='inline', port=46002) # debug=True, use_reloader=False

In [None]:
if do_splits:
    # Interactive dashboard comparing the target column `R_mean` with the
    # predictions `y_hat` stored under splits[<split>]['results'].
    name = "Random Forest"  # used both as the Dash app name and as the Graph component id
    external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
    app_timeseries = JupyterDash(name, external_stylesheets=external_stylesheets)

    # "Tutti" is a pseudo-region that the callback expands to every region.
    regions_select = list(all_regions_prediction)
    regions_select.append("Tutti")

    app_timeseries.layout = html.Div([
        html.Label(
            [
                "Regione",
                dcc.Dropdown(id="regions",
                             options=[{"label": x, "value": x} for x in regions_select],
                             value="Lombardia",
                             multi=True,
                             clearable=True)
            ],
        ),
        html.Label(
            [
                "Tipologia plot",
                dcc.Dropdown(id="plot_type",
                             options=[{"label": x, "value": x} for x in ["Timeseries", "Errore assoluto", "RMSE", "R2"]],
                             value="Timeseries",
                             clearable=False)
            ]),
        html.Label(
            [
                "Tipologia split",
                dcc.Dropdown(id="split_type",
                             options=[{"label": x, "value": x} for x in list(splits.keys())],
                             value=list(splits.keys())[1],
                             clearable=False)
            ]),
        html.Label(
            ["Rolling average window ",
             html.Br(),
             dcc.Input(
                 id='rolling_avg',
                 type='number',
                 value=1
             )]
        ),
        html.Div(dcc.Graph(id=name))])

    @app_timeseries.callback(
        Output(name, "figure"),
        [Input("regions", "value"), Input("plot_type", "value"), Input("split_type", "value"), Input("rolling_avg", "value")])
    def display_rf_results(regions, plot_type, split_type, roll_avg):
        """Build the figure for the current dropdown / input selection.

        Args:
            regions: value of the "regions" dropdown: a single name, a list
                of names, or None when the selection has been cleared.
            plot_type: one of "Timeseries", "Errore assoluto", "RMSE", "R2".
            split_type: key of `splits` whose results are plotted.
            roll_avg: rolling-mean window (in rows) used by the "Timeseries"
                plot; None, non-positive or fractional values are coerced to a
                positive integer.

        Returns:
            A plotly figure with, per region, one group of traces for the rows
            dated before `divider_po` and one for the rows dated from
            `divider_so` onwards. Regions lacking rows in either group are
            skipped.
        """
        if isinstance(regions, str):
            regions = [regions]
        if regions is None:
            regions = ['Tutti']
        if plot_type is None:
            plot_type = "Timeseries"
        # A numeric input can deliver a float (e.g. 2.5), which
        # Series.rolling() rejects; truncate and clamp to at least 1.
        roll_avg = 1 if roll_avg is None else max(1, int(roll_avg))
        if "Tutti" in regions:
            regions = all_regions_prediction

        # Multi-lag runs store a dict (averaged predictions under 'mean');
        # single-model runs store the merged frame directly.
        results = splits[split_type]['results']
        df_results = results['mean'] if isinstance(results, dict) else results

        is_metric = plot_type in ("RMSE", "R2")
        is_timeseries_error = plot_type == "Errore assoluto"

        R_colors = px.colors.qualitative.Dark24
        y_hat_colors = px.colors.qualitative.Alphabet
        fig = make_subplots(specs=[[{"secondary_y": False}]])

        def add_line(x, y, label, color, showlegend=None):
            # showlegend=None keeps plotly's default legend behaviour.
            fig.add_trace(
                go.Scatter(x=x, y=y, name=label, marker=dict(color=color), showlegend=showlegend)
            )

        def segment_metric(segment):
            # Scalar score of y_hat against R_mean for one group of rows.
            if plot_type == "RMSE":
                return np.sqrt(MSE(segment['R_mean'], segment['y_hat']))
            return r2_score(segment['R_mean'], segment['y_hat'])

        for i, r in enumerate(regions):
            # Wrap around the palettes so a long region list cannot overrun them.
            target_color = R_colors[i % len(R_colors)]
            pred_color = y_hat_colors[i % len(y_hat_colors)]

            region_rows = df_results[df_results.index.get_level_values(1) == r]
            region_dates = region_rows.index.get_level_values(0)
            df_po = region_rows[region_dates < divider_po]
            df_so = region_rows[region_dates >= divider_so]
            if len(df_po) == 0 or len(df_so) == 0:
                continue

            # The second group reuses the colours of the first and is hidden
            # from the legend so each region appears only once there.
            for segment, showlegend in ((df_po, None), (df_so, False)):
                x_vals = segment.index.get_level_values(0)
                if is_timeseries_error:
                    add_line(x_vals, np.abs(segment['R_mean'] - segment['y_hat']),
                             "Y - {}".format(r), target_color, showlegend)
                elif is_metric:
                    # Label with the metric actually plotted (was always "RMSE").
                    add_line(x_vals, np.repeat(segment_metric(segment), len(x_vals)),
                             "{} {}".format(plot_type, r), pred_color, showlegend)
                else:
                    add_line(x_vals, segment['R_mean'].rolling(roll_avg).mean(),
                             "Y - {}".format(r), target_color, showlegend)
                    add_line(x_vals, segment['y_hat'].rolling(roll_avg).mean(),
                             "Y_hat - {}".format(r), pred_color, showlegend)

        # Add figure title
        fig.update_layout(
            title_text="{} vs {}".format(trafficKPI, covidKPI)
        )

        for dateRange in ranges_train:
            fig = fill_with_areas(dateRange, fig, True)

        return fig

    app_timeseries.run_server(mode='inline', port=46000) # debug=True, use_reloader=False

In [None]:
# Train one regressor per output lag for each train/test split strategy and
# store the per-lag and lag-averaged predictions in `splits`.
X, y = df[features], df[targets]

dates, regions_df = X.reset_index()['Date'], X.reset_index()['Regione']
do_splits = True
min_lag, max_lag = -5, 5  # wrt current shift; so we are predicting output in range (shift+min_lag, shift+max_lag)
lag_outputs = range(min_lag, max_lag)  # NOTE: range() excludes max_lag itself
if do_splits:
    # Model family selector; the first True flag wins, LinearRegression is the fallback.
    is_rf = False
    is_gbr = False
    is_gaussian = False
    is_linear = False
    is_mlp = True
    all_regions_prediction = regions
    dict_regions = {region: i for i, region in enumerate(all_regions_prediction)}

    splits = {'Random': {}, 'Per periodo': {}, 'Per regione': {}}

    # --- Random split: rows are shuffled by train_test_split.
    splits['Random']['X_train'], splits['Random']['X_test'], splits['Random']['y_train'], splits['Random']['y_test'] = \
                train_test_split(X, y, test_size = 0.3, random_state = 123)
    splits['Random']['X'] = X.copy()

    # --- Split by period: rows whose date is in train_dates / test_dates.
    splits['Per periodo']['X'] = X.copy()
    idxs_train, idxs_test = X.index.get_level_values(0).intersection(pd.Series(train_dates)), X.index.get_level_values(0).intersection(pd.Series(test_dates))
    # .copy() so the 'Regione' column below is added to independent frames
    # rather than to slices of X (avoids SettingWithCopyWarning).
    splits['Per periodo']['X_train'], splits['Per periodo']['X_test'] = X.loc[idxs_train].copy(), X.loc[idxs_test].copy()
    splits['Per periodo']['y_train'], splits['Per periodo']['y_test'] = y.loc[idxs_train], y.loc[idxs_test]

    # Feed the region to the model as an integer code (this split only).
    splits['Per periodo']['X_train']['Regione'] = [dict_regions[r] for r in splits['Per periodo']['X_train'].index.get_level_values(1).to_list()]
    splits['Per periodo']['X_test']['Regione'] = [dict_regions[r] for r in splits['Per periodo']['X_test'].index.get_level_values(1).to_list()]
    splits['Per periodo']['X']['Regione'] = [dict_regions[r] for r in X.index.get_level_values(1).to_list()]

    # --- Split by region: whole regions are held out.
    regions_train, regions_test = train_test_split(all_regions_prediction, test_size = 0.3, random_state = 1)
    idx_train, idx_test = X.index.get_level_values(1).isin(regions_train), X.index.get_level_values(1).isin(regions_test)
    splits['Per regione']['X_train'], splits['Per regione']['X_test'], splits['Per regione']['y_train'], splits['Per regione']['y_test'] = \
        X[idx_train], X[idx_test], y[idx_train], y[idx_test]

    splits['Per regione']['X'] = X.copy()

    for key in splits.keys():
        if is_rf:
            models = [RandomForestRegressor(n_estimators=200) for _ in lag_outputs]
        elif is_gbr:
            models = [XGBRegressor() for _ in lag_outputs]
        elif is_gaussian:
            models = [GaussianProcessRegressor() for _ in lag_outputs]
        elif is_mlp:
            models = [MLPRegressor(hidden_layer_sizes=(64, 32)) for _ in lag_outputs]
        else:
            models = [LinearRegression() for _ in lag_outputs]

        X_train, X_test, y_train, y_test = splits[key]['X_train'], splits[key]['X_test'], splits[key]['y_train'], splits[key]['y_test']
        # Full feature matrix for this split; a separate name is used so the
        # global X defined at the top of the cell is not rebound by the loop.
        X_all = splits[key]['X']
        splits[key]['models'] = []
        splits[key]['rmse'] = []
        splits[key]['results'] = {}
        df_all_results = []
        for i, lag in enumerate(lag_outputs):
            # Shift the target by `lag` rows within each region.
            # NOTE(review): shift() follows row order, and the 'Random' split
            # is shuffled, so there the lag is not a calendar lag - confirm
            # whether y should be shifted before splitting.
            current_y = y_train.groupby('Regione').shift(lag)
            # Shifting leaves NaN targets at the edge of every region, and
            # scikit-learn estimators refuse to fit on NaN targets: keep only
            # the rows that still have a target (positional mask, so X and y
            # stay aligned row by row).
            has_target = current_y.notna().to_numpy().reshape(len(current_y), -1).all(axis=1)
            current_X, current_y = X_train[has_target], current_y[has_target]
            models[i].fit(current_X, current_y)
            predictions = models[i].predict(X_all).squeeze()
            # NOTE(review): the score compares against the unshifted y_test.
            rmse = np.sqrt(MSE(y_test, models[i].predict(X_test)))
            splits[key]['rmse'].append(rmse)

            df_preds = pd.DataFrame({'y_hat': predictions, 'Date': dates, 'Regione': regions_df})
            df_preds['lag'] = lag
            df_preds = df_preds.set_index(['lag', 'Date', 'Regione'])
            df_all_results.append(df_preds)

        splits[key]['models'] = models
        splits[key]['results']['all_results'] = pd.concat(df_all_results)
        # Average the per-lag predictions for every (Date, Regione).
        # TODO: the mean should already ignore NaNs.
        df_preds = splits[key]['results']['all_results'].groupby(['Date', 'Regione']).mean()
        splits[key]['results']['mean'] = pd.merge(y, df_preds, left_index=True, right_index=True)

In [None]:
if do_splits:
    # Interactive dashboard comparing the target column `R_mean` with the
    # predictions `y_hat` stored under splits[<split>]['results'].
    name = "Random Forest"  # used both as the Dash app name and as the Graph component id
    external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
    app_timeseries = JupyterDash(name, external_stylesheets=external_stylesheets)

    # "Tutti" is a pseudo-region that the callback expands to every region.
    regions_select = list(all_regions_prediction)
    regions_select.append("Tutti")

    app_timeseries.layout = html.Div([
        html.Label(
            [
                "Regione",
                dcc.Dropdown(id="regions",
                             options=[{"label": x, "value": x} for x in regions_select],
                             value="Lombardia",
                             multi=True,
                             clearable=True)
            ],
        ),
        html.Label(
            [
                "Tipologia plot",
                dcc.Dropdown(id="plot_type",
                             options=[{"label": x, "value": x} for x in ["Timeseries", "Errore assoluto", "RMSE"]],
                             value="Timeseries",
                             clearable=False)
            ]),
        html.Label(
            [
                "Tipologia split",
                dcc.Dropdown(id="split_type",
                             options=[{"label": x, "value": x} for x in list(splits.keys())],
                             value=list(splits.keys())[1],
                             clearable=False)
            ]),
        html.Label(
            ["Rolling average window ",
             html.Br(),
             dcc.Input(
                 id='rolling_avg',
                 type='number',
                 value=1
             )]
        ),
        html.Div(dcc.Graph(id=name))])

    @app_timeseries.callback(
        Output(name, "figure"),
        [Input("regions", "value"), Input("plot_type", "value"), Input("split_type", "value"), Input("rolling_avg", "value")])
    def display_rf_results(regions, plot_type, split_type, roll_avg):
        """Build the figure for the current dropdown / input selection.

        Args:
            regions: value of the "regions" dropdown: a single name, a list
                of names, or None when the selection has been cleared.
            plot_type: one of "Timeseries", "Errore assoluto", "RMSE".
            split_type: key of `splits` whose results are plotted.
            roll_avg: rolling-mean window (in rows) used by the "Timeseries"
                plot; None, non-positive or fractional values are coerced to a
                positive integer.

        Returns:
            A plotly figure with, per region, one group of traces for the rows
            dated before `divider_po` and one for the rows dated from
            `divider_so` onwards. Regions lacking rows in either group are
            skipped.
        """
        if isinstance(regions, str):
            regions = [regions]
        if regions is None:
            regions = ['Tutti']
        if plot_type is None:
            plot_type = "Timeseries"
        # A numeric input can deliver a float (e.g. 2.5), which
        # Series.rolling() rejects; truncate and clamp to at least 1.
        roll_avg = 1 if roll_avg is None else max(1, int(roll_avg))
        if "Tutti" in regions:
            regions = all_regions_prediction

        # Multi-lag runs store a dict (averaged predictions under 'mean');
        # single-model runs store the merged frame directly.
        results = splits[split_type]['results']
        df_results = results['mean'] if isinstance(results, dict) else results

        is_rmse = plot_type == "RMSE"
        is_timeseries_error = plot_type == "Errore assoluto"

        R_colors = px.colors.qualitative.Dark24
        y_hat_colors = px.colors.qualitative.Alphabet
        fig = make_subplots(specs=[[{"secondary_y": False}]])

        def add_line(x, y, label, color, showlegend=None):
            # showlegend=None keeps plotly's default legend behaviour.
            fig.add_trace(
                go.Scatter(x=x, y=y, name=label, marker=dict(color=color), showlegend=showlegend)
            )

        for i, r in enumerate(regions):
            # Wrap around the palettes so a long region list cannot overrun them.
            target_color = R_colors[i % len(R_colors)]
            pred_color = y_hat_colors[i % len(y_hat_colors)]

            region_rows = df_results[df_results.index.get_level_values(1) == r]
            region_dates = region_rows.index.get_level_values(0)
            df_po = region_rows[region_dates < divider_po]
            df_so = region_rows[region_dates >= divider_so]
            if len(df_po) == 0 or len(df_so) == 0:
                continue

            # The second group reuses the colours of the first and is hidden
            # from the legend so each region appears only once there.
            for segment, showlegend in ((df_po, None), (df_so, False)):
                x_vals = segment.index.get_level_values(0)
                if is_timeseries_error:
                    add_line(x_vals, np.abs(segment['R_mean'] - segment['y_hat']),
                             "Y - {}".format(r), target_color, showlegend)
                elif is_rmse:
                    # Computed only when requested, then drawn as a flat line.
                    rmse = np.sqrt(MSE(segment['R_mean'], segment['y_hat']))
                    add_line(x_vals, np.repeat(rmse, len(x_vals)),
                             "RMSE {}".format(r), pred_color, showlegend)
                else:
                    add_line(x_vals, segment['R_mean'].rolling(roll_avg).mean(),
                             "Y - {}".format(r), target_color, showlegend)
                    add_line(x_vals, segment['y_hat'].rolling(roll_avg).mean(),
                             "Y_hat - {}".format(r), pred_color, showlegend)

        # Add figure title
        fig.update_layout(
            title_text="{} vs {}".format(trafficKPI, covidKPI)
        )

        for dateRange in ranges_train:
            fig = fill_with_areas(dateRange, fig, True)

        return fig

    app_timeseries.run_server(mode='inline', port=46000) # debug=True, use_reloader=False

In [None]:
# TODO try without weather features -> gets worse
# TODO try without mobility features -> obviously gets worse
# TODO try feeding a series of traffic data (last 10 time steps, not only the one from 15 days earlier) -> improves
# TODO try feeding COVID data as input -> improves
# TODO add more regions -> ok
# TODO generalization over PO - SO: give region in input as categorical -> OK
# TODO try more models: SVM, Polynomial, MLP
# TODO try with rolling avg as target
# TODO get temperatures for rest of regions
# TODO try more COVID features
# TODO select best lag interval for features -> cross-validation
# TODO validate r_mean values eventually checking other methodologies
# TODO classification into zones (yellow, orange, red) ->
# -> talk to Andrea
# rolling avg = 3 before training ->
# TODO correlation with R_t

In [None]:
# Number of feature columns in the 'Per periodo' feature matrix.
splits['Per periodo']['X'].shape[1]

In [None]:
# Rank and plot the most important features of the model fitted on the
# 'Per periodo' split.
# NOTE(review): requires splits['Per periodo']['model'] to exist and to expose
# `estimators_` (true for a fitted RandomForestRegressor) - confirm the cell
# that stores 'model' has been run before this one.
top_n = 30  # how many of the highest-ranked features to report

forest = splits['Per periodo']['model']
importances = forest.feature_importances_
# Spread of each feature's importance across the individual estimators.
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]  # most important first

indices = indices[:top_n]
feature_names = splits['Per periodo']['X_train'].columns

# Print the feature ranking (only the retained top-N, so `indices` is never
# indexed past its end).
print("Feature ranking:")

for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature %d %s (%f)" % (rank, feature_idx, feature_names[feature_idx], importances[feature_idx]))

# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(len(indices)), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(len(indices)), [feature_names[i] for i in indices], rotation='vertical')
plt.show()

In [None]:
# Train a single regressor per split strategy and store its score, the fitted
# model and the predictions merged with the targets in `splits`.
# Model family selector; the first True flag wins, LinearRegression is the fallback.
is_rf = True
is_gbr = False
is_gaussian = False
is_linear = False


X, y = df[features], df[targets]

dates, regions_df = X.reset_index()['Date'], X.reset_index()['Regione']
all_regions_prediction = df_temperature['Regione'].unique()
dict_regions = {region: i for i, region in enumerate(all_regions_prediction)}

# TODO: for now this is simply plotted per region. It would be nice to test
# validity when generalising across regions, or across periods by taking the
# first wave for all of them.
splits = {'Random': {}, 'Per periodo': {}, 'Per regione': {}}

# --- Random split.
splits['Random']['X_train'], splits['Random']['X_test'], splits['Random']['y_train'], splits['Random']['y_test'] = \
            train_test_split(X, y, test_size = 0.3, random_state = 123)
splits['Random']['X'] = X.copy()

# --- Split by period: train on [divider_train_start, divider_train_end),
# test on everything from divider_train_end onwards.
x_dates, y_dates = X.index.get_level_values(0), y.index.get_level_values(0)
# .copy() so the 'Regione' column below is added to independent frames rather
# than to slices of X (avoids SettingWithCopyWarning).
splits['Per periodo']['X_train'] = X[(divider_train_start <= x_dates) & (x_dates < divider_train_end)].copy()
splits['Per periodo']['X_test'] = X[x_dates >= divider_train_end].copy()
splits['Per periodo']['y_train'] = y[(divider_train_start <= y_dates) & (y_dates < divider_train_end)]
splits['Per periodo']['y_test'] = y[y_dates >= divider_train_end]
splits['Per periodo']['X'] = X.copy()
# Feed the region to the model as an integer code (this split only).
splits['Per periodo']['X_train']['Regione'] = [dict_regions[r] for r in splits['Per periodo']['X_train'].index.get_level_values(1).to_list()]
splits['Per periodo']['X_test']['Regione'] = [dict_regions[r] for r in splits['Per periodo']['X_test'].index.get_level_values(1).to_list()]
splits['Per periodo']['X']['Regione'] = [dict_regions[r] for r in X.index.get_level_values(1).to_list()]

# --- Split by region: whole regions are held out.
regions_train, regions_test = train_test_split(all_regions_prediction, test_size = 0.3, random_state = 1)
idx_train, idx_test = X.index.get_level_values(1).isin(regions_train), X.index.get_level_values(1).isin(regions_test)
splits['Per regione']['X_train'], splits['Per regione']['X_test'], splits['Per regione']['y_train'], splits['Per regione']['y_test'] = \
    X[idx_train], X[idx_test], y[idx_train], y[idx_test]
splits['Per regione']['X'] = X.copy()

# One fresh model per split. (The former outer `for model in models:` loop
# only repeated this work once per element of an unrelated list, and failed
# with NameError when `models` was not defined.)
for key in splits.keys():
    if is_rf:
        model = RandomForestRegressor(n_estimators=200)
    elif is_gbr:
        model = XGBRegressor()
    elif is_gaussian:
        model = GaussianProcessRegressor()
    else:
        model = LinearRegression()

    X_train, X_test, y_train, y_test = splits[key]['X_train'], splits[key]['X_test'], splits[key]['y_train'], splits[key]['y_test']
    # Full feature matrix for this split; a separate name is used so the
    # global X defined above is not rebound by the loop.
    X_all = splits[key]['X']
    model.fit(X_train, y_train)
    predictions = model.predict(X_all).squeeze()
    rmse = np.sqrt(MSE(y_test, model.predict(X_test)))
    splits[key]['rmse'] = rmse
    splits[key]['model'] = model

    df_preds = pd.DataFrame({'y_hat': predictions, 'Date': dates, 'Regione': regions_df})
    df_preds = df_preds.set_index(['Date', 'Regione'])

    splits[key]['results'] = pd.merge(y, df_preds, left_index=True, right_index=True)

In [None]:
# Remove the integer-encoded 'Regione' feature column from X in place.
# errors='ignore' makes the cell safe to re-run: without it a second run
# raises KeyError because the column is already gone.
X.drop(columns=['Regione'], inplace=True, errors='ignore')

In [None]:
# Flatten the index once and pull out the 'Date' and 'Regione' columns
# as plain, positionally-indexed Series.
index_columns = X.reset_index()[['Date', 'Regione']]
dates = index_columns['Date']
regions_df = index_columns['Regione']

In [None]:
# Incrementally retrain an XGBoost model on a growing time window, feeding it
# its own squared error from 7 rows earlier (per region) as an extra feature.
do_splits = True
if do_splits:

    all_regions_prediction = regions
    dict_regions = {region: i for i, region in enumerate(all_regions_prediction)}

    splits = {'Per periodo': {}}
    # 01/10 -> 31/01/2021
    # 01/10 -> 31/12 -> 07/10: X = 06, 05, 04, 03, 02, 01 -> model.fit(X,y)
    # -> amIDuringTrain -> +=1 + loop train
    # -> otherwise -> X = {05, 04}... U {predicted} + loop train

    idxs_train, idxs_test = X.index.get_level_values(0).intersection(pd.Series(train_dates)), X.index.get_level_values(0).intersection(pd.Series(test_dates))
    # .copy() so the 'Regione' column below is added to independent frames
    # rather than to slices of X (avoids SettingWithCopyWarning).
    splits['Per periodo']['X_train'], splits['Per periodo']['X_test'] = X.loc[idxs_train].copy(), X.loc[idxs_test].copy()
    splits['Per periodo']['y_train'], splits['Per periodo']['y_test'] = y.loc[idxs_train], y.loc[idxs_test]

    splits['Per periodo']['X'] = X.copy()
    # Feed the region to the model as an integer code.
    splits['Per periodo']['X_train']['Regione'] = [dict_regions[r] for r in splits['Per periodo']['X_train'].index.get_level_values(1).to_list()]
    splits['Per periodo']['X_test']['Regione'] = [dict_regions[r] for r in splits['Per periodo']['X_test'].index.get_level_values(1).to_list()]

    splits['Per periodo']['X']['Regione'] = [dict_regions[r] for r in X.index.get_level_values(1).to_list()]

    for key in splits.keys():
        model = XGBRegressor()

        # current_X is the frame stored in splits[key]['X'] itself (not a
        # copy), so the 'error_last_week' column is written into the split.
        current_X = splits[key]['X']
        current_y = y.copy()
        start = 7  # not read anywhere in this cell
        current_X['error_last_week'] = np.nan
        start_date, end_date = X.index.get_level_values(0).min(), X.index.get_level_values(0).max()
        # Cut-off dates every 14 days; the first one (== start_date) is dropped
        # because no row is strictly earlier than it.
        ranges = pd.date_range(start_date, end_date, freq="14D")[1:]

        for date in ranges:
            # Everything strictly before the cut-off; .copy() because a column
            # is assigned on this frame below.
            current_train_x = current_X.loc[current_X.index.get_level_values(0) < date].copy()
            current_train_y = current_y.loc[current_y.index.get_level_values(0) < date]['R_mean']

            if date == ranges[0]:
                # No fitted model yet: start from all-zero predictions.
                y_hat = np.zeros(current_train_x.shape[0])
            else:
                # Predictions of the model fitted on the previous, shorter window.
                y_hat = model.predict(current_train_x)

            df_shift = pd.DataFrame({"R_mean": current_train_y})
            df_shift['y_hat'] = y_hat
            # Shift within each region (index level 1): a plain shift(7) over
            # the (Date, Regione) rows would pair a row with another region's
            # values. Assumes each region's rows are in date order - TODO confirm.
            df_shift = df_shift.groupby(level=1).shift(7)
            current_train_x['error_last_week'] = np.power(df_shift['R_mean'] - df_shift['y_hat'], 2)
            model.fit(current_train_x, current_train_y)
            # Persist the feature so the next, longer window sees it.
            current_X.loc[current_train_x.index, 'error_last_week'] = current_train_x['error_last_week']

        # TODO: rolling evaluation over the test period, feeding each step the
        # errors of the preceding week.
        # TODO: store 'rmse', 'r2', 'model' and the merged 'results' frame in
        # splits[key], as the other training cells do.

In [None]:
# Display the cut-off dates currently held in `ranges`.
ranges

In [None]:
# Predict every row of current_X with the current model and keep the result
# both as the `y_hat` array and as a new 'y_hat' column of current_y.
current_y["y_hat"] = y_hat = model.predict(current_X)

In [None]:
# Display the shifted target/prediction frame currently held in `df_shift`.
df_shift

In [None]:
# at time t: (state = {errors_from_last_week}, mobility_time_t_minus_7, ...) -> covid_time_t
current_X.loc[current_X.index.get_level_values(0)>=pd.to_datetime('2021-01-19')]['error_last_week'].isna().sum()

In [None]:
# One figure per region: predicted values (red) against real values (blue).
for r in regions:
    df_region = current_y.xs(r, level=1)
    predictions = df_region['y_hat']
    real_values = df_region['R_mean']
    fig = make_subplots(specs=[[{"secondary_y": False}]])

    # Same trace settings for both series; only data, colour and label differ.
    for series, rgb, label in (
        (predictions, 'rgb(255,0,0)', "Prediction"),
        (real_values, 'rgb(0,0,255)', "Target"),
    ):
        fig.add_trace(
            go.Scatter(
                x=df_region.index,
                y=series,
                line=dict(color=rgb),
                mode='lines',
                name=label
            )
        )

    fig.show()

In [None]:
# Re-point current_X at the stored feature frame for `key` and start again
# from a fresh copy of the targets.
# NOTE(review): the fresh copy does not carry columns previously added to
# current_y (such as 'y_hat').
current_X, current_y = splits[key]['X'], y.copy()

In [None]:
# Show the last rows of the feature frame.
current_X.tail()

In [None]:
# Display `ranges` (not defined in this part of the notebook) as the cell output.
ranges

In [None]:
# Per-row squared error between the target and the stored prediction
# (despite the column name, this is the squared difference, not the raw one).
current_y['error'] = (current_y['R_mean'] - current_y['y_hat'])**2

In [None]:
# Show the rows whose squared error lies within 1e-4 of 0.003474.
# NOTE(review): 0.003474 is a hard-coded value whose origin is not shown here.
current_y.loc[abs(current_y.error-0.003474) <= 0.0001]

In [None]:
# Display the whole feature frame (consider `.head()` if it is large).
current_X

In [None]:
# Expanding-window training of an XGBoost regressor on the Lombardia rows only.
# For every cut-off date in `ranges`, the rows strictly before that date form the
# training window; the model fitted on the previous window predicts the current
# window, is then refitted on it, and prediction vs. target is plotted.
test_ranges = ranges
current_X_lombardia = current_X.loc[current_X.index.get_level_values(1) == "Lombardia"]
current_y_lombardia = current_y.loc[current_y.index.get_level_values(1) == "Lombardia"]
last_length = 0
model = XGBRegressor()
# Tracks whether `model.fit` has run. The original tested `date == ranges[0]`,
# which calls `predict` on an unfitted model when the first cut-off has no rows.
model_is_fitted = False
for i, date in enumerate(test_ranges):
    current_train_x = current_X_lombardia.loc[current_X_lombardia.index.get_level_values(0) < date]
    current_train_y = current_y_lombardia.loc[current_y_lombardia.index.get_level_values(0) < date, 'R_mean']
    # Only act when the window gained rows since the last processed cut-off.
    if current_train_x.shape[0] > last_length:
        last_length = current_train_x.shape[0]
        # Sort by date without mutating the `.loc` slices in place.
        current_train_x = current_train_x.sort_index(level=0)
        current_train_y = current_train_y.sort_index(level=0)
        print("{}/{} -> {}".format(i+1, len(test_ranges), date))
        if model_is_fitted:
            y_hat = model.predict(current_train_x)
        else:
            # No model yet: use an all-zero prediction as a placeholder.
            y_hat = np.zeros(current_train_x.shape[0])

        # Target and prediction shifted by 7 rows. Intended as the basis of an
        # `error_last_week` feature, e[n] = y[n-week] - y_hat[n-week]; that
        # feature is not added to the training data yet. `df_shift` is kept
        # because another cell displays it.
        df_shift = pd.DataFrame({"R_mean": current_train_y})
        df_shift['y_hat'] = y_hat
        df_shift = df_shift.shift(7)

        model.fit(current_train_x, current_train_y)
        model_is_fitted = True

        fig = make_subplots(specs=[[{"secondary_y": False}]])
        window_dates = current_train_y.index.get_level_values(0)

        fig.add_trace(
            go.Scatter(
                x=window_dates,
                y=y_hat,
                line=dict(color='rgb(255,0,0)'),
                mode='lines',
                name="Prediction"
            )
        )

        # This trace shows the real values; it was mislabelled "Prediction".
        fig.add_trace(
            go.Scatter(
                x=window_dates,
                y=current_train_y,
                line=dict(color='rgb(0,0, 255)'),
                mode='lines',
                name="Target"
            )
        )
        fig.show()
    

In [None]:
# Show the first rows of the Lombardia feature frame.
current_X_lombardia.head()

In [None]:
# Restrict features and target to Lombardia and order them by index.
current_X_lombardia = X.loc[X.index.get_level_values(1) == "Lombardia"].sort_index()
current_y_lombardia = y.loc[y.index.get_level_values(1) == "Lombardia"].sort_index()
last_length = 0

# Dates inside any of these ranges are used for training, all others for testing.
ranges_train_ts = [pd.date_range(pd.to_datetime('2020-10-01'), pd.to_datetime('2020-11-30'))]
train_dates, test_dates = [], []

date_level = current_X_lombardia.index.get_level_values(0)
for date_val in date_level.unique():
    in_training_window = any(date_val in window for window in ranges_train_ts)
    (train_dates if in_training_window else test_dates).append(date_val)

idxs_train = date_level.intersection(pd.Series(train_dates))
idxs_test = date_level.intersection(pd.Series(test_dates))

X_train_lom = current_X_lombardia.loc[idxs_train].sort_index(level=0)
X_test_lom = current_X_lombardia.loc[idxs_test]
y_train_lom = current_y_lombardia.loc[idxs_train].sort_index(level=0)

# Autoregressive input: the target of the previous row (first row becomes NaN).
X_train_lom['Rmean_1'] = y_train_lom.shift(1)

model = XGBRegressor()
model.fit(X_train_lom, y_train_lom)

# In-sample predictions; the roll-out below appends the test predictions.
y_hat = model.predict(X_train_lom)

# In-sample fit: training target with the model's predictions on the same rows.
df_plot = y_train_lom.sort_index(level=0)
df_plot['y_hat'] = y_hat

fig = make_subplots(specs=[[{"secondary_y": False}]])
df_plot.sort_index(inplace=True)

plot_dates = df_plot.index.get_level_values(0)
for column, colour, label in (
    ('y_hat', 'rgb(255,0,0)', "Prediction"),
    ('R_mean', 'rgb(0,0, 255)', "Real values"),
):
    fig.add_trace(
        go.Scatter(
            x=plot_dates,
            y=df_plot[column],
            line={'color': colour},
            mode='lines',
            name=label,
        )
    )
fig.show()

# Recursive roll-out over the test dates: each step feeds the previous
# prediction back in as the `Rmean_1` feature and appends the new prediction
# to `y_hat` (which already holds the in-sample predictions).
#test_dates = X_test_lom.index.get_level_values(0).sort_values()

# Seed the recursion with the real target of the last training date.
last_output = y_train_lom.xs(train_dates[-1], level=0)
# Keep only test dates after `divider_so` (defined outside this section).
test_dates = [t for t in test_dates if t > divider_so]

for date in test_dates:
    current_X_test = X_test_lom.xs(date, level=0)
    current_X_test['Rmean_1'] = last_output
    #print(current_X_test)
    prediction = model.predict(current_X_test)
    y_hat = np.append(y_hat, prediction)
    # From the second step on, the model's own output is the lagged input.
    last_output = prediction
    # now train 
    
# X = {mobility_t_1, mobility_t_2, mobility_t_3, }

In [None]:
# Attach the accumulated predictions (`y_hat`) to the Lombardia target from
# 2020-10-01 onwards and display the result.
# NOTE(review): the assignment requires `y_hat` to have exactly one value per
# selected row — confirm the train/test date filters guarantee that.
#y_test_lom = current_y_lombardia.loc[idxs_test]
#y_test_lom = y_test_lom.sort_index(level=0)
#print(y_hat.shape, y_train_lom.shape, y_test_lom.shape)
#print(y_train_lom.index.get_level_values(0))
#print(pd.concat([y_train_lom.to_frame(), y_test_lom.to_frame()]))
df_plot = current_y_lombardia.loc[current_y_lombardia.index.get_level_values(0) >= pd.to_datetime('2020-10-01')]
df_plot = df_plot.sort_index(level=0)
df_plot['y_hat'] = y_hat
df_plot

In [None]:
# Score only the rows from the first test date onwards.
start_date_rmse = min(test_dates)
idx_rmse = df_plot.index.get_level_values(0) >= start_date_rmse
scored_rows = df_plot.loc[idx_rmse]
RMSE = np.sqrt(MSE(scored_rows['y_hat'], scored_rows['R_mean']))

In [None]:
# R^2 on the same rows as the RMSE. `r2_score` expects (y_true, y_pred) and is
# not symmetric, so the target must come first.
R2 = r2_score(df_plot.loc[idx_rmse, 'R_mean'], df_plot.loc[idx_rmse, 'y_hat'])

In [None]:
# Display both scores.
RMSE, R2

In [None]:
# Prediction (red) against real values (blue) over the rows held in `df_plot`.
fig = make_subplots(specs=[[{"secondary_y": False}]])
df_plot.sort_index(level=0, inplace=True)

plot_dates = df_plot.index.get_level_values(0)
for column, colour, label in (
    ('y_hat', 'rgb(255,0,0)', "Prediction"),
    ('R_mean', 'rgb(0,0, 255)', "Real values"),
):
    fig.add_trace(
        go.Scatter(
            x=plot_dates,
            y=df_plot[column],
            line={'color': colour},
            mode='lines',
            name=label,
        )
    )
fig.show()

In [None]:
# TODO this does not give better results; do it on all regions? The loop has to change: first train on all regions, then loop per region
# TODO during the testing phase, also train week by week in addition to testing

In [None]:
# Dates inside any of these ranges are used for training, all others for testing.
ranges_train_ts = [pd.date_range(pd.to_datetime('2020-10-01'), pd.to_datetime('2020-11-30'))]
train_dates, test_dates = [], []

date_level = current_X_lombardia.index.get_level_values(0)
for date_val in date_level.unique():
    in_training_window = any(date_val in window for window in ranges_train_ts)
    (train_dates if in_training_window else test_dates).append(date_val)

idxs_train = date_level.intersection(pd.Series(train_dates))
idxs_test = date_level.intersection(pd.Series(test_dates))

# Restrict the three source frames to the dates seen above ...
all_dates = idxs_train.union(idxs_test)
df_traffic_ts = df_traffic_predictions.loc[all_dates]
df_covid_ts = df_covid_predictions.loc[all_dates]
df_temperature_ts = df_temperature.loc[all_dates]
# ... and then to the Lombardia rows (second index level).
df_traffic_ts = df_traffic_ts.loc[df_traffic_ts.index.get_level_values(1) == "Lombardia"]
df_covid_ts = df_covid_ts.loc[df_covid_ts.index.get_level_values(1) == "Lombardia"]
df_temperature_ts = df_temperature_ts.loc[df_temperature_ts.index.get_level_values(1) == "Lombardia"]
#print("SHAPE: {}".format(df_traffic_ts.shape))
#print("SHAPE: {}".format(df_traffic_ts.shape))
# Supervised matrix: every row holds the KPIs of the previous `delta_features`
# rows as features and R_mean of the current and following rows as targets.
farsightness = 14
delta_features = 14
lags = range(delta_features) # lag 0 -> shift 1, lag 1 -> shift 2, ...
df_ts = pd.DataFrame()
trafficKPIs = [name for name in df_traffic_ts.columns if "smooth" in name]
covidKPIs = [name for name in df_covid_ts.columns if "mean" in name and "delta" not in name]
temperatureKPIs = [name for name in df_temperature_ts.columns if "min" in name]
features = []
targets = []
target_col = "R_mean"
for lag in lags:
    lag_shift = lag + 1
    # Features: each selected KPI shifted `lag_shift` rows into the past.
    for source_df, kpi_cols in (
        (df_traffic_ts, trafficKPIs),
        (df_covid_ts, covidKPIs),
        (df_temperature_ts, temperatureKPIs),
    ):
        shifted = source_df.shift(lag_shift)
        for col in kpi_cols:
            feature = "{}_{}".format(col, lag_shift)
            df_ts[feature] = shifted[col]
            features.append(feature)
    # Target: R_mean `lag` rows ahead (target_0 is the current row).
    target = "target_{}".format(lag)
    targets.append(target)
    df_ts[target] = df_covid_ts.shift(-1 * lag)[target_col]

# Drop rows made incomplete by the shifts, turn 'Regione' into a column, and
# keep only targets followed by features.
df_ts = df_ts.dropna().reset_index(level='Regione')[targets + features]
# Split the lagged frame by date: inside `ranges_train_ts` -> train, else test.
train_dates, test_dates = [], []
for date_val in df_ts.index.unique():
    bucket = train_dates if any(date_val in window for window in ranges_train_ts) else test_dates
    bucket.append(date_val)

# Keep only test dates after `divider_so` (defined outside this section).
test_dates = [t for t in test_dates if t > divider_so]

df_ts_train = df_ts.loc[train_dates]
df_ts_test = df_ts.loc[test_dates]

#print("TRAIN: ", df_ts_train.head(3))
#print("TEST: ", df_ts_test.head(3))

# One XGBoost regressor per target column.
model = MultiOutputRegressor(XGBRegressor())

# Train on one row every `farsightness` days only.
train_dates_every_n = pd.date_range(
    start=train_dates[0],
    end=train_dates[-1],
    freq="{}D".format(farsightness),
)
df_ts_train = df_ts_train.loc[train_dates_every_n]
print(df_ts_train.head())
model.fit(df_ts_train[features], df_ts_train[targets])

y_hat = model.predict(df_ts_train[features])
print(y_hat.shape)
'''

df_plot = y_train_lom
df_plot = df_plot.sort_index(level=0)
df_plot['y_hat'] = y_hat

fig = make_subplots(specs=[[{"secondary_y": False}]])
df_plot.sort_index(inplace=True)

fig.add_trace(
    go.Scatter(
            x=df_plot.index.get_level_values(0),
            y=df_plot['y_hat'],
            line=dict(color='rgb(255,0,0)'),
            mode='lines',
            name="Prediction"
        )
)

fig.add_trace(
    go.Scatter(
            x=df_plot.index.get_level_values(0),
            y=df_plot['R_mean'],
            line=dict(color='rgb(0,0, 255)'),
            mode='lines',
            name="Real values"
        )
)
fig.show()

#test_dates = X_test_lom.index.get_level_values(0).sort_values()

last_output = y_train_lom.xs(train_dates[-1], level=0)

for date in test_dates:
    current_X_test = X_test_lom.xs(date, level=0)
    current_X_test['Rmean_1'] = last_output
    #print(current_X_test)
    prediction = model.predict(current_X_test)
    y_hat = np.append(y_hat, prediction)
    last_output = prediction
    # now train 
'''
# X = {mobility_t_1, mobility_t_2, mobility_t_3, }

In [None]:
# Walk-forward evaluation on the test period, one step every `farsightness`
# days: predict all targets from the row at `t`, then refit on that row.
test_dates_every_n = pd.date_range(start=test_dates[0], end=test_dates[-1], freq="{}D".format(farsightness))
test_predictions = np.array([])
# Consecutive (t, next_t) pairs; the last date has no successor and is skipped.
for t, next_t in zip(test_dates_every_n[:-1], test_dates_every_n[1:]):
    print("{} -> {}".format(t, next_t))
    X_test_ts = df_ts_test.loc[t, features]
    print(X_test_ts)
    predictions = model.predict(X_test_ts.to_numpy().reshape(1, -1))
    test_predictions = np.append(test_predictions, predictions)

    # 2 weeks later: today = next_t, e.g. 2020-10-31 -> 2020-11-14.
    # `dates_fit` was only defined in a commented-out line, so the final print
    # raised NameError (or silently used a stale value from another cell).
    dates_fit = pd.date_range(t, next_t)

    # The row at `t` carries the targets for t, t+1, ... (one per target column).
    X_test_fit, y_test_fit = df_ts_test.loc[t, features], df_ts_test.loc[t, targets]
    # NOTE(review): `fit` on a scikit-learn estimator retrains from scratch, so
    # this replaces the model with one trained on this single row only —
    # confirm this is intended rather than incremental training.
    model.fit(X_test_fit.to_numpy().reshape(1, -1), y_test_fit.to_numpy().reshape(1,-1))
    #X_test_fit, y_test_fit = df_ts_test.loc[dates_fit, features], df_ts_test.loc[dates_fit, targets]
    #model.fit(X_test_fit, y_test_fit)
    print("FIT ON {} -> {}; {}".format(dates_fit[0], dates_fit[-1], X_test_fit.shape))

In [None]:
# evaluation on single region
# Pair the concatenated predictions with the first rows of `target_0`
# (one row per prediction), then report RMSE and R^2 and plot both series.
df_plot = pd.DataFrame({'target': df_ts_test.iloc[0:test_predictions.shape[0]]['target_0']})
df_plot['y_hat'] = test_predictions
df_plot['error'] = (df_plot['y_hat']-df_plot['target']).abs()
# scikit-learn metrics take (y_true, y_pred); the order matters for R^2.
print(np.sqrt(MSE(df_plot['target'], df_plot['y_hat'])))
print(r2_score(df_plot['target'], df_plot['y_hat']))
df_plot.plot()

In [None]:
# Configuration for the per-region multi-output experiment.
ranges_train_ts = [pd.date_range(start=pd.to_datetime('2020-10-01'), end=pd.to_datetime('2020-12-01'))]
train_dates = []
test_dates = []

# One random forest per target column.
model = MultiOutputRegressor(RandomForestRegressor())

# Number of target steps and of lagged feature steps (both driven by `lags`).
farsightness = 7
delta_features = 7
lags = range(delta_features)

# Only the COVID KPIs are used as features here; traffic and temperature are disabled.
trafficKPIs = []
covidKPIs = [name for name in df_covid_predictions.columns if "mean" in name and "delta" not in name]
temperatureKPIs = []

params = dict(objective='reg:linear')

# train during PO
# NOTE(review): although labelled "PO", the rows are selected with
# `ranges_train_ts`, the same window the SO loop uses — confirm the intended range.
train_also_PO = True
if train_also_PO:
    print("ALSO TRAIN DURING PO")
    for region in regions:
        # filter ts by region
        df_traffic_ts = df_traffic_predictions.loc[df_traffic_predictions.index.get_level_values(1)==region]
        df_covid_ts = df_covid_predictions.loc[df_covid_predictions.index.get_level_values(1)==region]
        df_temperature_ts = df_temperature.loc[df_temperature.index.get_level_values(1)==region]
        # create dataframe with last values
        df_ts = pd.DataFrame()
        features = []
        targets = []
        target_col = "R_mean"
        for lag in lags:
            lag_shift = lag+1
            # Features: every selected KPI shifted `lag_shift` rows into the past.
            for col in trafficKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_traffic_ts[col].shift(lag_shift)
                features.append(feature)
            for col in covidKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_covid_ts[col].shift(lag_shift)
                features.append(feature)
            for col in temperatureKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_temperature_ts[col].shift(lag_shift)
                features.append(feature)
            # Target: R_mean `lag` rows ahead (target_0 is the current row).
            target = "target_{}".format(lag)
            targets.append(target)
            df_ts[target] = df_covid_ts[target_col].shift(-1*lag)

        # Drop rows made incomplete by the shifts. The former `all_dfs.append`
        # is removed: `all_dfs` is only created (and reset) after this loop, so
        # the call either raised NameError or filled a list that was discarded.
        df_ts = df_ts[targets+features].dropna()
        df_ts = df_ts.reset_index(level='Regione')
        test_dates, train_dates = [], []

        for date_val in df_ts.index.unique():
            if any(date_val in x for x in ranges_train_ts):
                train_dates.append(date_val)

        df_ts_train = df_ts.loc[train_dates]
        # The former `xgb.DMatrix(train.values, target.values)` lines are removed:
        # `xgb` and `test` are undefined, `train` is the xgboost function and
        # `target` a string, and their results were never used.
        # NOTE(review): `fit` retrains from scratch, so after the loop the model
        # only reflects the last region — confirm this is intended.
        model.fit(df_ts_train[features], df_ts_train[targets])
    
# then during SO
# Per-region bookkeeping read by the evaluation cells.
train_dates_region_every_n = {}
test_dates_region_every_n = {}
train_dates_region = {}
test_dates_region = {}
df_ts_test_region = {}
all_dfs = []

for region in regions:
    # filter ts by region
    df_traffic_ts = df_traffic_predictions.loc[df_traffic_predictions.index.get_level_values(1)==region]
    df_covid_ts = df_covid_predictions.loc[df_covid_predictions.index.get_level_values(1)==region]
    df_temperature_ts = df_temperature.loc[df_temperature.index.get_level_values(1)==region]
    # create dataframe with last values
    df_ts = pd.DataFrame()
    features = []
    targets = []
    target_col = "R_mean"
    for lag in lags:
        lag_shift = lag+1
        # Features: every selected KPI shifted `lag_shift` rows into the past.
        for col in trafficKPIs:
            feature = "{}_{}".format(col, lag_shift)
            df_ts[feature] = df_traffic_ts[col].shift(lag_shift)
            features.append(feature)
        for col in covidKPIs:
            feature = "{}_{}".format(col, lag_shift)
            df_ts[feature] = df_covid_ts[col].shift(lag_shift)
            features.append(feature)
        for col in temperatureKPIs:
            feature = "{}_{}".format(col, lag_shift)
            df_ts[feature] = df_temperature_ts[col].shift(lag_shift)
            features.append(feature)
        # Target: R_mean `lag` rows ahead (target_0 is the current row).
        target = "target_{}".format(lag)
        targets.append(target)
        df_ts[target] = df_covid_ts[target_col].shift(-1*lag)

    # Drop rows made incomplete by the shifts and keep a copy that still has
    # the full index.
    df_ts = df_ts[targets+features].dropna()
    all_dfs.append(df_ts.copy())
    df_ts = df_ts.reset_index(level='Regione')
    test_dates, train_dates = [], []

    # Dates inside `ranges_train_ts` are training dates, all others test dates.
    for date_val in df_ts.index.unique():
        if any(date_val in x for x in ranges_train_ts):
            train_dates.append(date_val)
        else:
            test_dates.append(date_val)

    # Keep only test dates after `divider_so` (defined outside this section).
    test_dates = [t for t in test_dates if t > divider_so]

    df_ts_train, df_ts_test = df_ts.loc[train_dates], df_ts.loc[test_dates]

    # One evaluation date every `farsightness` days across the test period.
    test_dates_every_n = pd.date_range(start=test_dates[0], end=test_dates[-1], freq="{}D".format(farsightness)).to_pydatetime()
    test_dates_region_every_n[region] = test_dates_every_n
    train_dates_region[region], test_dates_region[region] = train_dates, test_dates
    df_ts_test_region[region] = df_ts_test
    # `xgb_model=model` was dropped: MultiOutputRegressor forwards extra fit
    # arguments to the wrapped estimator, which does not accept that keyword,
    # and `model` is not an XGBoost booster.
    # NOTE(review): `fit` retrains from scratch, so after the loop the model
    # only reflects the last region — confirm this is intended.
    model.fit(df_ts_train[features], df_ts_train[targets])

# X = {mobility_t_1, mobility_t_2, mobility_t_3, }

In [None]:
# Walk-forward evaluation per region, one step every `farsightness` days:
# predict all targets from the row at `t`, then refit the shared model on it.
test_predictions = {}

for region in regions:
    test_predictions_region = np.array([])
    test_dates_every_n = test_dates_region_every_n[region]
    df_ts_test = df_ts_test_region[region]
    print("{}: {}".format(region, df_ts_test.shape))
    # Consecutive (t, next_t) pairs; the last date has no successor and is skipped.
    for t, next_t in zip(test_dates_every_n[:-1], test_dates_every_n[1:]):
        print("{}: {} -> {}".format(region, t, next_t))
        # Keep the features in `features` order, the order used for fitting.
        # The former `sort_index(inplace=True)` sorted the feature names
        # alphabetically, which can feed values to the wrong model inputs.
        X_test_ts = df_ts_test.loc[t, features]
        predictions = model.predict(X_test_ts.to_numpy().reshape(1, -1))
        test_predictions_region = np.append(test_predictions_region, predictions)
        # 2 weeks later: today = next_t, e.g. 2020-10-31 -> 2020-11-14.
        dates_fit = pd.date_range(t, next_t)
        # The row at `t` carries the targets for t, t+1, ... (one per target column).
        X_test_fit, y_test_fit = df_ts_test.loc[t, features], df_ts_test.loc[t, targets]
        # NOTE(review): `fit` retrains from scratch, so this replaces the model
        # with one trained on this single row only — confirm this is intended.
        model.fit(X_test_fit.to_numpy().reshape(1, -1), y_test_fit.to_numpy().reshape(1,-1))
        #X_test_fit, y_test_fit = df_ts_test.loc[dates_fit, features], df_ts_test.loc[dates_fit, targets]
        #model.fit(X_test_fit, y_test_fit)
        print("{}: FIT ON {} -> {}; {}".format(region, dates_fit[0], dates_fit[-1], X_test_fit.shape))
    test_predictions[region] = test_predictions_region

In [None]:
# Display the per-region prediction arrays.
test_predictions

In [None]:
# evaluation on single region
# For every region, pair the concatenated predictions with the first rows of
# `target_0` (one row per prediction) and plot them with RMSE / R^2 in the title.
for region in regions:
    current_df_ts_test = df_ts_test_region[region]
    current_test_predictions = test_predictions[region]
    df_plot = pd.DataFrame({'target': current_df_ts_test.iloc[0:current_test_predictions.shape[0]]['target_0']})
    df_plot['y_hat'] = current_test_predictions
    df_plot['error'] = (df_plot['y_hat']-df_plot['target']).abs()
    # scikit-learn metrics take (y_true, y_pred); the order matters for R^2.
    rmse = np.sqrt(MSE(df_plot['target'], df_plot['y_hat']))
    r2 = r2_score(df_plot['target'], df_plot['y_hat'])
    df_plot.plot(title="{}: RMSE -> {}, R2 -> {}".format(region, rmse, r2))

In [None]:
# Configuration for the per-region XGBoost booster experiment.
ranges_train_ts = [pd.date_range(start=pd.to_datetime('2020-10-01'), end=pd.to_datetime('2020-12-01'))]
train_dates = []
test_dates = []

# `delta_features` lagged steps as inputs, `farsightness` target steps.
farsightness = 6
delta_features = 7
lags = range(delta_features)
lags_target = range(farsightness)

# Smoothed traffic KPIs and COVID "mean" KPIs are used; temperature is disabled.
trafficKPIs = [name for name in df_traffic_predictions.columns if "smooth" in name]
covidKPIs = [name for name in df_covid_predictions.columns if "mean" in name and "delta" not in name]
temperatureKPIs = []

params = dict(objective='reg:squarederror')

# One booster per region, keyed by region name.
regions_to_train = regions
models_regions = {}

# train during PO
# Pre-train one booster per region on the PO date range; the SO loop later
# continues training from these boosters.
train_also_PO = True
if train_also_PO:
    print("ALSO TRAIN DURING PO")
    ranges_train_PO = [pd.date_range(pd.to_datetime('2020-03-01'), pd.to_datetime('2020-07-01'))]
    for region in regions_to_train:
        # filter ts by region
        df_traffic_ts = df_traffic_predictions.loc[df_traffic_predictions.index.get_level_values(1)==region]
        df_covid_ts = df_covid_predictions.loc[df_covid_predictions.index.get_level_values(1)==region]
        df_temperature_ts = df_temperature.loc[df_temperature.index.get_level_values(1)==region]

        # Standardise every traffic / temperature column within the region
        # (subtract the mean, divide by the sample standard deviation). The
        # statistics are taken over all rows of the region, not only PO rows.
        df_traffic_ts = df_traffic_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))
        #df_covid_ts = df_covid_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))
        df_temperature_ts = df_temperature_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))

        # create dataframe with last values
        df_ts = pd.DataFrame()
        features = []
        targets = []
        target_col = "R_mean"
        for lag in lags:
            lag_shift = lag+1
            # Features: every selected KPI shifted `lag_shift` rows into the past.
            for col in trafficKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_traffic_ts[col].shift(lag_shift)
                features.append(feature)
            for col in covidKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_covid_ts[col].shift(lag_shift)
                features.append(feature)
            for col in temperatureKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_temperature_ts[col].shift(lag_shift)
                features.append(feature)
        for lag in lags_target:
            # Target: R_mean `lag` rows ahead (target_0 is the current row).
            target = "target_{}".format(lag)
            targets.append(target)
            df_ts[target] = df_covid_ts[target_col].shift(-1*lag)

        # Drop rows made incomplete by the shifts. The former `all_dfs.append`
        # is removed: `all_dfs` is reset right after this block, so the call
        # either raised NameError or filled a list that was discarded.
        df_ts = df_ts[targets+features].dropna()
        df_ts = df_ts.reset_index(level='Regione')
        test_dates, train_dates = [], []

        # Select the PO dates. The original tested `ranges_train_ts` here,
        # leaving `ranges_train_PO` unused and pre-training on the SO window.
        for date_val in df_ts.index.unique():
            if any(date_val in x for x in ranges_train_PO):
                train_dates.append(date_val)

        if not train_dates:
            # Nothing to pre-train on; the SO loop trains this region from scratch.
            print("{}: no rows in the PO range, skipping PO training".format(region))
            continue

        df_ts_train = df_ts.loc[train_dates]
        # Single-output booster: only the furthest target (`targets[-1]`) is learned.
        xgtrain = DMatrix(df_ts_train[features].values, df_ts_train[targets[-1]].values)
        model = train(params, xgtrain)
        models_regions[region] = model
    
# then during SO
# Per-region bookkeeping read by the evaluation cells.
train_dates_region_every_n = {}
test_dates_region_every_n = {}
train_dates_region = {}
test_dates_region = {}
df_ts_test_region = {}
all_dfs = []

for region in regions_to_train:
    # filter ts by region
    df_traffic_ts = df_traffic_predictions.loc[df_traffic_predictions.index.get_level_values(1)==region]
    df_covid_ts = df_covid_predictions.loc[df_covid_predictions.index.get_level_values(1)==region]
    df_temperature_ts = df_temperature.loc[df_temperature.index.get_level_values(1)==region]

    # Standardise every traffic / temperature column within the region
    # (subtract the mean, divide by the sample standard deviation). The
    # statistics are taken over all rows of the region, test period included.
    df_traffic_ts = df_traffic_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))
    #df_covid_ts = df_covid_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))
    df_temperature_ts = df_temperature_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))
    # create dataframe with last values
    df_ts = pd.DataFrame()
    features = []
    targets = []
    target_col = "R_mean"
    for lag in lags:
        lag_shift = lag+1
        # Features: every selected KPI shifted `lag_shift` rows into the past.
        for col in trafficKPIs:
            feature = "{}_{}".format(col, lag_shift)
            df_ts[feature] = df_traffic_ts[col].shift(lag_shift)
            features.append(feature)
        for col in covidKPIs:
            feature = "{}_{}".format(col, lag_shift)
            df_ts[feature] = df_covid_ts[col].shift(lag_shift)
            features.append(feature)
        for col in temperatureKPIs:
            feature = "{}_{}".format(col, lag_shift)
            df_ts[feature] = df_temperature_ts[col].shift(lag_shift)
            features.append(feature)
    for lag in lags_target:
        # Target: R_mean `lag` rows ahead (target_0 is the current row).
        target = "target_{}".format(lag)
        targets.append(target)
        df_ts[target] = df_covid_ts[target_col].shift(-1*lag)

    # Drop rows made incomplete by the shifts and keep a copy that still has
    # the full index.
    df_ts = df_ts[targets+features].dropna()
    all_dfs.append(df_ts.copy())
    df_ts = df_ts.reset_index(level='Regione')
    test_dates, train_dates = [], []

    # Dates inside `ranges_train_ts` are training dates, all others test dates.
    for date_val in df_ts.index.unique():
        if any(date_val in x for x in ranges_train_ts):
            train_dates.append(date_val)
        else:
            test_dates.append(date_val)

    # Keep only test dates after `divider_so` (defined outside this section).
    test_dates = [t for t in test_dates if t > divider_so]

    df_ts_train, df_ts_test = df_ts.loc[train_dates], df_ts.loc[test_dates]

    # One evaluation date every `farsightness` days across the test period.
    test_dates_every_n = pd.date_range(start=test_dates[0], end=test_dates[-1], freq="{}D".format(farsightness)).to_pydatetime()
    test_dates_region_every_n[region] = test_dates_every_n
    train_dates_region[region], test_dates_region[region] = train_dates, test_dates
    df_ts_test_region[region] = df_ts_test

    # Single-output booster: only the furthest target (`targets[-1]`) is learned.
    xgtrain = DMatrix(df_ts_train[features].values, df_ts_train[targets[-1]].values)
    # Continue from the region's existing booster when there is one, otherwise
    # start fresh (`xgb_model=None`). This replaces a bare `except:` that also
    # hid unrelated training errors by silently retraining from scratch.
    model = train(params, xgtrain, xgb_model=models_regions.get(region))
    models_regions[region] = model

# X = {mobility_t_1, mobility_t_2, mobility_t_3, }

In [None]:
'''
test_predictions = {}
test_targets = {}

accumulated_t = []
accumulated_predictions = []

for region in regions_to_train:
    test_predictions_region = np.array([])
    test_target_region = np.array([])
    test_dates = test_dates_region[region]
    df_ts_test = df_ts_test_region[region]
    print("{}: {}".format(region, df_ts_test.shape))
    for i, t in enumerate(test_dates):
        X_test_ts = df_ts_test.loc[t, features]
        X_test_ts.sort_index(inplace=True)
        
        predictions = models_regions[region].predict(DMatrix(X_test_ts.to_numpy().reshape(1, -1)))
        test_predictions_region = np.append(test_predictions_region, predictions)
        
        X_test_fit, y_test_fit = df_ts_test.loc[t, features], df_ts_test.loc[t, targets[-1]]
        test_target_region = np.append(test_target_region, y_test_fit)
        
        if i % farsightness == farsightness - 1 or i == len(test_dates) - 1:
            xgtrain = DMatrix(df_ts_test.loc[accumulated_t, features].values, df_ts_test.loc[accumulated_t, targets[-1]].to_numpy())
            models_regions[region] = train(params, xgtrain, xgb_model=models_regions[region])
            accumulated_t, accumulated_predictions = [], []
        else:
            accumulated_predictions.append(predictions)
            accumulated_t.append(t)
            
    test_predictions[region] = test_predictions_region
    test_targets[region] = test_target_region
'''

In [None]:
# Day-by-day evaluation of the per-region boosters: predict the furthest
# target for each test date, record it with the real value, then continue
# training the region's booster on that same row.
test_predictions = {}
test_targets = {}

for region in regions_to_train:
    test_predictions_region = np.array([])
    test_target_region = np.array([])
    test_dates = test_dates_region[region]
    df_ts_test = df_ts_test_region[region]
    print("{}: {}".format(region, df_ts_test.shape))
    for i, t in enumerate(test_dates):
        # Keep the features in `features` order, the column order the boosters
        # were trained with. The former `sort_index(inplace=True)` sorted the
        # feature names alphabetically and so fed values to the wrong inputs.
        X_test_ts = df_ts_test.loc[t, features]
        X_test_row = X_test_ts.to_numpy().reshape(1, -1)

        predictions = models_regions[region].predict(DMatrix(X_test_row))
        test_predictions_region = np.append(test_predictions_region, predictions)

        X_test_fit, y_test_fit = X_test_ts, df_ts_test.loc[t, targets[-1]]
        test_target_region = np.append(test_target_region, y_test_fit)

        # Update the booster with the row just scored (not needed after the last date).
        # NOTE(review): `targets[-1]` lies several rows after `t`, so this
        # update uses a value from later dates — confirm this is acceptable.
        if i < len(test_dates)-1:
            xgtrain = DMatrix(X_test_row, np.atleast_1d(y_test_fit))
            models_regions[region] = train(params, xgtrain, xgb_model=models_regions[region])

    test_predictions[region] = test_predictions_region
    test_targets[region] = test_target_region

In [None]:
# Per-region plot of predictions vs. targets; the title shows RMSE and R^2.
for region in regions_to_train:
    df_plot_region = pd.DataFrame({"predictions": test_predictions[region], "target": test_targets[region]})
    # scikit-learn metrics take (y_true, y_pred); the order matters for R^2.
    rmse = np.sqrt(MSE(df_plot_region['target'], df_plot_region['predictions']))
    r2 = r2_score(df_plot_region['target'], df_plot_region['predictions'])

    df_plot_region.plot(title="{}: {:.3f}, {:.3f}".format(region, rmse, r2))

In [None]:
# TODO integrate Abruzzo