In [1]:
import os
import itertools

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import random

from os import listdir
from os.path import isfile, join

plt.style.use('seaborn-white')

%matplotlib inline

from scipy.stats import gamma, poisson

import epyestim
import epyestim.covid19 as covid19
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE, r2_score
from xgboost import XGBRegressor, DMatrix, train
from sklearn.multioutput import MultiOutputRegressor

from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
from pykalman import KalmanFilter

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, TimeDistributed, RepeatVector
from keras.callbacks import EarlyStopping

import plotly.express as px
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

from functools import partial


to_sum_KPIs = ['totale_casi_giornalieri', 'terapia_intensiva_giornalieri', 'terapia_intensiva', 'nuovi_positivi', 'tamponi_giornalieri']
covidKPIsPrecompute = ['%pos']+to_sum_KPIs
trafficKPIsPrecompute = ['Handover', 'Download vol.', 'Upload vol.', '#Users']

# sums regions such as trento + bolzano
def sumRegions(df, dateCol = 'Date', regionCol='Regione', cols = to_sum_KPIs, region1 = "P.A. Bolzano", region2 = "P.A. Trento", regionNew = "Trentino-Alto Adige"):
    dfRegion1, dfRegion2 = df.loc[df[regionCol] == region1], df.loc[df[regionCol] == region2]
    dfRegion1.set_index(dateCol, inplace=True)
    dfRegion2.set_index(dateCol, inplace=True)
    newVals = dfRegion1[to_sum_KPIs]+dfRegion2[to_sum_KPIs]
    newVals.reset_index(inplace=True)
    newVals['Regione'] = regionNew
    df = df.loc[(df[regionCol] != region1) & (df[regionCol] != region2)]
    return df.append(newVals)

# adds italy as cumulative over days
def addItalyData(df, cols):
    dfTemp = df.resample('D', on='Date').sum().reset_index()
    dfTemp['Regione']='Italia'
    dfTemp = dfTemp[cols]
    return pd.concat([df, dfTemp])

def fill_with_areas(dateRange, fig, is_train):
    if is_train:
        color = 'rgba(255, 0, 0, 0.2)'
    else:
        color = 'rgba(0, 0, 255, 0.2)'
    fig.add_shape(type="rect",
        yref="paper",
        x0=dateRange[0], y0=0,
        x1=dateRange[-1], y1=1,
        line=dict(
            width=0,
        ),
        fillcolor=color,
    )
    
    return fig

data_path = "/Users/filipkrasniqi/Documents/Datasets.tmp/traffic-covid/"
by_region_path = "{}By_Region/".format(data_path)
saved = "{}saved/".format(data_path)
traffic_daily = "{}TS_1800_daily.pkl".format(saved)
region_traffic_daily = "{}all.pkl".format(saved)
covid = "{}covid/".format(data_path)
covid_daily = "{}covid_regioni.csv".format(covid)

# Preprocessing

## Handle temperature data

### Import

In [155]:
meteo_path = "{}meteo/".format(data_path)
dfs_filenames = [f for f in listdir(meteo_path) if isfile(join(meteo_path, f))]
dfs = []
path_temperature_predictions = "{}predictions/temperatures.csv".format(saved)
SAVE_TEMPERATURE = False
if SAVE_TEMPERATURE:
    for f in dfs_filenames:
        splits = f.split("_")
        if len(splits) == 2 and "." in splits[1]:
            filename = "{}{}".format(meteo_path, f)
            current_df = pd.read_csv(filename)
            region_name = splits[0]
            #if "rentino" not in region_name and "osta" not in region_name:
            month = splits[1][4:].split(".")[0]
            current_df['Regione'] = [r for r in regions_covid if region_name in r.lower()][0]
            current_df['month'] = int(month)
            current_df['year'] = int(2021 if "2021" in filename else 2020)
            dfs.append(current_df)
        df_temperature = pd.concat(dfs)
        df_temperature['Date'] = df_temperature.apply(lambda x: pd.to_datetime("{}/{}/{}".format(x.year, x.month, int(x.date.split(" ")[1]))), axis=1)
        df_temperature.set_index(['Date', 'Regione'], inplace=True)
        df_temperature['Date'] = pd.to_datetime(df_temperature['Date'])
        df_temperature.to_csv(path_temperature_predictions)
else:
    df_temperature = pd.read_csv(path_temperature_predictions)
    df_temperature['Date'] = pd.to_datetime(df_temperature['Date'])
    df_temperature.set_index(['Date', 'Regione'], inplace=True)
    
regions_temperature = df_temperature.index.get_level_values(1).unique()
regions = regions_temperature

## Handle COVID data

### Import

In [61]:
recompute_rt = False
import_covid  = False
path_covid = "{}covid.csv".format(saved)

if import_covid:
    df_covid = pd.read_csv(covid_daily)
    if "Regione" not in df_covid.columns:
        df_covid.rename(columns={'denominazione_regione': 'Regione'}, inplace=True)
        df_covid['tamponi_giornalieri'] = df_covid.groupby([
                        'Regione'])['tamponi'].diff()
        df_covid.loc[df_covid['tamponi_giornalieri'].isna() ,
                               'tamponi_giornalieri'] = df_covid['tamponi']


        df_covid['deceduti_giornalieri'] = df_covid.groupby([
                            'Regione'])['deceduti'].diff()
        df_covid.loc[df_covid['deceduti_giornalieri'].isna() ,
                               'deceduti_giornalieri'] = df_covid['deceduti']

        df_covid['terapia_intensiva_giornalieri'] = df_covid.groupby([
                            'Regione'])['terapia_intensiva'].diff()
        df_covid.loc[df_covid['terapia_intensiva_giornalieri'].isna() ,
                               'terapia_intensiva_giornalieri'] = df_covid['terapia_intensiva']

        df_covid['totale_casi_giornalieri'] = df_covid.groupby([
                            'Regione'])['totale_casi'].diff()
        df_covid.loc[df_covid['totale_casi_giornalieri'].isna() ,
                               'totale_casi_giornalieri'] = df_covid['totale_casi']
    covid_cols = ['Date', 'Regione', 'terapia_intensiva', 'nuovi_positivi', 'tamponi_giornalieri', 'totale_casi', 'deceduti', 'totale_casi_giornalieri', 'terapia_intensiva_giornalieri']

    df_covid.data = pd.to_datetime(df_covid.data)
    df_covid.rename(columns={'data': 'Date'}, inplace=True)
    df_covid = sumRegions(df_covid)
    regions_covid = df_covid['Regione'].unique()
    #df_covid = df_covid[df_covid['Regione'].isin(regions)].dropna()
    df_covid.to_csv(path_covid)
else:
    try:
        del df_covid
    except:
        print("No df covid")

No df covid


### Compute Rt

In [62]:
dfs = []
path_covid_predictions="{}predictions/covid.pkl".format(saved)
if recompute_rt:
    for r in regions:
        print("REGIONE: {}".format(r))
        current_df = df_covid.loc[df_covid['Regione'] == r]
        current_df['Date'] = pd.to_datetime(current_df['Date']).dt.date
        current_df['DateIndex'] = current_df.loc[:, 'Date']
        current_df.set_index('DateIndex', inplace=True)
        #current_df = current_df.loc[current_df['nuovi_positivi'] > 0]
        current_df = current_df.loc[pd.to_datetime('2020/03/01'):pd.to_datetime('2021/01/31')]
        idxs = (current_df['nuovi_positivi'] < 0)# | (current_df.isna()) | (current_df['nuovi_positivi'] == np.inf) | (current_df['nuovi_positivi'] == -np.inf)
        if idxs.sum() > 0:
            current_df.loc[idxs, 'nuovi_positivi'] = np.nan
        current_df.fillna(method='ffill', inplace=True)
        current_df.dropna(subset=['nuovi_positivi'], inplace=True)
        #current_df[current_df.loc[:, 'nuovi_positivi']]
        #current_df.dropna(subset=['nuovi_positivi'], inplace=True)
        #
        current_df = current_df.drop_duplicates(keep='first')
        #print(current_df['nuovi_positivi'].shape, current_df['nuovi_positivi'].apply(lambda x: x < 0).sum())
        #current_df.dropna(subset=['totale_casi_giornalieri'], inplace=True)
        #print(current_df['totale_casi_giornalieri'].isna().sum())
        #print(current_df['totale_casi_giornalieri'].sum())
        r_t_series = covid19.r_covid(current_df['nuovi_positivi'])
        current_df = pd.merge(current_df, r_t_series, left_index=True, right_index=True)
        dfs.append(current_df)
    df_covid_predictions = pd.concat(dfs)
    del dfs
    df_covid_predictions.set_index(['Date', 'Regione'], inplace=True)
    df_covid_predictions['%pos'] = (df_covid_predictions['nuovi_positivi']/df_covid_predictions['tamponi_giornalieri'])
    df_covid_predictions.to_pickle(path_covid_predictions)
else:
    df_covid_predictions = pd.read_pickle(path_covid_predictions)

## Handle traffic data

### Import

In [262]:
import_traffic = False
recompute_kalman = False
path_traffic = "{}traffic.csv".format(saved)
if import_traffic:
    df_traffic_daily = pd.read_pickle(region_traffic_daily)
    df_traffic_daily.loc[df_traffic_daily['Regione'] == "Emilia Romagna", "Regione"] = "Emilia-Romagna"
    df_traffic_predictions = df_traffic_daily.loc[df_traffic_daily['Regione'].isin(regions)]
    df_traffic_predictions = df_traffic_predictions.groupby('Regione').resample('D', on='Date').sum().reset_index()
    df_traffic_predictions = df_traffic_predictions.replace({'0':np.nan, 0:np.nan})
    df_traffic_predictions = df_traffic_predictions.fillna(method='ffill')
    df_traffic_predictions['Date'] = pd.to_datetime(df_traffic_predictions['Date']).dt.date
    df_traffic_predictions.set_index(['Date', 'Regione'], inplace=True)
    df_traffic_predictions.to_csv(path_traffic)
else:
    df_traffic_predictions = pd.read_csv(path_traffic)
    df_traffic_predictions['Date'] = pd.to_datetime(df_traffic_predictions['Date']).dt.date
    df_traffic_predictions.set_index(['Date', 'Regione'], inplace=True)

### Smoothen with Kalman filter

In [263]:
# apply Kalman Filter to traffic prediction
dict_kalman = {}
path_traffic_predictions="{}predictions/traffic.pkl".format(saved)
if recompute_kalman:
    for trafficKPI in trafficKPIsPrecompute:
        #current_df_kalman = pd.DataFrame({"{}_smoothened".format(trafficKPI): []})
        dfs_current_kpi = []
        for region in regions:
            kf = KalmanFilter(transition_matrices = [1],
                      observation_matrices = [1],
                      initial_state_mean = 0,
                      initial_state_covariance = 1,
                      observation_covariance=1,
                      transition_covariance=.05)

            series = df_traffic_predictions.xs(region, level=1)[trafficKPI]

            kf = kf.em(series)
            (smoothened, smoothed_state_covariances) = kf.smooth(series)
            df_region_kpi = pd.DataFrame({"noisy": series})
            df_region_kpi['smooth'] = smoothened.squeeze()
            df_region_kpi['Regione'] = region
            df_region_kpi.reset_index(inplace=True)
            df_region_kpi.set_index(['Date', 'Regione'], inplace=True)
            dfs_current_kpi.append(df_region_kpi)

            dict_kalman["{}_{}".format(trafficKPI, region)] = kf

        df_traffic_predictions["{}_smoothened".format(trafficKPI)] = pd.concat(dfs_current_kpi)['smooth']
        df_traffic_predictions.to_pickle(path_traffic_predictions)
else:
    df_traffic_predictions = pd.read_pickle(path_traffic_predictions)

# Forecasting

## Define KPIs

In [309]:
trafficKPIs = [col for col in df_traffic_predictions.columns if "smooth" in col]
covidKPIs = []#[col for col in df_covid_predictions.columns if "mean" in col]
temperatureKPIs = []#[col for col in df_temperature.columns if "min" in col]
targetCovid = ['R_mean']

In [310]:
def build_df_prediction(range_dates, min_farsightness, farsightness, delta_features, step_target):
    lags = range(delta_features)
    lags_target = range(min_farsightness, farsightness, step_target)
    all_dfs = []
    # prima ondata
    for region in regions_to_train:
        # filter ts by region
        df_traffic_ts = df_traffic_predictions.loc[(df_traffic_predictions.index.get_level_values(1)==region), trafficKPIs].copy()
        df_covid_ts = df_covid_predictions.loc[df_covid_predictions.index.get_level_values(1)==region, list(set(covidKPIs+targetCovid))].copy()
        df_temperature_ts = df_temperature.loc[df_temperature.index.get_level_values(1)==region, temperatureKPIs].copy()
        
        df_traffic_ts = df_traffic_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))
        df_temperature_ts = df_temperature_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))

        df_covid_ts.reset_index(inplace=True)
        df_temperature_ts.reset_index(inplace=True)
        df_traffic_ts.reset_index(inplace=True)

        df_covid_ts = df_covid_ts.set_index('Date')
        df_temperature_ts = df_temperature_ts.set_index('Date')
        df_traffic_ts = df_traffic_ts.set_index('Date')

        df_ts = pd.DataFrame()
        features = []
        targets = []
        target_col = targetCovid[0]
        df_target_ts = df_covid_ts.copy()

        train_dates_intersection = df_traffic_ts.index.intersection(df_covid_ts.index)

        train_dates = []
        if len(temperatureKPIs) > 0:
            train_dates_intersection = train_dates_intersection.isin(df_temperature_ts.index)
        for date_val in train_dates_intersection:
            if any(date_val in x for x in range_dates):
                train_dates.append(date_val)

        train_dates = pd.to_datetime(train_dates)

        df_covid_ts, df_traffic_ts, df_temperature_ts = df_covid_ts.loc[df_covid_ts.index.isin(train_dates)], df_traffic_ts.loc[df_traffic_ts.index.isin(train_dates)], df_temperature_ts.loc[df_temperature_ts.index.isin(train_dates)]

        for lag in lags_target:
            target = "target_{}".format(lag)
            targets.append(target)
            df_ts[target] = df_target_ts.copy().shift(-1*lag)[target_col]
        
        # use also today feature
        use_today_feature = True
        if use_today_feature:
            if len(trafficKPIs) > 0:
                df_ts[trafficKPIs] = df_traffic_ts[trafficKPIs].copy()
                features += trafficKPIs
            if len(covidKPIs) > 0:
                df_ts[covidKPIs] = df_covid_ts[covidKPIs].copy()
                features += covidKPIs
        
        for lag in lags:
            lag_shift = lag+1
            for col in trafficKPIs:
                feature = "{}_{}".format(col, lag_shift)
                #print(feature, df_traffic_ts.shift(lag_shift).loc[:, col])
                df_ts[feature] = df_traffic_ts.copy().shift(lag_shift).loc[:, col]
                features.append(feature)
            for col in covidKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_covid_ts.copy().shift(lag_shift)[col]
                features.append(feature)
            for col in temperatureKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_temperature_ts.copy().shift(lag_shift)[col]
                features.append(feature)

        df_ts = df_ts[targets+features]
        df_ts.dropna(subset=features, inplace=True)
        df_ts['Regione'] = region
        df_ts = df_ts.reset_index().set_index(['Date', 'Regione'])
        all_dfs.append(df_ts.copy())
    return pd.concat(all_dfs), targets, features, lags, lags_target

In [330]:
divider_po, divider_so = pd.to_datetime('2020-07-01'), pd.to_datetime('2020-10-01')
start_train_po, end_train_so = pd.to_datetime('2020-03-01'), pd.to_datetime('2020-10-15')
start_po = start_train_po
ranges_train_PO = [pd.date_range(start_po, divider_po)]
ranges_train_SO = [pd.date_range(divider_so, end_train_so)]
last_date = min(df_traffic_predictions.index.get_level_values(0).max(), df_covid_predictions.index.get_level_values(0).max(), df_temperature.index.get_level_values(0).max())
ranges_so = [pd.date_range(divider_so, last_date)]
regions_to_train = regions

min_farsightness = 1
farsightness = 20
delta_features = 7
step_target = 5

ranges_test_SO = [pd.date_range(end_train_so-pd.Timedelta(days=delta_features), last_date)]

(df_train_prediction_PO, targets, features, lags, lags_target), (df_train_prediction_SO, _, _, _, _) = \
    build_df_prediction(ranges_train_PO, min_farsightness, farsightness, delta_features, step_target), \
    build_df_prediction(ranges_train_SO, min_farsightness, farsightness, delta_features, step_target)

(df_test_prediction, _, _, _, _) = build_df_prediction(ranges_test_SO, min_farsightness, farsightness, delta_features, step_target)
df_train_prediction = pd.concat([df_train_prediction_PO, df_train_prediction_SO])

In [331]:
params = {'objective': 'reg:squarederror'}

is_xgb = False
is_polynomial = False
is_lstm = True

features_without_lag = trafficKPIs+covidKPIs+temperatureKPIs
num_features_t = len(features_without_lag)
n_timesteps = len(lags)+1

def build_lstm_3(n_timesteps, n_features):
    model = Sequential()
    model.add(LSTM(128, activation='relu', input_shape=(n_timesteps, n_features)))
    model.add(RepeatVector(1))
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(32, activation='relu')))
    model.add(TimeDistributed(Dense(1)))
    model.compile(loss='mse', optimizer='adam')
    return model

models_constructors = {
    "Poly2": partial(make_pipeline, PolynomialFeatures(2), Ridge()),
    "Poly3": partial(make_pipeline, PolynomialFeatures(3), Ridge()),
    "RandomForest": RandomForestRegressor,
    "XGBoost": XGBRegressor,
    "EncDecLSTM": partial(build_lstm_3, n_timesteps, num_features_t)
}

def train_in_interval(interval):
    models_regions = {m: {r:{} for r in regions_to_train} for m in models_constructors.keys()}
    for region in regions_to_train:
        df_ts = df_train_prediction.loc[df_train_prediction.index.get_level_values(1)==region]
        df_ts = df_ts.reset_index().set_index('Date').drop(columns='Regione')
        df_ts = df_ts.loc[(df_ts.index>=interval[0])&(df_ts.index < interval[1])]
        for lag in lags_target:
            target = "target_{}".format(lag)
            df_ts_lag = df_ts.copy().drop(columns=[col for col in targets if (col not in features) and (col != target)]).dropna()
            for model_name in models_constructors.keys():
                print("{} for {} with lag {}: {} -> {}".format(model_name, region, lag, min(interval), max(interval)))
                is_lstm = "lstm" in model_name.lower()
                model = models_constructors[model_name]()
                if is_lstm:
                    #model = models_constructors[model_name](n_timesteps, num_features_t)
                    callback = EarlyStopping(monitor='loss', patience=3)
                    # N x T x F
                    lstm_input = df_ts_lag[features].to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
                    model.fit(lstm_input, df_ts_lag[target], epochs=100, batch_size=32, verbose=0, callbacks=[callback])
                    models_regions[model_name][region][lag] = model
                else:
                    #model = models_constructors[model_name]()
                    model.fit(df_ts_lag[features].values, df_ts_lag[target])
                    models_regions[model_name][region][lag] = model
    return models_regions

do_train = True

if do_train:

    #models_regions = train_in_interval((start_train_po, divider_po), init_from_scratch=True)
    models_regions = train_in_interval((start_train_po, end_train_so))

Poly2 for Trentino-Alto Adige with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Trentino-Alto Adige with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
RandomForest for Trentino-Alto Adige with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
XGBoost for Trentino-Alto Adige with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
EncDecLSTM for Trentino-Alto Adige with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly2 for Trentino-Alto Adige with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Trentino-Alto Adige with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
RandomForest for Trentino-Alto Adige with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
XGBoost for Trentino-Alto Adige with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
EncDecLSTM for Trentino-Alto Adige with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly2 for Trentino-Alto Adige with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Trentino-Alto Adige with l

XGBoost for Abruzzo with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
EncDecLSTM for Abruzzo with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly2 for Abruzzo with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Abruzzo with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
RandomForest for Abruzzo with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
XGBoost for Abruzzo with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
EncDecLSTM for Abruzzo with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly2 for Abruzzo with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Abruzzo with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
RandomForest for Abruzzo with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
XGBoost for Abruzzo with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
EncDecLSTM for Abruzzo with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly2 for Abruzzo with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for 

Poly2 for Liguria with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Liguria with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
RandomForest for Liguria with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
XGBoost for Liguria with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
EncDecLSTM for Liguria with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly2 for Liguria with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Liguria with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
RandomForest for Liguria with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
XGBoost for Liguria with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
EncDecLSTM for Liguria with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly2 for Friuli Venezia Giulia with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Friuli Venezia Giulia with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
RandomForest for Friuli Venezia Giulia with lag 1: 2020-03

XGBoost for Sardegna with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
EncDecLSTM for Sardegna with lag 11: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly2 for Sardegna with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Sardegna with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
RandomForest for Sardegna with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
XGBoost for Sardegna with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
EncDecLSTM for Sardegna with lag 16: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly2 for Lazio with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Lazio with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
RandomForest for Lazio with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
XGBoost for Lazio with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
EncDecLSTM for Lazio with lag 1: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly2 for Lazio with lag 6: 2020-03-01 00:00:00 -> 2020-10-15 00:00:00
Poly3 for Lazi

In [332]:
train_dates_region_every_n = {}
test_dates_region_every_n = {}
train_dates_region = {}
test_dates_region = {}
df_ts_test_region = {}

def test_in_interval(interval):
    print("TEST START")
                
    df_results = pd.DataFrame() # lag, region, prediction, target; index = date
    results_dict = []

    for region in regions_to_train:
        df_ts = df_test_prediction.loc[df_test_prediction.index.get_level_values(1)==region]
        df_ts = df_ts.reset_index().set_index('Date').drop(columns='Regione')
        df_ts = df_ts.loc[(df_ts.index>=interval[0])&(df_ts.index < interval[1])]
        first_date, last_date = df_ts.index.min(), df_ts.index.max()
        current_region_values = []
        
        test_dates_region[region] = pd.date_range(first_date, last_date)
        df_ts_test_region[region] = df_ts
        test_dates = test_dates_region[region].unique()
        assert test_dates_region[region].shape[0]==len(test_dates), "Something wrong"
        for idx_lag, lag in enumerate(lags_target):
            for model_name in models_constructors.keys():
                target_col = targets[idx_lag]
                #walk_forward_df = df_train_prediction.copy().drop(columns=[col for col in targets if (col not in features) and (col != target_col)]).dropna()
                
                current_df_train_prediction = df_train_prediction.copy().xs(region, level='Regione')
                walk_forward_df = current_df_train_prediction.drop(columns=[col for col in targets if (col not in features) and (col != target_col)]).dropna()
            
                current_df_ts = df_ts[features+[target_col]].copy().dropna()
                test_dates = current_df_ts.index
                print("{} for {} with lag = {}: {} -> {}, {}".format(model_name, region, lag, min(test_dates), max(test_dates), current_df_ts.shape))
                for i, t in enumerate(test_dates):
                    current_df_ts = df_ts.loc[t:t+datetime.timedelta(days=0)]
                    X_test_ts = current_df_ts[features]
                    
                    is_lstm = "lstm" in model_name.lower()
                    
                    if is_lstm:
                        lstm_input = X_test_ts.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
                        predictions = models_regions[model_name][region][lag].predict(lstm_input).flatten()
                    else:
                        predictions = models_regions[model_name][region][lag].predict(X_test_ts.values)

                    X_test_fit, y_test_fit = X_test_ts, current_df_ts[target_col].values

                    #current_result = {"date": t + datetime.timedelta(days=lag), "lag": lag, "region": region, "prediction": predictions[0], "target": y_test_fit}
                    current_result = {"model": model_name,"date": t + datetime.timedelta(days=lag), "lag": lag, "region": region, "prediction": predictions[0], "target": y_test_fit[0]}
                    results_dict.append(current_result)

                    walk_forward_df = walk_forward_df.append(current_df_ts)
                    walk_forward_df = walk_forward_df.sort_index()
                    window_train = 45
                    if is_lstm:
                        callback = EarlyStopping(monitor='loss', patience=3)
                        window_train = 32
                        # I keep the last model and update weights using a window (last 30 values)
                        model = models_regions[model_name][region][lag]
                        lstm_input = walk_forward_df[features].to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
                        lstm_input = lstm_input[-1*window_train:]
                        lstm_target = walk_forward_df[target_col].values[-1*window_train:]
                        model.fit(lstm_input, lstm_target, epochs=10, batch_size=8, verbose=0, callbacks=[callback])
                        models_regions[model_name][region][lag] = model
                    else:
                        use_window = True
                        if use_window:
                            window_walk_forward_df = walk_forward_df.iloc[-1*window_train:]
                            features_vals, targets_vals = window_walk_forward_df[features], window_walk_forward_df[target_col]
                            #features_vals, targets_vals = walk_forward_df[features].iloc[-1*window_train:], walk_forward_df[target_col].iloc[-1*window_train:]
                            #print(features_vals.shape, targets_vals.shape)
                            models_regions[model_name][region][lag] = models_constructors[model_name]()
                            models_regions[model_name][region][lag].fit(features_vals.values, targets_vals)
                        else:
                            # here I should do the same as before, for now not trying because I suppose it's better (to more enphasize recent behaviour)
                            pass
    df_results = pd.DataFrame(results_dict)
    df_results = df_results.dropna()
    df_results.set_index(['model', 'date', 'region', 'lag'], inplace=True)
    df_results['error']=(df_results['prediction']-df_results['target']).abs()
    df_results['error_2'] = df_results['error']**2
    return models_regions, df_results

min_date, max_date = df_test_prediction.index.get_level_values(0).min(), df_test_prediction.index.get_level_values(0).max()
if do_train:
    models_regions, df_results = test_in_interval((min_date, max_date))
else:
    path_results = "{}predictions/results_v3.csv".format(saved)
    df_results = pd.read_csv(path_results)
    df_results['date'] = pd.to_datetime(df_results['date'])
    df_results.set_index(['model', 'date', 'region', 'lag'], inplace=True)

TEST START
Poly2 for Trentino-Alto Adige with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
Poly3 for Trentino-Alto Adige with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
RandomForest for Trentino-Alto Adige with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
XGBoost for Trentino-Alto Adige with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
EncDecLSTM for Trentino-Alto Adige with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
Poly2 for Trentino-Alto Adige with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
Poly3 for Trentino-Alto Adige with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
RandomForest for Trentino-Alto Adige with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
XGBoost for Trentino-Alto Adige with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
EncDecLSTM for Trentino-Alto Adige with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, 

EncDecLSTM for Lombardia with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
Poly2 for Lombardia with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
Poly3 for Lombardia with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
RandomForest for Lombardia with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
XGBoost for Lombardia with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
EncDecLSTM for Lombardia with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
Poly2 for Lombardia with lag = 16: 2020-10-15 00:00:00 -> 2021-01-05 00:00:00, (83, 33)
Poly3 for Lombardia with lag = 16: 2020-10-15 00:00:00 -> 2021-01-05 00:00:00, (83, 33)
RandomForest for Lombardia with lag = 16: 2020-10-15 00:00:00 -> 2021-01-05 00:00:00, (83, 33)
XGBoost for Lombardia with lag = 16: 2020-10-15 00:00:00 -> 2021-01-05 00:00:00, (83, 33)
EncDecLSTM for Lombardia with lag = 16: 2020-10-15 00:00:00 -> 2021-01-05 00:00:00, (83, 33)


Poly3 for Calabria with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
RandomForest for Calabria with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
XGBoost for Calabria with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
EncDecLSTM for Calabria with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
Poly2 for Calabria with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
Poly3 for Calabria with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
RandomForest for Calabria with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
XGBoost for Calabria with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
EncDecLSTM for Calabria with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
Poly2 for Calabria with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
Poly3 for Calabria with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
RandomForest for Calabri

Poly3 for Marche with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
RandomForest for Marche with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
XGBoost for Marche with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
EncDecLSTM for Marche with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
Poly2 for Marche with lag = 16: 2020-10-15 00:00:00 -> 2021-01-05 00:00:00, (83, 33)
Poly3 for Marche with lag = 16: 2020-10-15 00:00:00 -> 2021-01-05 00:00:00, (83, 33)
RandomForest for Marche with lag = 16: 2020-10-15 00:00:00 -> 2021-01-05 00:00:00, (83, 33)
XGBoost for Marche with lag = 16: 2020-10-15 00:00:00 -> 2021-01-05 00:00:00, (83, 33)
EncDecLSTM for Marche with lag = 16: 2020-10-15 00:00:00 -> 2021-01-05 00:00:00, (83, 33)
Poly2 for Basilicata with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
Poly3 for Basilicata with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
RandomForest for Basilicata wit

EncDecLSTM for Piemonte with lag = 1: 2020-10-15 00:00:00 -> 2021-01-20 00:00:00, (98, 33)
Poly2 for Piemonte with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
Poly3 for Piemonte with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
RandomForest for Piemonte with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
XGBoost for Piemonte with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
EncDecLSTM for Piemonte with lag = 6: 2020-10-15 00:00:00 -> 2021-01-15 00:00:00, (93, 33)
Poly2 for Piemonte with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
Poly3 for Piemonte with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
RandomForest for Piemonte with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
XGBoost for Piemonte with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
EncDecLSTM for Piemonte with lag = 11: 2020-10-15 00:00:00 -> 2021-01-10 00:00:00, (88, 33)
Poly2 for Piemon

In [333]:
def build_df_results_groupped(df_results, col_prediction = 'prediction', col_error = 'error_2', col_rmse = 'rmse', col_r2 = 'r2'):
    try:
        groupped_df = df_results.groupby(level=['model', 'region', 'lag'])
    except:
        print("WARNING: not groupped")
        groupped_df = df_results
    df = pd.DataFrame()
    df[col_rmse] = np.sqrt(groupped_df[col_error].mean())
    df[col_r2]=groupped_df.apply(lambda g: r2_score( g[col_prediction], g['target'] ))
    return df[[col_rmse, col_r2]]

df_results_mean = build_df_results_groupped(df_results)

In [334]:
df_results_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rmse,r2
model,region,lag,Unnamed: 3_level_1,Unnamed: 4_level_1
EncDecLSTM,Abruzzo,1,0.082081,0.738168
EncDecLSTM,Abruzzo,6,0.069926,0.729615
EncDecLSTM,Abruzzo,11,0.060378,0.822002
EncDecLSTM,Abruzzo,16,0.075288,0.713871
EncDecLSTM,Basilicata,1,0.115593,0.758348
...,...,...,...,...
XGBoost,Valle d'Aosta,16,0.123522,0.189415
XGBoost,Veneto,1,0.067896,0.836306
XGBoost,Veneto,6,0.055720,0.873300
XGBoost,Veneto,11,0.035857,0.939332


In [335]:
#df_results_smoothened = df_results.xs('Lombardia', level=1).xs(17, level=1).copy()
covariance_alternatives = [.001, .005, .0005]

def smooth(series, transition_covariance = .0005):
    kf = KalmanFilter(transition_matrices = [1],
                          observation_matrices = [1],
                          initial_state_mean = 0,
                          initial_state_covariance = 1,
                          observation_covariance=1,
                          transition_covariance=transition_covariance)
    kf = kf.em(series)
    (smoothened, smoothed_state_covariances) = kf.smooth(series)
    return smoothened.squeeze()

groupped_df = df_results['prediction'].groupby(level=['model', 'region', 'lag'])
for cov in covariance_alternatives:
    prefix = "kf_{}_".format(cov)
    if do_train:
        df_results['{}prediction'.format(prefix)] = groupped_df.transform(lambda x: smooth(x, cov))
        df_results['{}error'.format(prefix)]=(df_results['{}prediction'.format(prefix)]-df_results['target']).abs()
        df_results['{}error_2'.format(prefix)] = df_results['{}error'.format(prefix)]**2
    df_results_mean = pd.concat([df_results_mean, \
                        build_df_results_groupped(df_results, '{}prediction'.format(prefix), '{}error_2'.format(prefix), '{}rmse'.format(prefix), '{}r2'.format(prefix) )], axis=1)

In [336]:
name_current_save = 'results_v7_no_traffic' # non cambia, ma devo guardare come retraina LSTM
path_results_to_save = "{}predictions/{}".format(saved, name_current_save)
path_results_to_mean_save = "{}_mean".format(path_results_to_save)
path_results_to_cov_save = "{}_covariances".format(path_results_to_save)
df_results.to_csv("{}.csv".format(path_results_to_save))
print("OK SAVED!!!")

OK SAVED!!!


In [337]:
best_covariance = {}

for region in regions_to_train:
    for lag in lags_target:
        for model_name in models_constructors.keys():
            current_df = df_results_mean.xs(model_name, level=0).xs(region, level=0).loc[lag]
            errors = []
            for cov in covariance_alternatives:
                prefix = "kf_{}_".format(cov)
                err_2 = current_df['{}rmse'.format(prefix)].mean()
                errors.append(err_2)
            argmin_err = np.argmin(errors)
            best_covariance["{}_{}_{}".format(model_name, region, lag)] = (errors[argmin_err], covariance_alternatives[argmin_err])

In [338]:
best_covariance

{'Poly2_Trentino-Alto Adige_1': (0.06635071378783318, 0.0005),
 'Poly3_Trentino-Alto Adige_1': (0.043732673090005586, 0.0005),
 'RandomForest_Trentino-Alto Adige_1': (0.05327948185663372, 0.005),
 'XGBoost_Trentino-Alto Adige_1': (0.04938979004129613, 0.0005),
 'EncDecLSTM_Trentino-Alto Adige_1': (0.06457171519593786, 0.0005),
 'Poly2_Trentino-Alto Adige_6': (0.053782289446206784, 0.0005),
 'Poly3_Trentino-Alto Adige_6': (0.04170944310661626, 0.0005),
 'RandomForest_Trentino-Alto Adige_6': (0.0611451817635061, 0.005),
 'XGBoost_Trentino-Alto Adige_6': (0.05884388955557111, 0.0005),
 'EncDecLSTM_Trentino-Alto Adige_6': (0.07081708197028798, 0.0005),
 'Poly2_Trentino-Alto Adige_11': (0.0586433211936915, 0.005),
 'Poly3_Trentino-Alto Adige_11': (0.036830197971682896, 0.0005),
 'RandomForest_Trentino-Alto Adige_11': (0.09212624242359478, 0.005),
 'XGBoost_Trentino-Alto Adige_11': (0.08330072236321796, 0.0005),
 'EncDecLSTM_Trentino-Alto Adige_11': (0.09627926882263493, 0.0005),
 'Poly2_Tre

In [339]:
import pickle
with open('{}.pkl'.format(path_results_to_cov_save), 'wb') as handle:
    pickle.dump(best_covariance, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [340]:
values_best = []
for region in regions_to_train:
    for lag in lags_target:
        for model_name in models_constructors.keys():
            current_df = df_results.xs(model_name, level=0).xs(region, level=1).xs(lag, level=1)
            cov = best_covariance["{}_{}_{}".format(model_name, region, lag)][1]

            prefix = "kf_{}_".format(cov)

            best_prediction_values = current_df["{}prediction".format(prefix)]
            rmse_val, r2_val = np.sqrt(MSE(best_prediction_values, current_df['target'])), r2_score(best_prediction_values, current_df['target'])
            values_best.append({'model': model_name, 'region': region, 'lag': lag, 'best_r2': r2_val, 'best_rmse': rmse_val})

df_best = pd.DataFrame(values_best)
df_best.set_index(['model', 'region', 'lag'], inplace=True)
try:
    df_results_mean = df_results_mean.drop(columns=['best_r2', 'best_rmse'])
except:
    pass
df_results_mean = pd.concat([df_results_mean, df_best], axis=1)

In [341]:
df_results_mean.to_csv("{}.csv".format(path_results_to_mean_save))

In [342]:
best_covariance

{'Poly2_Trentino-Alto Adige_1': (0.06635071378783318, 0.0005),
 'Poly3_Trentino-Alto Adige_1': (0.043732673090005586, 0.0005),
 'RandomForest_Trentino-Alto Adige_1': (0.05327948185663372, 0.005),
 'XGBoost_Trentino-Alto Adige_1': (0.04938979004129613, 0.0005),
 'EncDecLSTM_Trentino-Alto Adige_1': (0.06457171519593786, 0.0005),
 'Poly2_Trentino-Alto Adige_6': (0.053782289446206784, 0.0005),
 'Poly3_Trentino-Alto Adige_6': (0.04170944310661626, 0.0005),
 'RandomForest_Trentino-Alto Adige_6': (0.0611451817635061, 0.005),
 'XGBoost_Trentino-Alto Adige_6': (0.05884388955557111, 0.0005),
 'EncDecLSTM_Trentino-Alto Adige_6': (0.07081708197028798, 0.0005),
 'Poly2_Trentino-Alto Adige_11': (0.0586433211936915, 0.005),
 'Poly3_Trentino-Alto Adige_11': (0.036830197971682896, 0.0005),
 'RandomForest_Trentino-Alto Adige_11': (0.09212624242359478, 0.005),
 'XGBoost_Trentino-Alto Adige_11': (0.08330072236321796, 0.0005),
 'EncDecLSTM_Trentino-Alto Adige_11': (0.09627926882263493, 0.0005),
 'Poly2_Tre

In [343]:
name = "ModelValidation"
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app_validation_models = JupyterDash(name)

app_validation_models.layout = html.Div([
html.Label(
    [
        "Model",
        dcc.Dropdown(id="models",
                     options=[{"label": x, "value": x} for x in models_constructors.keys()],
                    value=[list(models_constructors.keys())[0]],
                    multi=True,
                    clearable=True)
    ]),
html.Label(
    [
        "Regione",
        dcc.Dropdown(id="regions",
                     options=[{"label": x, "value": x} for x in regions_to_train],
                    value=[regions_to_train[0]],
                    multi=True,
                    clearable=True)
    ]),
html.Label(
    [
        "Apply kalman in output",
        dcc.Dropdown(id="do_kalman",
                     options=[{"label": x, "value": x} for x in ["Yes", "No"]],
                    value="No")
    ]),

html.Label(
    [
        "Lag",
        dcc.Dropdown(id="lags",
                     options=[{"label": x, "value": x} for x in lags_target],
                    value=lags_target[-1],
                    clearable=True)
    ]),
html.Div(dcc.Graph(id=name))])

@app_validation_models.callback(
Output(name, "figure"), 
[Input("models", "value"), Input("regions", "value"), Input("lags", "value"), Input("do_kalman", "value")])
def display_model_validation(models, regions, lag, do_kalman):
    
    if isinstance(regions, str):
        regions = [regions]

    if regions is None:
        regions = [regions_to_train[0]]
        
    if isinstance(models, str):
        models = [models]

    if models is None:
        models = [models[0]]
        
    do_kalman = do_kalman=="Yes"

    R_colors = px.colors.qualitative.Dark24#['#FF0000', '#f56342', '#f57e42', '#f59642', '#f5a742', '#F6DE43', '#EF42D8', '#FA5F8D', '#D0114A']
    y_hat_colors = px.colors.qualitative.Alphabet#['#0000FF', '#0062ff', '#008cff', '#00b3ff', '#00b7e0', '#00F572', '#01CA5F', '#02DEB2', '#011ABC']
    fig = make_subplots(specs=[[{"secondary_y": False}]])
    #df_plot = df_results.reset_index()
    df_plot = df_results.loc[(df_results.index.get_level_values('model').isin(models))&(df_results.index.get_level_values('region').isin(regions))&(df_results.index.get_level_values('lag')==lag)]
    #df_plot = df_plot.reset_index().set_index('date')
    
    delta = .05
    col_prediction = "prediction"# if not do_kalman else "kf_prediction"
    col_error = "error"# if not do_kalman else "kf_error"
    
    
    #min_y, max_y = min(df_plot[col_prediction].min()-delta, df_plot['target'].min()-delta), max(df_plot[col_prediction].max()+delta, df_plot['target'].max()+delta)
    for i, r in enumerate(regions):
        for j, m in enumerate(models):
            cov = best_covariance["{}_{}_{}".format(m, region, lag)][1]

            prefix = "kf_{}_".format(cov)

            col_prediction = "prediction" if not do_kalman else "{}prediction".format(prefix)

            current_df_results = df_plot.loc[(m, slice(None), r, lag)]
            current_df_results = current_df_results.reset_index().set_index('date')
            predictions = current_df_results[col_prediction]
            targets = current_df_results['target'].loc[predictions.index]
            fig.add_trace(
                        go.Scatter(
                            x=predictions.index,
                            y=predictions,
                            name="Prediction - {}".format(r),
                            marker=dict(
                                color=y_hat_colors[i]
                            )
                        )
                    )
            fig.add_trace(
                    go.Scatter(
                        x=targets.index,
                        y=targets,
                        name="Target - {}".format(r),
                        marker=dict(
                            color=R_colors[i]
                        )
                    )
                )
    
    fig.update_layout(
        title_text="LALA"
    )
    '''
    
    
    for dateRange in ranges_train:
        fig = fill_with_areas(dateRange, fig, True)
    '''
    #for dateRange in ranges_test:
    #    fig = fill_with_areas(dateRange, fig, False)
    #fig.update_layout(yaxis_range=[min_y, max_y])
    return fig

#app_timeseries = build_app_timeseries(df_traffic_daily_SO, df_covid_SO)
app_validation_models.run_server(mode='inline', port=21456)

In [212]:
# TODO sto trainando corretto? perchè shifta in avanti la parte sinistra, dovrebbe essere che si toglie quella a destra...

In [325]:
subplot_titles = []
models_rename = {'Poly2': 'Poly2','Poly3': 'Poly3','RandomForest': 'RF','XGBoost': 'XGB','EncDecLSTM': 'DL'}
for idx_model, model_name in enumerate(models_constructors.keys()):
    for idx_lag, lag in enumerate(lags_target):
        subplot_titles.append("{}, {}".format(models_rename[model_name], lag))
fig = make_subplots(rows=len(models_constructors), cols=len(lags_target), shared_xaxes=True, shared_yaxes=True, subplot_titles=subplot_titles)
regions_to_sample = df_results.index.get_level_values('region').unique()
random_region = regions_to_sample[random.randint(0,len(regions_to_sample))]
for idx_lag, lag in enumerate(lags_target):
    for idx_model, model_name in enumerate(models_constructors.keys()):
        current_df_results = df_results.xs(model_name, level=0).xs(random_region, level=1).xs(lag, level=1).reset_index().set_index('date')
        idxs = current_df_results.index
        target_vals, predictions_vals = current_df_results['target'], current_df_results['prediction']
        show_legend = idx_lag == 0 and idx_model == 0
        fig.add_trace(
            go.Scatter(x=idxs, y=target_vals,
                name="Target",
                marker=dict(
                    color='#FF0000'
                ), 
                showlegend=show_legend
            ),
            row=idx_model+1, col=idx_lag+1
        )
        fig.add_trace(
            go.Scatter(x=idxs, y=predictions_vals,
                name="Prediction",
                marker=dict(
                    color='#0000FF'
                ),
                showlegend=show_legend
            ),
            row=idx_model+1, col=idx_lag+1
        )
fig.update_yaxes(range=[0.5, 1.5])
fig.update_layout(height=800, width=1200)
fig.show()

In [326]:
subplot_titles = []
models_rename = {'Poly2': 'Poly2','Poly3': 'Poly3','RandomForest': 'RF','XGBoost': 'XGB','EncDecLSTM': 'DL'}
for idx_model, model_name in enumerate(models_constructors.keys()):
    for idx_lag, lag in enumerate(lags_target):
        subplot_titles.append("{}, {}".format(models_rename[model_name], lag))
fig = make_subplots(rows=len(models_constructors), cols=len(lags_target), shared_xaxes=True, shared_yaxes=True, subplot_titles=subplot_titles)

for idx_lag, lag in enumerate(lags_target):
    for idx_model, model_name in enumerate(models_constructors.keys()):
        
        cov = best_covariance["{}_{}_{}".format(model_name, region, lag)][1]
        prefix = "kf_{}_".format(cov)
        col_prediction = "{}prediction".format(prefix)
        
        current_df_results = df_results.xs(model_name, level=0).xs(random_region, level=1).xs(lag, level=1).reset_index().set_index('date')
        idxs = current_df_results.index
        target_vals, predictions_vals = current_df_results['target'], current_df_results[col_prediction]
        show_legend = idx_lag == 0 and idx_model == 0
        
        fig.add_trace(
            go.Scatter(x=idxs, y=target_vals,
                name="Target",
                marker=dict(
                    color='#FF0000'
                ), 
                showlegend=show_legend
            ),
            row=idx_model+1, col=idx_lag+1
        )
        fig.add_trace(
            go.Scatter(x=idxs, y=predictions_vals,
                name="Prediction",
                marker=dict(
                    color='#0000FF'
                ),
                showlegend=show_legend
            ),
            row=idx_model+1, col=idx_lag+1
        )
fig.update_yaxes(range=[0.5, 1.5])
fig.update_layout(height=800, width=1200)
fig.show()

In [327]:
# scatterplot of rmse
subplot_titles = [m for m in models_constructors.keys()]
#fig = make_subplots(specs=[{"secondary_y": True}], rows=len(models_constructors), shared_xaxes=True, shared_yaxes=True, subplot_titles=subplot_titles)
#fig=make_subplots(rows=len(models_constructors), shared_xaxes=True, shared_yaxes=True, subplot_titles=subplot_titles, specs=[[{"secondary_y": True}]])

specs=[[{"secondary_y": True}], [{"secondary_y": True}], [{"secondary_y": True}], [{"secondary_y": True}], [{"secondary_y": True}]]

fig = make_subplots(specs=specs, rows=len(models_constructors), subplot_titles=subplot_titles)
for idx_model, model_name in enumerate(models_constructors):
    df_plot = df_results_mean.xs(model_name, level=0).xs(random_region, level=0).reset_index().set_index('lag')
    col_rmse, col_r2 = 'rmse', 'r2'
    rmse_vals, r2_vals = df_plot[col_rmse], df_plot[col_r2]
    
    fig.add_trace(go.Scatter(x=rmse_vals.index, y=rmse_vals, name="{} RMSE".format(model_name)),
        row=idx_model+1, col=1)
    
    fig.add_trace(
            go.Scatter(
            x=r2_vals.index, y=r2_vals, name="{} R2".format(model_name)
        ),
        secondary_y=True,
        row=idx_model+1, col=1
    )

fig.update_yaxes(range=[df_results_mean['r2'].min()-.2, df_results_mean['r2'].max()+.2])
fig.update_layout(height=1000, width=800)

fig.show()

In [328]:
df_results_best_model, df_results_mean_best_model = df_results.xs('EncDecLSTM', level=0), df_results_mean.xs('EncDecLSTM', level=0)

In [329]:
col = 'r2'
do_kalman = True

if do_kalman:
    col = 'best_{}'.format(col)

df_plot = df_results_mean_best_model.reset_index()

fig = px.box(df_plot, x="lag", y=col)
means = df_plot.groupby(by=["lag"]).mean()[col]
medians = df_plot.groupby(by=["lag"]).median()[col]

fig.add_trace(go.Scatter(x=means.index, y=means,
                mode='lines',
                name='Mean'))

# Add figure title
fig.update_layout(
    title_text="Boxplot errors",
    yaxis=dict(
        range=[0, df_plot.max()]
    )
)

fig.show()

In [222]:
df_results_best_model, df_results_mean_best_model = df_results.xs('Poly3', level=0), df_results_mean.xs('Poly3', level=0)

In [225]:
col = 'r2'
do_kalman = False

if do_kalman:
    col = 'best_{}'.format(col)

df_plot = df_results_mean_best_model.reset_index()

fig = px.box(df_plot, x="lag", y=col)
means = df_plot.groupby(by=["lag"]).mean()[col]
medians = df_plot.groupby(by=["lag"]).median()[col]

fig.add_trace(go.Scatter(x=means.index, y=means,
                mode='lines',
                name='Mean'))

# Add figure title
fig.update_layout(
    title_text="Boxplot errors",
    yaxis=dict(
        range=[0, df_plot.max()]
    )
)

fig.show()

In [None]:
df_traffic_lazio = df_traffic_predictions.xs('Lazio', level='Regione')
df_covid_lazio = df_covid_predictions.xs('Lazio', level='Regione')
df_test_lazio = df_test_prediction.xs('Lazio', level='Regione')

In [None]:
df_covid_lazio['R_mean'].loc[pd.to_datetime('2020-10-15'):].plot()
df_test_lazio['R_mean_7'].plot()
plt.legend()

In [None]:
df_test_lazio['R_mean_1'].plot()
df_test_lazio['target_1'].plot()
plt.legend()

In [None]:
df_test_lazio['target_50'].plot()
df_test_lazio['target_1'].plot()

In [None]:
df_test_lazio['R_mean_1'].plot()

In [131]:
# test: eseguo la test_in_interval per vedere come si riempe il walk_forward_validation dataset

df_results = pd.DataFrame() # lag, region, prediction, target; index = date
results_dict = []
min_date, max_date = df_test_prediction.index.get_level_values(0).min(), df_test_prediction.index.get_level_values(0).max()
interval=(min_date, max_date)

models_to_check = list(models_constructors.keys())[0:2]
regions_to_check = regions_to_train[0:2]

for region in regions_to_train:
    df_ts = df_test_prediction.loc[df_test_prediction.index.get_level_values(1)==region]
    df_ts = df_ts.reset_index().set_index('Date').drop(columns='Regione')
    df_ts = df_ts.loc[(df_ts.index>interval[0])&(df_ts.index < interval[1])]
    first_date, last_date = df_ts.index.min(), df_ts.index.max()
    current_region_values = []

    test_dates_region[region] = pd.date_range(first_date, last_date)
    df_ts_test_region[region] = df_ts
    test_dates = test_dates_region[region].unique()
    assert test_dates_region[region].shape[0]==len(test_dates), "Something wrong"
    for idx_lag, lag in enumerate(lags_target):
        for model_name in models_to_check:
            target_col = targets[idx_lag]
            current_df_train_prediction = df_train_prediction.copy().xs(region, level='Regione')
            walk_forward_df = current_df_train_prediction.drop(columns=[col for col in targets if (col not in features) and (col != target_col)]).dropna()
            current_df_ts = df_ts[features+[target_col]].copy().dropna()
            test_dates = current_df_ts.index
            print("{} for {} with lag = {}: {} -> {}, {}\n\n".format(model_name, region, lag, min(test_dates), max(test_dates), current_df_ts.shape))
            for i, t in enumerate(test_dates):
                features_covid = [col for col in features if "R_mean" in col]
                current_df_ts = df_ts.loc[t:t+datetime.timedelta(days=0)]
                if i % 20 == 0:
                    print("LOG: {}".format(t))
                    min_t, max_t = walk_forward_df.index.min(), walk_forward_df.index.max()
                    feature_val_1, feature_val_2 = walk_forward_df[feature_to_show].loc[min_t], walk_forward_df[feature_to_show].loc[max_t]
                    target_val_1, target_val_2 = walk_forward_df[target_col].loc[min_t], walk_forward_df[target_col].loc[max_t]
                    print("BEFORE: {} -> {}\r\n {} = ({}, {})\r\n{} = ({}, {})".format(min_t, max_t, target_col, target_val_1, target_val_2, feature_to_show, feature_val_1, feature_val_2))
                walk_forward_df = walk_forward_df.append(current_df_ts)
                
                window_train = 60
                first_date = pd.to_datetime(t-datetime.timedelta(days=window_train))
                walk_forward_df = walk_forward_df.sort_index()
                walk_forward_df = walk_forward_df.iloc[-1*window_train:]
                feature_to_show = "R_mean_7"
                min_t, max_t = walk_forward_df.index.min(), walk_forward_df.index.max()
                feature_val_1, feature_val_2 = walk_forward_df[feature_to_show].loc[min_t], walk_forward_df[feature_to_show].loc[max_t]
                target_val_1, target_val_2 = walk_forward_df[target_col].loc[min_t], walk_forward_df[target_col].loc[max_t]
                if i % 20 == 0:
                    print("AFTER: {} -> {}\r\n {} = ({}, {})\r\n{} = ({}, {})".format(min_t, max_t, target_col, target_val_1, target_val_2, feature_to_show, feature_val_1, feature_val_2))
                    print("\r\n")

Poly2 for Trentino-Alto Adige with lag = 1: 2020-10-16 00:00:00 -> 2021-01-20 00:00:00, (97, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-15 00:00:00 -> 2020-10-15 00:00:00
 target_1 = (1.2412813224440011, 1.5422487212948168)
R_mean_7 = (2.25231545586442, 1.423923751971896)
AFTER: 2020-05-12 00:00:00 -> 2020-10-16 00:00:00
 target_1 = (1.340527299445097, 1.5067548195845983)
R_mean_7 = (0.5057087246316899, 1.4533125097171518)


LOG: 2020-11-05 00:00:00
BEFORE: 2020-05-31 00:00:00 -> 2020-11-04 00:00:00
 target_1 = (1.1513905952079424, 1.00638206792585)
R_mean_7 = (0.784494761543447, 1.193740696220011)
AFTER: 2020-06-01 00:00:00 -> 2020-11-05 00:00:00
 target_1 = (1.3180765994702326, 0.9854109869150813)
R_mean_7 = (0.7575794240825878, 1.1876416737953353)


LOG: 2020-11-25 00:00:00
BEFORE: 2020-06-20 00:00:00 -> 2020-11-24 00:00:00
 target_1 = (1.0610049905119108, 0.8573140656634863)
R_mean_7 = (1.6035100459986165, 0.8155735004887105)
AFTER: 2020-06-21 00:00:00 -> 2020-11-25 00:00:00
 t

LOG: 2021-01-04 00:00:00
BEFORE: 2020-11-05 00:00:00 -> 2021-01-03 00:00:00
 target_1 = (1.144283353885958, 1.0622177909358446)
R_mean_7 = (1.161739181609875, 1.0757155044066131)
AFTER: 2020-11-06 00:00:00 -> 2021-01-04 00:00:00
 target_1 = (1.1264364003511926, 1.0407188824263942)
R_mean_7 = (1.1416626757530937, 1.0890460659038022)


Poly2 for Puglia with lag = 4: 2020-10-16 00:00:00 -> 2021-01-17 00:00:00, (94, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-14 00:00:00 -> 2020-10-15 00:00:00
 target_4 = (1.2757380705814039, 1.326373951491473)
R_mean_7 = (2.582892221931861, 1.3366183716811402)
AFTER: 2020-05-12 00:00:00 -> 2020-10-16 00:00:00
 target_4 = (0.9071395099425761, 1.3171800532476667)
R_mean_7 = (0.8609494677557437, 1.318933203662381)


LOG: 2020-11-05 00:00:00
BEFORE: 2020-05-31 00:00:00 -> 2020-11-04 00:00:00
 target_4 = (1.1879295806628494, 1.1050512417661147)
R_mean_7 = (0.8448605146030972, 1.1829760666977966)
AFTER: 2020-06-01 00:00:00 -> 2020-11-05 00:00:00
 target_4 = 

LOG: 2020-12-15 00:00:00
BEFORE: 2020-10-16 00:00:00 -> 2020-12-14 00:00:00
 target_4 = (1.4299977502423522, 1.0276801155458508)
R_mean_7 = (1.4401360808346626, 0.9761482290887558)
AFTER: 2020-10-17 00:00:00 -> 2020-12-15 00:00:00
 target_4 = (1.4169547351851501, 1.013165099291969)
R_mean_7 = (1.4182598062277794, 0.9804139308576396)


LOG: 2021-01-04 00:00:00
BEFORE: 2020-11-05 00:00:00 -> 2021-01-03 00:00:00
 target_4 = (1.0314505076077096, 0.8743269615964759)
R_mean_7 = (1.149588557119114, 1.1084418536796297)
AFTER: 2020-11-06 00:00:00 -> 2021-01-04 00:00:00
 target_4 = (1.0175066592206115, 0.8568899636557958)
R_mean_7 = (1.1266679219305993, 1.1022037656450023)


Poly3 for Emilia-Romagna with lag = 4: 2020-10-16 00:00:00 -> 2021-01-17 00:00:00, (94, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-13 00:00:00 -> 2020-10-15 00:00:00
 target_4 = (1.255581994212696, 1.4395596597526867)
R_mean_7 = (1.8115129305309206, 1.4635872359842828)
AFTER: 2020-05-12 00:00:00 -> 2020-10-16 00:00:00
 t

LOG: 2020-12-15 00:00:00
BEFORE: 2020-10-16 00:00:00 -> 2020-12-14 00:00:00
 target_1 = (1.4710942522970485, 1.0826888673336263)
R_mean_7 = (1.790086917933822, 0.9810633544660761)
AFTER: 2020-10-17 00:00:00 -> 2020-12-15 00:00:00
 target_1 = (1.4416835880497265, 1.04208925973885)
R_mean_7 = (1.7130637848695967, 1.0103326585320822)


LOG: 2021-01-04 00:00:00
BEFORE: 2020-11-05 00:00:00 -> 2021-01-03 00:00:00
 target_1 = (0.9404630965493201, 1.0778012924225633)
R_mean_7 = (1.0869291937266883, 1.0743693566785864)
AFTER: 2020-11-06 00:00:00 -> 2021-01-04 00:00:00
 target_1 = (0.930912728000687, 1.0159956698221315)
R_mean_7 = (1.0618816282993073, 1.0892693295715108)


Poly3 for Lombardia with lag = 1: 2020-10-16 00:00:00 -> 2021-01-20 00:00:00, (97, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-13 00:00:00 -> 2020-10-15 00:00:00
 target_1 = (1.1497446217077443, 1.5000966293565257)
R_mean_7 = (1.791502430976775, 1.8497219521896002)
AFTER: 2020-05-12 00:00:00 -> 2020-10-16 00:00:00
 target_1

LOG: 2020-11-05 00:00:00
BEFORE: 2020-05-31 00:00:00 -> 2020-11-04 00:00:00
 target_4 = (1.4139813951927689, 1.0614951400940642)
R_mean_7 = (0.8879670246313287, 1.1197979690391842)
AFTER: 2020-06-01 00:00:00 -> 2020-11-05 00:00:00
 target_4 = (1.3934552363837729, 1.0499244087957846)
R_mean_7 = (0.9857554704251056, 1.0947906844823203)


LOG: 2020-11-25 00:00:00
BEFORE: 2020-06-20 00:00:00 -> 2020-11-24 00:00:00
 target_4 = (2.19800313469079, 0.7650969146296498)
R_mean_7 = (1.2324613007472576, 0.8788327639269852)
AFTER: 2020-06-21 00:00:00 -> 2020-11-25 00:00:00
 target_4 = (2.464000991778339, 0.7529592317573357)
R_mean_7 = (1.2354211317586543, 0.8650161455881272)


LOG: 2020-12-15 00:00:00
BEFORE: 2020-10-16 00:00:00 -> 2020-12-14 00:00:00
 target_4 = (1.3072219018422588, 0.9297200582977272)
R_mean_7 = (1.5453666711472565, 0.8152351018577471)
AFTER: 2020-10-17 00:00:00 -> 2020-12-15 00:00:00
 target_4 = (1.2949547719455743, 0.9264271120682497)
R_mean_7 = (1.4887778301914403, 0.834335043

LOG: 2021-01-04 00:00:00
BEFORE: 2020-11-05 00:00:00 -> 2021-01-03 00:00:00
 target_4 = (1.0867164658277773, 1.1448377531976246)
R_mean_7 = (1.119255056229799, 1.0150284971125525)
AFTER: 2020-11-06 00:00:00 -> 2021-01-04 00:00:00
 target_4 = (1.117175863859798, 1.1304106384204173)
R_mean_7 = (1.0883701038583782, 1.0329656622891243)


Poly2 for Sicilia with lag = 1: 2020-10-16 00:00:00 -> 2021-01-20 00:00:00, (97, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-13 00:00:00 -> 2020-10-15 00:00:00
 target_1 = (1.9258179407432432, 1.2845305183499773)
R_mean_7 = (1.9767471468295013, 1.4493821756591814)
AFTER: 2020-05-12 00:00:00 -> 2020-10-16 00:00:00
 target_1 = (0.942589433883285, 1.25583839563342)
R_mean_7 = (0.7528799319755999, 1.4366008347803598)


LOG: 2020-11-05 00:00:00
BEFORE: 2020-05-31 00:00:00 -> 2020-11-04 00:00:00
 target_1 = (1.5443192918553714, 1.1265789217124502)
R_mean_7 = (0.9968911545321939, 1.1772072498022146)
AFTER: 2020-06-01 00:00:00 -> 2020-11-05 00:00:00
 target_1 =

Poly2 for Valle d'Aosta with lag = 4: 2020-10-16 00:00:00 -> 2021-01-17 00:00:00, (94, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-17 00:00:00 -> 2020-10-15 00:00:00
 target_4 = (0.8976142284021535, 1.1756141846224328)
R_mean_7 = (2.318098956460344, 1.8334725447003652)
AFTER: 2020-05-12 00:00:00 -> 2020-10-16 00:00:00
 target_4 = (1.251698919122785, 1.1438302812570167)
R_mean_7 = (1.3159056494388692, 1.8230759333154871)


LOG: 2020-11-05 00:00:00
BEFORE: 2020-05-31 00:00:00 -> 2020-11-04 00:00:00
 target_4 = (1.658205083356634, 1.009510307927636)
R_mean_7 = (1.8744845855191632, 0.9857375757757928)
AFTER: 2020-06-01 00:00:00 -> 2020-11-05 00:00:00
 target_4 = (1.7071263280346063, 0.9859931768869149)
R_mean_7 = (1.926926118475992, 0.9844356073915013)


LOG: 2020-11-25 00:00:00
BEFORE: 2020-06-20 00:00:00 -> 2020-11-24 00:00:00
 target_4 = (2.2454610563319175, 0.7121235772874793)
R_mean_7 = (2.580857893789517, 0.6869304710133417)
AFTER: 2020-06-21 00:00:00 -> 2020-11-25 00:00:00
 targe

LOG: 2021-01-04 00:00:00
BEFORE: 2020-11-05 00:00:00 -> 2021-01-03 00:00:00
 target_4 = (1.1369124349589266, 1.0172065110952138)
R_mean_7 = (1.1736813525769183, 1.0744564096293583)
AFTER: 2020-11-06 00:00:00 -> 2021-01-04 00:00:00
 target_4 = (1.1413586319484634, 1.014532902305127)
R_mean_7 = (1.1820032919519026, 1.061406558354666)


Poly3 for Calabria with lag = 4: 2020-10-16 00:00:00 -> 2021-01-17 00:00:00, (94, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-17 00:00:00 -> 2020-10-15 00:00:00
 target_4 = (1.189728991951231, 1.393245850573676)
R_mean_7 = (2.020247184215986, 1.5962969249950878)
AFTER: 2020-05-12 00:00:00 -> 2020-10-16 00:00:00
 target_4 = (1.173357442841974, 1.3519443999325391)
R_mean_7 = (1.49969953370191, 1.589043803911625)


LOG: 2020-11-05 00:00:00
BEFORE: 2020-05-31 00:00:00 -> 2020-11-04 00:00:00
 target_4 = (3.0011330186052327, 1.1217595038103991)
R_mean_7 = (2.037694797208828, 1.1671169113162638)
AFTER: 2020-06-01 00:00:00 -> 2020-11-05 00:00:00
 target_4 = (2.

LOG: 2020-12-15 00:00:00
BEFORE: 2020-10-16 00:00:00 -> 2020-12-14 00:00:00
 target_4 = (1.1974420400097883, 0.9873055781378482)
R_mean_7 = (1.5070773320092878, 1.0014635700308991)
AFTER: 2020-10-17 00:00:00 -> 2020-12-15 00:00:00
 target_4 = (1.1808169158644413, 0.9803641040047943)
R_mean_7 = (1.469324491794937, 1.0248006109068875)


LOG: 2021-01-04 00:00:00
BEFORE: 2020-11-05 00:00:00 -> 2021-01-03 00:00:00
 target_4 = (0.845895895684425, 0.9422611635605544)
R_mean_7 = (1.0251768901289056, 1.1199537127861778)
AFTER: 2020-11-06 00:00:00 -> 2021-01-04 00:00:00
 target_4 = (0.8236355338809387, 0.9348898043938196)
R_mean_7 = (1.0102861469905338, 1.1170390626936086)


Poly2 for Friuli Venezia Giulia with lag = 1: 2020-10-16 00:00:00 -> 2021-01-20 00:00:00, (97, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-13 00:00:00 -> 2020-10-15 00:00:00
 target_1 = (1.3563355656256975, 1.4407382867794745)
R_mean_7 = (2.42401751313216, 1.3666008963290177)
AFTER: 2020-05-12 00:00:00 -> 2020-10-16 00:00

LOG: 2020-11-25 00:00:00
BEFORE: 2020-06-20 00:00:00 -> 2020-11-24 00:00:00
 target_1 = (1.6795278485922793, 0.7349559965460637)
R_mean_7 = (1.8659301979609617, 0.8419314182133724)
AFTER: 2020-06-21 00:00:00 -> 2020-11-25 00:00:00
 target_1 = (1.583930522585778, 0.739714122452845)
R_mean_7 = (2.145652232466804, 0.8264910796346314)


LOG: 2020-12-15 00:00:00
BEFORE: 2020-10-16 00:00:00 -> 2020-12-14 00:00:00
 target_1 = (1.3595744115500905, 0.9589863085973175)
R_mean_7 = (1.4204779133403362, 0.8765846824244857)
AFTER: 2020-10-17 00:00:00 -> 2020-12-15 00:00:00
 target_1 = (1.3465100915456016, 0.9592824122316714)
R_mean_7 = (1.4290461474245597, 0.8832678825753938)


LOG: 2021-01-04 00:00:00
BEFORE: 2020-11-05 00:00:00 -> 2021-01-03 00:00:00
 target_1 = (0.9301448244416353, 1.0836072536784784)
R_mean_7 = (1.1536468144850427, 1.1048784099086266)
AFTER: 2020-11-06 00:00:00 -> 2021-01-04 00:00:00
 target_1 = (0.9226673705656927, 1.0479728081925894)
R_mean_7 = (1.1247322644782436, 1.108032947

LOG: 2020-11-05 00:00:00
BEFORE: 2020-05-31 00:00:00 -> 2020-11-04 00:00:00
 target_1 = (1.599450347988657, 0.9910434918611132)
R_mean_7 = (0.7780645126827703, 1.1210146909422514)
AFTER: 2020-06-01 00:00:00 -> 2020-11-05 00:00:00
 target_1 = (1.6431471987610264, 0.9852206145812112)
R_mean_7 = (0.875325312743569, 1.08795041220639)


LOG: 2020-11-25 00:00:00
BEFORE: 2020-06-20 00:00:00 -> 2020-11-24 00:00:00
 target_1 = (1.2829037201625615, 0.9743528260109148)
R_mean_7 = (1.4186115494921732, 0.8191360666929761)
AFTER: 2020-06-21 00:00:00 -> 2020-11-25 00:00:00
 target_1 = (1.3069916544775768, 0.9877326541301692)
R_mean_7 = (1.40482252580179, 0.8232172638121816)


LOG: 2020-12-15 00:00:00
BEFORE: 2020-10-16 00:00:00 -> 2020-12-14 00:00:00
 target_1 = (1.5741654832304914, 1.0696480230344128)
R_mean_7 = (1.4497450121919633, 0.991116112062361)
AFTER: 2020-10-17 00:00:00 -> 2020-12-15 00:00:00
 target_1 = (1.5356959898111362, 1.0567749317118706)
R_mean_7 = (1.4240238916262085, 1.0017827346626

Poly2 for Basilicata with lag = 4: 2020-10-16 00:00:00 -> 2021-01-17 00:00:00, (94, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-23 00:00:00 -> 2020-10-15 00:00:00
 target_4 = (0.7784412684100162, 1.3555683480893004)
R_mean_7 = (1.8218552889165465, 1.232057516173037)
AFTER: 2020-05-12 00:00:00 -> 2020-10-16 00:00:00
 target_4 = (1.755133808152495, 1.328626831382281)
R_mean_7 = (0.9375984752918406, 1.3515496150807174)


LOG: 2020-11-05 00:00:00
BEFORE: 2020-05-31 00:00:00 -> 2020-11-04 00:00:00
 target_4 = (2.9968844412865727, 1.0626323495216958)
R_mean_7 = (2.042059079163291, 1.3548993312946354)
AFTER: 2020-06-01 00:00:00 -> 2020-11-05 00:00:00
 target_4 = (2.9734624579902014, 1.0397777848689391)
R_mean_7 = (2.104935682873928, 1.352486903292859)


LOG: 2020-11-25 00:00:00
BEFORE: 2020-06-20 00:00:00 -> 2020-11-24 00:00:00
 target_4 = (2.9678849006852057, 0.7202913722734214)
R_mean_7 = (2.5309936778354247, 0.9323596443911263)
AFTER: 2020-06-21 00:00:00 -> 2020-11-25 00:00:00
 target_4

LOG: 2021-01-04 00:00:00
BEFORE: 2020-11-05 00:00:00 -> 2021-01-03 00:00:00
 target_4 = (1.0666367913704846, 1.0115700153267637)
R_mean_7 = (1.0792594082909468, 0.9994381485553782)
AFTER: 2020-11-06 00:00:00 -> 2021-01-04 00:00:00
 target_4 = (1.0585539843184617, 0.9865728256274376)
R_mean_7 = (1.0917412038964787, 1.0185792892156231)


Poly3 for Sardegna with lag = 4: 2020-10-16 00:00:00 -> 2021-01-17 00:00:00, (94, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-16 00:00:00 -> 2020-10-15 00:00:00
 target_4 = (1.0851679321361742, 1.1945541587622768)
R_mean_7 = (2.290302424319552, 1.2540241453800856)
AFTER: 2020-05-12 00:00:00 -> 2020-10-16 00:00:00
 target_4 = (1.3432629133833882, 1.1816666651078256)
R_mean_7 = (1.0160374306820084, 1.2439194502867055)


LOG: 2020-11-05 00:00:00
BEFORE: 2020-05-31 00:00:00 -> 2020-11-04 00:00:00
 target_4 = (2.0008803220670996, 1.0726835339921128)
R_mean_7 = (2.378977203260382, 1.0745465133835546)
AFTER: 2020-06-01 00:00:00 -> 2020-11-05 00:00:00
 target

R_mean_7 = (0.7532771444490076, 0.8882983574335417)


LOG: 2020-12-15 00:00:00
BEFORE: 2020-10-16 00:00:00 -> 2020-12-14 00:00:00
 target_4 = (1.3018324704353859, 1.0461484991404606)
R_mean_7 = (1.5780049192312402, 0.9309452563760454)
AFTER: 2020-10-17 00:00:00 -> 2020-12-15 00:00:00
 target_4 = (1.267862213551905, 1.0378096497537654)
R_mean_7 = (1.5956790488415598, 0.9450106805320104)


LOG: 2021-01-04 00:00:00
BEFORE: 2020-11-05 00:00:00 -> 2021-01-03 00:00:00
 target_4 = (0.9889311508504899, 0.9026352998985281)
R_mean_7 = (1.0917261253499895, 1.131894483015449)
AFTER: 2020-11-06 00:00:00 -> 2021-01-04 00:00:00
 target_4 = (0.9795901956045985, 0.8797822854303254)
R_mean_7 = (1.074148914164577, 1.1358452322157782)


Poly2 for Umbria with lag = 1: 2020-10-16 00:00:00 -> 2021-01-20 00:00:00, (97, 36)


LOG: 2020-10-16 00:00:00
BEFORE: 2020-03-13 00:00:00 -> 2020-10-15 00:00:00
 target_1 = (1.612657413533502, 1.363840067094043)
R_mean_7 = (2.992584730621255, 1.5342837210092082)
AFTER: 20

LOG: 2020-11-05 00:00:00
BEFORE: 2020-05-31 00:00:00 -> 2020-11-04 00:00:00
 target_1 = (0.7899853420348282, 1.0762846810267512)
R_mean_7 = (0.7999826858804902, 1.1725648357652858)
AFTER: 2020-06-01 00:00:00 -> 2020-11-05 00:00:00
 target_1 = (0.7978612536662419, 1.056260593930809)
R_mean_7 = (0.7828720845612446, 1.179722757758951)


LOG: 2020-11-25 00:00:00
BEFORE: 2020-06-20 00:00:00 -> 2020-11-24 00:00:00
 target_1 = (0.7497021552502973, 0.738693795572963)
R_mean_7 = (1.0611753995317856, 0.7577271912553776)
AFTER: 2020-06-21 00:00:00 -> 2020-11-25 00:00:00
 target_1 = (0.752731361148903, 0.7537299604201955)
R_mean_7 = (0.9993151685318964, 0.7464594674347798)


LOG: 2020-12-15 00:00:00
BEFORE: 2020-10-16 00:00:00 -> 2020-12-14 00:00:00
 target_1 = (1.4353795829243599, 0.9586885856756323)
R_mean_7 = (1.6271495414578405, 0.8704498607968718)
AFTER: 2020-10-17 00:00:00 -> 2020-12-15 00:00:00
 target_1 = (1.4004976817019814, 0.9476461807460993)
R_mean_7 = (1.6197757499905157, 0.8816696300

LOG: 2020-11-25 00:00:00
BEFORE: 2020-06-20 00:00:00 -> 2020-11-24 00:00:00
 target_4 = (1.379738705879741, 0.7724547390469105)
R_mean_7 = (1.1096371193461485, 0.7263734890933353)
AFTER: 2020-06-21 00:00:00 -> 2020-11-25 00:00:00
 target_4 = (1.4168042199944098, 0.8057263755071644)
R_mean_7 = (1.054437972451796, 0.7004924407820112)


LOG: 2020-12-15 00:00:00
BEFORE: 2020-10-16 00:00:00 -> 2020-12-14 00:00:00
 target_4 = (1.4067837329185389, 0.9853119441650675)
R_mean_7 = (1.4555496025296426, 0.9162486367196836)
AFTER: 2020-10-17 00:00:00 -> 2020-12-15 00:00:00
 target_4 = (1.3786173993755795, 0.9683241564838233)
R_mean_7 = (1.4301883804924196, 0.9210236717264397)


LOG: 2021-01-04 00:00:00
BEFORE: 2020-11-05 00:00:00 -> 2021-01-03 00:00:00
 target_4 = (0.9315113215518322, 0.9639626470877942)
R_mean_7 = (1.0440143057998732, 1.040978664579893)
AFTER: 2020-11-06 00:00:00 -> 2021-01-04 00:00:00
 target_4 = (0.9178246172108369, 0.9607437381832558)
R_mean_7 = (1.017526061475317

In [132]:
pd.options.display.max_rows = 80

In [133]:
df_test_prediction.loc[(df_test_prediction.index.get_level_values('Regione')=='Trentino-Alto Adige')&(df_test_prediction.index.get_level_values('Date')>pd.to_datetime('2020-10-25'))&(df_test_prediction.index.get_level_values('Date')<pd.to_datetime('2021-01-10'))]

Unnamed: 0_level_0,Unnamed: 1_level_0,target_1,target_4,Handover_smoothened_1,Download vol._smoothened_1,Upload vol._smoothened_1,#Users_smoothened_1,R_mean_1,Handover_smoothened_2,Download vol._smoothened_2,Upload vol._smoothened_2,#Users_smoothened_2,R_mean_2,Handover_smoothened_3,Download vol._smoothened_3,Upload vol._smoothened_3,#Users_smoothened_3,R_mean_3,Handover_smoothened_4,Download vol._smoothened_4,Upload vol._smoothened_4,#Users_smoothened_4,R_mean_4,Handover_smoothened_5,Download vol._smoothened_5,Upload vol._smoothened_5,#Users_smoothened_5,R_mean_5,Handover_smoothened_6,Download vol._smoothened_6,Upload vol._smoothened_6,#Users_smoothened_6,R_mean_6,Handover_smoothened_7,Download vol._smoothened_7,Upload vol._smoothened_7,#Users_smoothened_7,R_mean_7
Date,Regione,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
2020-10-26,Trentino-Alto Adige,1.201458,1.178934,0.112607,0.305033,0.102487,0.373248,1.241745,0.160328,0.307784,0.102522,0.390815,1.26757,0.228339,0.297825,0.104416,0.428422,1.289144,0.269801,0.282002,0.100375,0.446883,1.305061,0.292915,0.267818,0.092064,0.450409,1.32258,0.300798,0.250409,0.074142,0.438799,1.350639,0.298632,0.239135,0.050776,0.415976,1.395191
2020-10-27,Trentino-Alto Adige,1.193741,1.159298,0.097099,0.274554,0.097356,0.376656,1.217772,0.112607,0.305033,0.102487,0.373248,1.241745,0.160328,0.307784,0.102522,0.390815,1.26757,0.228339,0.297825,0.104416,0.428422,1.289144,0.269801,0.282002,0.100375,0.446883,1.305061,0.292915,0.267818,0.092064,0.450409,1.32258,0.300798,0.250409,0.074142,0.438799,1.350639
2020-10-28,Trentino-Alto Adige,1.187642,1.129114,0.078227,0.237846,0.086319,0.360894,1.201458,0.097099,0.274554,0.097356,0.376656,1.217772,0.112607,0.305033,0.102487,0.373248,1.241745,0.160328,0.307784,0.102522,0.390815,1.26757,0.228339,0.297825,0.104416,0.428422,1.289144,0.269801,0.282002,0.100375,0.446883,1.305061,0.292915,0.267818,0.092064,0.450409,1.32258
2020-10-29,Trentino-Alto Adige,1.178934,1.09366,0.040954,0.195333,0.061563,0.317022,1.193741,0.078227,0.237846,0.086319,0.360894,1.201458,0.097099,0.274554,0.097356,0.376656,1.217772,0.112607,0.305033,0.102487,0.373248,1.241745,0.160328,0.307784,0.102522,0.390815,1.26757,0.228339,0.297825,0.104416,0.428422,1.289144,0.269801,0.282002,0.100375,0.446883,1.305061
2020-10-30,Trentino-Alto Adige,1.159298,1.060036,-0.024568,0.149104,0.021902,0.237435,1.187642,0.040954,0.195333,0.061563,0.317022,1.193741,0.078227,0.237846,0.086319,0.360894,1.201458,0.097099,0.274554,0.097356,0.376656,1.217772,0.112607,0.305033,0.102487,0.373248,1.241745,0.160328,0.307784,0.102522,0.390815,1.26757,0.228339,0.297825,0.104416,0.428422,1.289144
2020-10-31,Trentino-Alto Adige,1.129114,1.030763,-0.124501,0.092465,-0.033145,0.10329,1.178934,-0.024568,0.149104,0.021902,0.237435,1.187642,0.040954,0.195333,0.061563,0.317022,1.193741,0.078227,0.237846,0.086319,0.360894,1.201458,0.097099,0.274554,0.097356,0.376656,1.217772,0.112607,0.305033,0.102487,0.373248,1.241745,0.160328,0.307784,0.102522,0.390815,1.26757
2020-11-01,Trentino-Alto Adige,1.09366,1.006382,-0.284524,0.015564,-0.110616,-0.117397,1.159298,-0.124501,0.092465,-0.033145,0.10329,1.178934,-0.024568,0.149104,0.021902,0.237435,1.187642,0.040954,0.195333,0.061563,0.317022,1.193741,0.078227,0.237846,0.086319,0.360894,1.201458,0.097099,0.274554,0.097356,0.376656,1.217772,0.112607,0.305033,0.102487,0.373248,1.241745
2020-11-02,Trentino-Alto Adige,1.060036,0.985411,-0.25389,0.143758,-0.029269,0.053099,1.129114,-0.284524,0.015564,-0.110616,-0.117397,1.159298,-0.124501,0.092465,-0.033145,0.10329,1.178934,-0.024568,0.149104,0.021902,0.237435,1.187642,0.040954,0.195333,0.061563,0.317022,1.193741,0.078227,0.237846,0.086319,0.360894,1.201458,0.097099,0.274554,0.097356,0.376656,1.217772
2020-11-03,Trentino-Alto Adige,1.030763,0.965369,-0.199033,0.213926,0.033571,0.178506,1.09366,-0.25389,0.143758,-0.029269,0.053099,1.129114,-0.284524,0.015564,-0.110616,-0.117397,1.159298,-0.124501,0.092465,-0.033145,0.10329,1.178934,-0.024568,0.149104,0.021902,0.237435,1.187642,0.040954,0.195333,0.061563,0.317022,1.193741,0.078227,0.237846,0.086319,0.360894,1.201458
2020-11-04,Trentino-Alto Adige,1.006382,0.945146,-0.173859,0.272631,0.084371,0.250394,1.060036,-0.199033,0.213926,0.033571,0.178506,1.09366,-0.25389,0.143758,-0.029269,0.053099,1.129114,-0.284524,0.015564,-0.110616,-0.117397,1.159298,-0.124501,0.092465,-0.033145,0.10329,1.178934,-0.024568,0.149104,0.021902,0.237435,1.187642,0.040954,0.195333,0.061563,0.317022,1.193741


In [None]:
# TODO provare a fare le stesse print su df_train per capire se i dati sono allineati (e.g. se in zona luglio con target lontano non sto smongolando perchè tronco i dati visto il vuoto) -> X -> dati del covid li abbiamo tutti, stare attenti a predirre più di 30
# TODO provare con funzione test_... se è corretto -> OK, sono corretti!
# TODO provare a plottare in più situazioni in cui la curva aumenta e confrontare con lo shifted di N: potrebbe dare l'idea e quindi potrebbe essere che il modello è actually giusto
# TODO provare a trainare solamente con dati di traffico in input
# TODO provare a vedere se, quando faccio walk forward, non sto dando dati errati -> OK, sono corretti!

In [89]:
# 1 check se tronco
df_train_prediction.loc[(df_train_prediction.index.get_level_values('Regione')=='Toscana')&(df_train_prediction.index.get_level_values('Date')<pd.to_datetime('2020-08-01'))]

Unnamed: 0_level_0,Unnamed: 1_level_0,target_1,target_4,Handover_smoothened_1,Download vol._smoothened_1,Upload vol._smoothened_1,#Users_smoothened_1,R_mean_1,Handover_smoothened_2,Download vol._smoothened_2,Upload vol._smoothened_2,#Users_smoothened_2,R_mean_2,Handover_smoothened_3,Download vol._smoothened_3,Upload vol._smoothened_3,#Users_smoothened_3,R_mean_3,Handover_smoothened_4,Download vol._smoothened_4,Upload vol._smoothened_4,#Users_smoothened_4,R_mean_4,Handover_smoothened_5,Download vol._smoothened_5,Upload vol._smoothened_5,#Users_smoothened_5,R_mean_5,Handover_smoothened_6,Download vol._smoothened_6,Upload vol._smoothened_6,#Users_smoothened_6,R_mean_6,Handover_smoothened_7,Download vol._smoothened_7,Upload vol._smoothened_7,#Users_smoothened_7,R_mean_7
Date,Regione,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
2020-03-13,Toscana,1.409653,1.205656,0.086667,0.838634,0.899637,0.839822,1.549647,0.242482,0.785961,0.784053,0.861876,1.609496,0.387687,0.731152,0.662746,0.874244,1.675746,0.519333,0.686303,0.547232,0.876614,1.772866,0.623368,0.657092,0.448864,0.866889,1.961975,0.738432,0.623472,0.372198,0.873611,2.302152,0.861080,0.594543,0.323952,0.898550,2.883786
2020-03-14,Toscana,1.334812,1.155466,-0.056382,0.877591,1.000454,0.812301,1.483864,0.086667,0.838634,0.899637,0.839822,1.549647,0.242482,0.785961,0.784053,0.861876,1.609496,0.387687,0.731152,0.662746,0.874244,1.675746,0.519333,0.686303,0.547232,0.876614,1.772866,0.623368,0.657092,0.448864,0.866889,1.961975,0.738432,0.623472,0.372198,0.873611,2.302152
2020-03-15,Toscana,1.266516,1.112814,-0.195722,0.907230,1.080408,0.779590,1.409653,-0.056382,0.877591,1.000454,0.812301,1.483864,0.086667,0.838634,0.899637,0.839822,1.549647,0.242482,0.785961,0.784053,0.861876,1.609496,0.387687,0.731152,0.662746,0.874244,1.675746,0.519333,0.686303,0.547232,0.876614,1.772866,0.623368,0.657092,0.448864,0.866889,1.961975
2020-03-16,Toscana,1.205656,1.077190,-0.281092,0.916625,1.151290,0.764963,1.334812,-0.195722,0.907230,1.080408,0.779590,1.409653,-0.056382,0.877591,1.000454,0.812301,1.483864,0.086667,0.838634,0.899637,0.839822,1.549647,0.242482,0.785961,0.784053,0.861876,1.609496,0.387687,0.731152,0.662746,0.874244,1.675746,0.519333,0.686303,0.547232,0.876614,1.772866
2020-03-17,Toscana,1.155466,1.040578,-0.298593,0.913986,1.217295,0.772684,1.266516,-0.281092,0.916625,1.151290,0.764963,1.334812,-0.195722,0.907230,1.080408,0.779590,1.409653,-0.056382,0.877591,1.000454,0.812301,1.483864,0.086667,0.838634,0.899637,0.839822,1.549647,0.242482,0.785961,0.784053,0.861876,1.609496,0.387687,0.731152,0.662746,0.874244,1.675746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-27,Toscana,1.418640,1.174384,0.784826,0.290727,0.213295,0.422114,1.441809,0.801694,0.307781,0.246150,0.455761,1.416804,0.807157,0.325903,0.268909,0.478144,1.379739,0.813372,0.339284,0.287148,0.494207,1.328699,0.820491,0.351721,0.293651,0.502813,1.241947,0.828898,0.364256,0.291031,0.503196,1.195720,0.850103,0.360219,0.294665,0.513637,1.114787
2020-06-28,Toscana,1.361848,1.085164,0.729535,0.263418,0.165130,0.368073,1.458513,0.784826,0.290727,0.213295,0.422114,1.441809,0.801694,0.307781,0.246150,0.455761,1.416804,0.807157,0.325903,0.268909,0.478144,1.379739,0.813372,0.339284,0.287148,0.494207,1.328699,0.820491,0.351721,0.293651,0.502813,1.241947,0.828898,0.364256,0.291031,0.503196,1.195720
2020-06-29,Toscana,1.271053,1.027668,0.644792,0.210194,0.113431,0.299335,1.418640,0.729535,0.263418,0.165130,0.368073,1.458513,0.784826,0.290727,0.213295,0.422114,1.441809,0.801694,0.307781,0.246150,0.455761,1.416804,0.807157,0.325903,0.268909,0.478144,1.379739,0.813372,0.339284,0.287148,0.494207,1.328699,0.820491,0.351721,0.293651,0.502813,1.241947
2020-06-30,Toscana,1.174384,0.982454,0.545263,0.109779,0.048261,0.216870,1.361848,0.644792,0.210194,0.113431,0.299335,1.418640,0.729535,0.263418,0.165130,0.368073,1.458513,0.784826,0.290727,0.213295,0.422114,1.441809,0.801694,0.307781,0.246150,0.455761,1.416804,0.807157,0.325903,0.268909,0.478144,1.379739,0.813372,0.339284,0.287148,0.494207,1.328699


In [91]:
df_covid_predictions.loc[(df_covid_predictions.index.get_level_values('Regione')=='Toscana')&(df_covid_predictions.index.get_level_values('Date')<pd.to_datetime('2020-07-10'))&(df_covid_predictions.index.get_level_values('Date')>pd.to_datetime('2020-06-20'))]['R_mean']

Date        Regione
2020-06-21  Toscana    1.195720
2020-06-22  Toscana    1.241947
2020-06-23  Toscana    1.328699
2020-06-24  Toscana    1.379739
2020-06-25  Toscana    1.416804
2020-06-26  Toscana    1.441809
2020-06-27  Toscana    1.458513
2020-06-28  Toscana    1.418640
2020-06-29  Toscana    1.361848
2020-06-30  Toscana    1.271053
2020-07-01  Toscana    1.174384
2020-07-02  Toscana    1.085164
2020-07-03  Toscana    1.027668
2020-07-04  Toscana    0.982454
2020-07-05  Toscana    0.973146
2020-07-06  Toscana    0.977118
2020-07-07  Toscana    0.989556
2020-07-08  Toscana    1.009223
2020-07-09  Toscana    1.035281
Name: R_mean, dtype: float64

In [None]:
# TODO handover ha spikes, mooolto dovuto al fatto che ci sono dei casi a 0! basta fare preprocessing! -> OK
# TODO capire perchè poly2 e poly3 hanno valori iniziali molto sballati -> secondo me perchè vengono influenzati da prima parte di training più degli altri!!!