In [3]:
import os
import itertools

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import random

from os import listdir
from os.path import isfile, join
import pickle

plt.style.use('seaborn-white')

%matplotlib inline

from scipy.stats import gamma, poisson

import epyestim
import epyestim.covid19 as covid19
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE, r2_score
from xgboost import XGBRegressor, DMatrix, train
from sklearn.multioutput import MultiOutputRegressor

from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
from pykalman import KalmanFilter

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, TimeDistributed, RepeatVector
from keras.callbacks import EarlyStopping

import plotly.express as px
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

from functools import partial
import copy


to_sum_KPIs = ['totale_casi_giornalieri', 'terapia_intensiva_giornalieri', 'terapia_intensiva', 'nuovi_positivi', 'tamponi_giornalieri']
covidKPIsPrecompute = ['%pos']+to_sum_KPIs
trafficKPIsPrecompute = ['Handover', 'Download vol.', 'Upload vol.', '#Users']

# sums regions such as trento + bolzano
def sumRegions(df, dateCol = 'Date', regionCol='Regione', cols = to_sum_KPIs, region1 = "P.A. Bolzano", region2 = "P.A. Trento", regionNew = "Trentino-Alto Adige"):
    dfRegion1, dfRegion2 = df.loc[df[regionCol] == region1], df.loc[df[regionCol] == region2]
    dfRegion1.set_index(dateCol, inplace=True)
    dfRegion2.set_index(dateCol, inplace=True)
    newVals = dfRegion1[to_sum_KPIs]+dfRegion2[to_sum_KPIs]
    newVals.reset_index(inplace=True)
    newVals['Regione'] = regionNew
    df = df.loc[(df[regionCol] != region1) & (df[regionCol] != region2)]
    return df.append(newVals)

# adds italy as cumulative over days
def addItalyData(df, cols):
    dfTemp = df.resample('D', on='Date').sum().reset_index()
    dfTemp['Regione']='Italia'
    dfTemp = dfTemp[cols]
    return pd.concat([df, dfTemp])

def fill_with_areas(dateRange, fig, is_train):
    if is_train:
        color = 'rgba(255, 0, 0, 0.2)'
    else:
        color = 'rgba(0, 0, 255, 0.2)'
    fig.add_shape(type="rect",
        yref="paper",
        x0=dateRange[0], y0=0,
        x1=dateRange[-1], y1=1,
        line=dict(
            width=0,
        ),
        fillcolor=color,
    )
    
    return fig

data_path = "/Users/filipkrasniqi/Documents/Datasets.tmp/traffic-covid/"
by_region_path = "{}By_Region/".format(data_path)
saved = "{}saved/".format(data_path)
traffic_daily = "{}TS_1800_daily.pkl".format(saved)
region_traffic_daily = "{}all.pkl".format(saved)
covid = "{}covid/".format(data_path)
covid_daily = "{}covid_2303.csv".format(covid)

capped_last_date = pd.to_datetime('2021-01-15')

# Preprocessing

## Handle temperature data

### Import

In [4]:
meteo_path = "{}meteo/".format(data_path)
dfs_filenames = [f for f in listdir(meteo_path) if isfile(join(meteo_path, f))]
dfs = []
path_temperature_predictions = "{}predictions/temperatures.csv".format(saved)
SAVE_TEMPERATURE = False
if SAVE_TEMPERATURE:
    for f in dfs_filenames:
        splits = f.split("_")
        if len(splits) == 2 and "." in splits[1]:
            filename = "{}{}".format(meteo_path, f)
            current_df = pd.read_csv(filename)
            region_name = splits[0]
            #if "rentino" not in region_name and "osta" not in region_name:
            month = splits[1][4:].split(".")[0]
            current_df['Regione'] = [r for r in regions_covid if region_name in r.lower()][0]
            current_df['month'] = int(month)
            current_df['year'] = int(2021 if "2021" in filename else 2020)
            dfs.append(current_df)
        df_temperature = pd.concat(dfs)
        df_temperature['Date'] = df_temperature.apply(lambda x: pd.to_datetime("{}/{}/{}".format(x.year, x.month, int(x.date.split(" ")[1]))), axis=1)
        df_temperature.set_index(['Date', 'Regione'], inplace=True)
        df_temperature['Date'] = pd.to_datetime(df_temperature['Date'])
        df_temperature.to_csv(path_temperature_predictions)
else:
    df_temperature = pd.read_csv(path_temperature_predictions)
    df_temperature['Date'] = pd.to_datetime(df_temperature['Date'])
    df_temperature.set_index(['Date', 'Regione'], inplace=True)
    
regions_temperature = df_temperature.index.get_level_values(1).unique()
regions = regions_temperature

## Handle COVID data

### Import

In [5]:
recompute_rt = False
import_covid  = True
path_covid = "{}covid.csv".format(saved)

if import_covid:
    df_covid = pd.read_csv(covid_daily)
    if "Regione" not in df_covid.columns:
        df_covid.rename(columns={'denominazione_regione': 'Regione'}, inplace=True)
        df_covid['tamponi_giornalieri'] = df_covid.groupby([
                        'Regione'])['tamponi'].diff()
        df_covid.loc[df_covid['tamponi_giornalieri'].isna() ,
                               'tamponi_giornalieri'] = df_covid['tamponi']


        df_covid['deceduti_giornalieri'] = df_covid.groupby([
                            'Regione'])['deceduti'].diff()
        df_covid.loc[df_covid['deceduti_giornalieri'].isna() ,
                               'deceduti_giornalieri'] = df_covid['deceduti']

        df_covid['terapia_intensiva_giornalieri'] = df_covid.groupby([
                            'Regione'])['terapia_intensiva'].diff()
        df_covid.loc[df_covid['terapia_intensiva_giornalieri'].isna() ,
                               'terapia_intensiva_giornalieri'] = df_covid['terapia_intensiva']

        df_covid['totale_casi_giornalieri'] = df_covid.groupby([
                            'Regione'])['totale_casi'].diff()
        df_covid.loc[df_covid['totale_casi_giornalieri'].isna() ,
                               'totale_casi_giornalieri'] = df_covid['totale_casi']
    covid_cols = ['Date', 'Regione', 'terapia_intensiva', 'nuovi_positivi', 'tamponi_giornalieri', 'totale_casi', 'deceduti', 'totale_casi_giornalieri', 'terapia_intensiva_giornalieri']
    
    try:
        df_covid.data = pd.to_datetime(df_covid.data)
        df_covid.rename(columns={'data': 'Date'}, inplace=True)
    except:
        pass # already correct name
    df_covid = sumRegions(df_covid)
    regions_covid = df_covid['Regione'].unique()
    #df_covid = df_covid[df_covid['Regione'].isin(regions)].dropna()
    df_covid.to_csv(path_covid)
else:
    try:
        del df_covid
    except:
        print("No df covid")

### Compute Rt

In [6]:
dfs = []
path_covid_predictions="{}predictions/covid_2303.pkl".format(saved)
if recompute_rt:
    for r in regions:
        print("REGIONE: {}".format(r))
        current_df = df_covid.loc[df_covid['Regione'] == r]
        current_df['Date'] = pd.to_datetime(current_df['Date']).dt.date
        current_df['DateIndex'] = current_df.loc[:, 'Date']
        current_df.set_index('DateIndex', inplace=True)
        #current_df = current_df.loc[current_df['nuovi_positivi'] > 0]
        current_df = current_df.loc[pd.to_datetime('2020/03/01'):]
        idxs = (current_df['nuovi_positivi'] < 0)# | (current_df.isna()) | (current_df['nuovi_positivi'] == np.inf) | (current_df['nuovi_positivi'] == -np.inf)
        if idxs.sum() > 0:
            current_df.loc[idxs, 'nuovi_positivi'] = np.nan
        current_df.fillna(method='ffill', inplace=True)
        current_df.dropna(subset=['nuovi_positivi'], inplace=True)
        #current_df[current_df.loc[:, 'nuovi_positivi']]
        #current_df.dropna(subset=['nuovi_positivi'], inplace=True)
        #
        current_df = current_df.drop_duplicates(keep='first')
        #print(current_df['nuovi_positivi'].shape, current_df['nuovi_positivi'].apply(lambda x: x < 0).sum())
        #current_df.dropna(subset=['totale_casi_giornalieri'], inplace=True)
        #print(current_df['totale_casi_giornalieri'].isna().sum())
        #print(current_df['totale_casi_giornalieri'].sum())
        r_t_series = covid19.r_covid(current_df['nuovi_positivi'])
        current_df = pd.merge(current_df, r_t_series, left_index=True, right_index=True)
        dfs.append(current_df)
    df_covid_predictions = pd.concat(dfs)
    del dfs
    df_covid_predictions.set_index(['Date', 'Regione'], inplace=True)
    df_covid_predictions['%pos'] = (df_covid_predictions['nuovi_positivi']/df_covid_predictions['tamponi_giornalieri'])
    df_covid_predictions.to_pickle(path_covid_predictions)
else:
    df_covid_predictions = pd.read_pickle(path_covid_predictions)

df_unseen_covid = df_covid_predictions.loc[df_covid_predictions.index.get_level_values('Date')>=capped_last_date]
#df_covid_predictions = df_covid_predictions.loc[df_covid_predictions.index.get_level_values('Date')<capped_last_date]

## Handle traffic data

### Import

In [10]:
import_traffic = True
recompute_kalman = False
recompute_MA = True
path_traffic = "{}traffic.csv".format(saved)
start_date_unavailable, end_date_unavailable = pd.to_datetime('2020-08-01'), pd.to_datetime('2020-10-01')
dates_unavailable = pd.date_range(start_date_unavailable, end_date_unavailable)
if import_traffic:
    df_traffic_daily = pd.read_pickle(region_traffic_daily)
    df_traffic_daily.loc[df_traffic_daily['Regione'] == "Emilia Romagna", "Regione"] = "Emilia-Romagna"
    df_traffic_predictions = df_traffic_daily.loc[df_traffic_daily['Regione'].isin(regions)]
    df_traffic_predictions = df_traffic_predictions.groupby('Regione').resample('D', on='Date').sum().reset_index()
    df_traffic_predictions = df_traffic_predictions.replace({'0':np.nan, 0:np.nan})
    df_traffic_predictions = df_traffic_predictions.loc[(df_traffic_predictions['Date']<start_date_unavailable)|(df_traffic_predictions['Date']>=end_date_unavailable)]
    df_traffic_predictions = df_traffic_predictions.fillna(method='ffill')
    df_traffic_predictions['Date'] = pd.to_datetime(df_traffic_predictions['Date']).dt.date
    df_traffic_predictions.set_index(['Date', 'Regione'], inplace=True)
    df_traffic_predictions.to_csv(path_traffic)
else:
    df_traffic_predictions = pd.read_csv(path_traffic)
    df_traffic_predictions['Date'] = pd.to_datetime(df_traffic_predictions['Date']).dt.date
    df_traffic_predictions.set_index(['Date', 'Regione'], inplace=True)

In [11]:
# apply MA to traffic features
path_traffic_predictions="{}predictions/traffic.pkl".format(saved)
if recompute_MA:
    for trafficKPI in trafficKPIsPrecompute:
        #current_df_kalman = pd.DataFrame({"{}_smoothened".format(trafficKPI): []})
        dfs_current_kpi = []
        for region in regions:
            series = df_traffic_predictions.xs(region, level=1)[trafficKPI]
            # t = 21/03 -> current_series = series[:t] -> .em(current_series) -> features_for_day_t = smooth(current_series)
            rolling_amount = 7
            series_ma = series.rolling(rolling_amount).mean()
            
            series_ma = series_ma.loc[(series_ma.index<start_date_unavailable)|(series_ma.index>=end_date_unavailable+pd.Timedelta(days=rolling_amount))]
            
            df_region_kpi = pd.DataFrame({"noisy": series})
            df_region_kpi['smooth'] = series_ma
            df_region_kpi['Regione'] = region
            df_region_kpi.reset_index(inplace=True)
            df_region_kpi.set_index(['Date', 'Regione'], inplace=True)
            dfs_current_kpi.append(df_region_kpi)

        df_traffic_predictions["{}_MA".format(trafficKPI)] = pd.concat(dfs_current_kpi)['smooth']
        df_traffic_predictions.to_pickle(path_traffic_predictions)
else:
    df_traffic_predictions = pd.read_pickle(path_traffic_predictions)

# Forecasting

## Define KPIs

In [14]:
trafficKPIs = [col for col in df_traffic_predictions.columns if "MA" in col]
covidKPIs = [col for col in df_covid_predictions.columns if "mean" in col]
temperatureKPIs = []#[col for col in df_temperature.columns if "min" in col]
targetCovid = ['R_mean']

In [15]:
trafficKPIs

['Handover_MA', 'Download vol._MA', 'Upload vol._MA', '#Users_MA']

In [16]:
def build_df_prediction(range_dates, min_farsightness, farsightness, delta_features, step_target):
    lags = range(delta_features)
    lags_target = range(min_farsightness, farsightness, step_target)
    all_dfs = []
    # prima ondata
    for region in regions_to_train:
        # filter ts by region
        df_traffic_ts = df_traffic_predictions.loc[(df_traffic_predictions.index.get_level_values(1)==region), trafficKPIs].copy()
        df_covid_ts = df_covid_predictions.loc[df_covid_predictions.index.get_level_values(1)==region, list(set(covidKPIs+targetCovid))].copy()
        df_temperature_ts = df_temperature.loc[df_temperature.index.get_level_values(1)==region, temperatureKPIs].copy()
        
        df_traffic_ts = df_traffic_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))
        df_temperature_ts = df_temperature_ts.groupby(level=1).transform(lambda x: (x-x.mean())/x.std(ddof=1))

        df_covid_ts.reset_index(inplace=True)
        df_temperature_ts.reset_index(inplace=True)
        df_traffic_ts.reset_index(inplace=True)

        df_covid_ts = df_covid_ts.set_index('Date')
        df_temperature_ts = df_temperature_ts.set_index('Date')
        df_traffic_ts = df_traffic_ts.set_index('Date')

        df_ts = pd.DataFrame()
        features = []
        targets = []
        target_col = targetCovid[0]
        df_target_ts = df_covid_ts.copy()

        train_dates_intersection = df_traffic_ts.index.intersection(df_covid_ts.index)

        train_dates = []
        if len(temperatureKPIs) > 0:
            train_dates_intersection = train_dates_intersection.isin(df_temperature_ts.index)
        for date_val in train_dates_intersection:
            if any(date_val in x for x in range_dates):
                train_dates.append(date_val)

        train_dates = pd.to_datetime(train_dates)

        df_covid_ts, df_traffic_ts, df_temperature_ts = df_covid_ts.loc[df_covid_ts.index.isin(train_dates)], df_traffic_ts.loc[df_traffic_ts.index.isin(train_dates)], df_temperature_ts.loc[df_temperature_ts.index.isin(train_dates)]

        for lag in lags_target:
            target = "target_{}".format(lag)
            targets.append(target)
            df_ts[target] = df_target_ts.copy().shift(-1*lag)[target_col]
        
        # use also today feature
        use_today_feature = True
        if use_today_feature:
            if len(trafficKPIs) > 0:
                df_ts[trafficKPIs] = df_traffic_ts[trafficKPIs].copy()
                features += trafficKPIs
            if len(covidKPIs) > 0:
                df_ts[covidKPIs] = df_covid_ts[covidKPIs].copy()
                features += covidKPIs
        
        for lag in lags:
            lag_shift = lag+1
            for col in trafficKPIs:
                feature = "{}_{}".format(col, lag_shift)
                #print(feature, df_traffic_ts.shift(lag_shift).loc[:, col])
                df_ts[feature] = df_traffic_ts.copy().shift(lag_shift).loc[:, col]
                features.append(feature)
            for col in covidKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_covid_ts.copy().shift(lag_shift)[col]
                features.append(feature)
            for col in temperatureKPIs:
                feature = "{}_{}".format(col, lag_shift)
                df_ts[feature] = df_temperature_ts.copy().shift(lag_shift)[col]
                features.append(feature)

        df_ts = df_ts[targets+features]
        df_ts.dropna(subset=features, inplace=True)
        df_ts['Regione'] = region
        df_ts = df_ts.reset_index().set_index(['Date', 'Regione'])
        all_dfs.append(df_ts.copy())
    return pd.concat(all_dfs), targets, features, lags, lags_target

In [41]:
divider_po, divider_so = pd.to_datetime('2020-07-01'), pd.to_datetime('2020-10-01')
start_train_po, end_train_so = pd.to_datetime('2020-03-01'), pd.to_datetime('2020-10-25')
start_po = start_train_po
ranges_train_PO = [pd.date_range(start_po, divider_po)]
ranges_train_SO = [pd.date_range(divider_so, end_train_so)]
last_date_with_unseen = min(df_traffic_predictions.index.get_level_values(0).max(), df_unseen_covid.index.get_level_values(0).max())

last_date = min(last_date_with_unseen, capped_last_date)

ranges_unseen = [pd.date_range(last_date, last_date_with_unseen)]

if len(temperatureKPIs) > 0:
    last_date = min(last_date, df_temperature.index.get_level_values(0).max())
ranges_so = [pd.date_range(divider_so, last_date)]
regions_to_train = regions

min_farsightness = 1
farsightness = 35
delta_features = 7
step_target = 1

ranges_test_SO = [pd.date_range(end_train_so-pd.Timedelta(days=delta_features-1), last_date_with_unseen)]

(df_train_prediction_PO, targets, features, lags, lags_target), (df_train_prediction_SO, _, _, _, _) = \
    build_df_prediction(ranges_train_PO, min_farsightness, farsightness, delta_features, step_target), \
    build_df_prediction(ranges_train_SO, min_farsightness, farsightness, delta_features, step_target)

(df_test_prediction, _, _, _, _) = build_df_prediction(ranges_test_SO, min_farsightness, farsightness, delta_features, step_target)
df_train_prediction = pd.concat([df_train_prediction_PO, df_train_prediction_SO])

df_unseen_prediction = df_test_prediction.loc[df_test_prediction.index.get_level_values('Date')>capped_last_date]
df_test_prediction = df_test_prediction.loc[df_test_prediction.index.get_level_values('Date')<=capped_last_date]

In [74]:
params = {'objective': 'reg:squarederror'}

is_xgb = False
is_polynomial = False
is_lstm = True

features_without_lag = trafficKPIs+covidKPIs+temperatureKPIs
num_features_t = len(features_without_lag)
n_timesteps = len(lags)+1

features = [col for col in df_test_prediction.columns if "target" not in col]
targets = [col for col in df_test_prediction.columns if "target" in col]

print("FEATURES: {}, TARGETS: {}".format(features, targets))

MAX_EPOCHS = 128

def build_mlp(n_timesteps, n_features):
    model = Sequential()
    model.add(LSTM(128, input_shape=(n_timesteps, n_features)))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam')
    return model

def build_lstm_1(n_timesteps, n_features):
    model = Sequential()
    model.add(LSTM(128, input_shape=(n_timesteps, n_features)))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam')
    return model

def build_lstm_2(n_timesteps, n_features):
    model = Sequential()
    model.add(LSTM(128, input_shape=(n_timesteps, n_features)))
    model.add(Dense(32))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam')
    return model

def build_lstm_3(n_timesteps, n_features):
    model = Sequential()
    model.add(LSTM(128, activation='relu', input_shape=(n_timesteps, n_features)))
    model.add(RepeatVector(1))
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(32, activation='relu')))
    model.add(TimeDistributed(Dense(1)))
    model.compile(loss='mse', optimizer='adam')
    return model

def build_lstm_4(n_timesteps, n_features, n_outputs):
    
    model = Sequential()
    model.add(LSTM(128, activation='relu', input_shape=(n_timesteps, n_features)))
    model.add(RepeatVector(n_outputs))
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(32, activation='relu')))
    model.add(TimeDistributed(Dense(1)))

    model.compile(loss=tf.losses.MeanSquaredError(),
                optimizer=tf.optimizers.Adam(),
                metrics=[tf.metrics.MeanAbsoluteError()])
    
    return model

def build_lstm_5(n_timesteps, n_features, n_outputs):
    
    model = Sequential()
    model.add(LSTM(200, activation='relu', input_shape=(n_timesteps, n_features)))
    model.add(RepeatVector(n_outputs))
    model.add(LSTM(100, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(64, activation='relu')))
    model.add(TimeDistributed(Dense(1)))

    model.compile(loss=tf.losses.MeanSquaredError(),
                optimizer=tf.optimizers.Adam(),
                metrics=[tf.metrics.MeanAbsoluteError()])
    
    return model

models_constructors = {
    "EncDecLSTM_DL": partial(build_lstm_3, n_timesteps, num_features_t)
}

def build_model(model_name, n_outputs):
    return build_lstm_5(n_timesteps, num_features_t, n_outputs)
    
model_name = [m for m in models_constructors.keys() if "DL" in m][0]
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

def train_in_interval(interval):
    models_regions = {}
    for region in regions_to_train:
        df_ts = df_train_prediction.loc[df_train_prediction.index.get_level_values(1)==region]
        df_ts = df_ts.reset_index().set_index('Date').drop(columns='Regione')
        df_ts = df_ts.loc[(df_ts.index>=interval[0])&(df_ts.index < interval[1])]
        
        #df_ts_lag = df_ts.copy().drop(columns=[col for col in targets if (col not in features) and (col != target)]).dropna()
        model = build_model(model_name, len(targets))
        #for model_name in models_constructors.keys():
        print("{} for {} with lag ({}-{}): {} -> {}".format(model_name, region, min(lags_target), max(lags_target), min(interval), max(interval)))
        
        size_int = int(df_ts.shape[0]*0.7)
        features_train, features_val = df_ts.iloc[:size_int][features], df_ts.iloc[size_int:][features]
        targets_train, targets_val = df_ts.loc[features_train.index, targets].dropna(), df_ts.loc[features_val.index, targets].dropna()
        
        train_indices = features_train.index.intersection(targets_train.index)
        val_indices = features_val.index.intersection(targets_val.index)
        
        print("{}, {}".format(train_indices.shape, val_indices.shape))
        '''
        features_train, features_val, targets_train, targets_val = \
            features_train.loc[train_indices], features_val.loc[val_indices], \
            targets_train.loc[train_indices], targets_val.loc[val_indices]
        '''
        
        lstm_input_train = features_train.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
        lstm_input_val = features_val.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
        
        print("DBG: {} + {}, {} + {}".format(lstm_input_train.shape, lstm_input_val.shape,targets_train.shape, targets_val.shape))
        
        model.fit(x=lstm_input_train, y=targets_train, epochs=MAX_EPOCHS,
                  validation_data=(lstm_input_val, targets_val),
                  callbacks=[early_stopping])
        models_regions[region] = model
    return models_regions

FEATURES: ['Handover_MA', 'Download vol._MA', 'Upload vol._MA', '#Users_MA', 'R_mean', 'Handover_MA_1', 'Download vol._MA_1', 'Upload vol._MA_1', '#Users_MA_1', 'R_mean_1', 'Handover_MA_2', 'Download vol._MA_2', 'Upload vol._MA_2', '#Users_MA_2', 'R_mean_2', 'Handover_MA_3', 'Download vol._MA_3', 'Upload vol._MA_3', '#Users_MA_3', 'R_mean_3', 'Handover_MA_4', 'Download vol._MA_4', 'Upload vol._MA_4', '#Users_MA_4', 'R_mean_4', 'Handover_MA_5', 'Download vol._MA_5', 'Upload vol._MA_5', '#Users_MA_5', 'R_mean_5', 'Handover_MA_6', 'Download vol._MA_6', 'Upload vol._MA_6', '#Users_MA_6', 'R_mean_6', 'Handover_MA_7', 'Download vol._MA_7', 'Upload vol._MA_7', '#Users_MA_7', 'R_mean_7'], TARGETS: ['target_1', 'target_2', 'target_3', 'target_4', 'target_5', 'target_6', 'target_7', 'target_8', 'target_9', 'target_10', 'target_11', 'target_12', 'target_13', 'target_14', 'target_15', 'target_16', 'target_17', 'target_18', 'target_19', 'target_20', 'target_21', 'target_22', 'target_23', 'target_24

In [75]:
do_train = True

if do_train:
    models_regions = train_in_interval((start_train_po, end_train_so))

EncDecLSTM_DL for Trentino-Alto Adige with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(83,), (36,)
DBG: (83, 8, 5) + (36, 8, 5), (83, 34) + (36, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
EncDecLSTM_DL for Puglia with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(84,), (36,)
DBG: (84, 8, 5) + (36, 8, 5), (84, 34) + (36, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
EncDecLSTM_DL for Emilia-Romagna with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(84,), (37,)
DBG: (84, 8, 5) + (37, 8, 5), (84, 34) + (37, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
EncDecLSTM_DL for Veneto with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:

Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
EncDecLSTM_DL for Lombardia with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(84,), (37,)
DBG: (84, 8, 5) + (37, 8, 5), (84, 34) + (37, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
EncDecLSTM_DL for Abruzzo with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(83,), (36,)
DBG: (83, 8, 5) + (36, 8, 5), (83, 34) + (36, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
EncDecLSTM_DL for Molise with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(83,), (36,)
DBG: (83, 8, 5) + (36, 8, 5), (83, 34) + (36, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
EncDecLSTM_DL for Sicilia with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00

Epoch 10/128
Epoch 11/128
EncDecLSTM_DL for Valle d'Aosta with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(81,), (36,)
DBG: (81, 8, 5) + (36, 8, 5), (81, 34) + (36, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
EncDecLSTM_DL for Calabria with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(81,), (36,)
DBG: (81, 8, 5) + (36, 8, 5), (81, 34) + (36, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
EncDecLSTM_DL for Liguria with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(83,), (36,)
DBG: (83, 8, 5) + (36, 8, 5), (83, 34) + (36, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/12

Epoch 11/128
EncDecLSTM_DL for Friuli Venezia Giulia with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(84,), (37,)
DBG: (84, 8, 5) + (37, 8, 5), (84, 34) + (37, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
EncDecLSTM_DL for Campania with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(84,), (37,)
DBG: (84, 8, 5) + (37, 8, 5), (84, 34) + (37, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
EncDecLSTM_DL for Marche with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(84,), (37,)
DBG: (84, 8, 5) + (37, 8, 5), (84, 34) + (37, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
EncDecLSTM_DL for Basilicata with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(77,), (34,)
DBG: (77, 8, 5) + (34, 8, 5), (77, 34) + (34, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Ep

Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
EncDecLSTM_DL for Umbria with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(84,), (37,)
DBG: (84, 8, 5) + (37, 8, 5), (84, 34) + (37, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
EncDecLSTM_DL for Piemonte with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(84,), (37,)
DBG: (84, 8, 5) + (37, 8, 5), (84, 34) + (37, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128


Epoch 9/128
EncDecLSTM_DL for Toscana with lag (1-34): 2020-03-01 00:00:00 -> 2020-10-25 00:00:00
(84,), (37,)
DBG: (84, 8, 5) + (37, 8, 5), (84, 34) + (37, 34)
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128


In [76]:
train_dates_region_every_n = {}
test_dates_region_every_n = {}
train_dates_region = {}
test_dates_region = {}
df_ts_test_region = {}

# TODO aggiungere validation o no???
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

def test_in_interval(interval):
    print("TEST START")
                
    df_results = pd.DataFrame() # lag, region, prediction, target; index = date
    results_dict = []

    for region in regions_to_train:
        df_ts = df_test_prediction.loc[df_test_prediction.index.get_level_values(1)==region]
        df_ts = df_ts.reset_index().set_index('Date').drop(columns='Regione')
        df_ts = df_ts.loc[(df_ts.index>=interval[0])&(df_ts.index < interval[1])]
        first_date, last_date = df_ts.index.min(), df_ts.index.max()
        current_region_values = []
        
        test_dates_region[region] = pd.date_range(first_date, last_date)
        df_ts_test_region[region] = df_ts
        test_dates = test_dates_region[region].unique()
        assert test_dates_region[region].shape[0]==len(test_dates), "Something wrong"
        #for idx_lag, lag in enumerate(lags_target):
        #    for model_name in models_constructors.keys():
        #target_col = targets[idx_lag]
        #walk_forward_df = df_train_prediction.copy().drop(columns=[col for col in targets if (col not in features) and (col != target_col)]).dropna()

        current_df_train_prediction = df_train_prediction.copy().xs(region, level='Regione')
        walk_forward_df = current_df_train_prediction#.drop(columns=[col for col in targets if (col not in features)]).dropna()
        
        
        current_df_ts = df_ts[features+targets].copy().dropna()
        test_dates = current_df_ts.index
        
        # TODO continuare da qui: ...
        print("{} for {} with lag = ({}-{}): {} -> {}, {}".format(model_name, region, min(lags_target), max(lags_target), min(test_dates), max(test_dates), current_df_ts.shape))
        for i, t in enumerate(test_dates):
            current_df_ts = df_ts.loc[t:t+datetime.timedelta(days=0)]
            X_test_ts = current_df_ts[features]
            
            lstm_input = X_test_ts.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
            predictions = models_regions[region].predict(lstm_input).flatten()

            X_test_fit, y_test_fit = X_test_ts, current_df_ts[targets].values
            
            for i, p in enumerate(predictions):
                lag = lags_target[i]
                target_col = "target_{}".format(lag)
                target_val = y_test_fit[0, i]
                current_result = {"model": model_name,"date": t + datetime.timedelta(days=lag), "lag": lag, "region": region, "prediction": p, "target": target_val}
                results_dict.append(current_result)

            walk_forward_df = walk_forward_df.append(current_df_ts)
            walk_forward_df = walk_forward_df.sort_index()
            
            window_train = 64
            # I keep the last model and update weights using a window (last 30 values)
            model = models_regions[region]
            lstm_input = walk_forward_df[features].to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
            lstm_input = lstm_input[-1*window_train:]
            lstm_target = walk_forward_df[targets].values[-1*window_train:]
            model.fit(lstm_input, lstm_target, epochs=10, batch_size=8, verbose=0)
            models_regions[region] = model
            
    df_results = pd.DataFrame(results_dict)
    df_results = df_results.dropna()
    df_results.set_index(['model', 'date', 'region', 'lag'], inplace=True)
    df_results['error']=(df_results['prediction']-df_results['target']).abs()
    df_results['error_2'] = df_results['error']**2
    return models_regions, df_results

min_date, max_date = df_test_prediction.index.get_level_values(0).min(), df_test_prediction.index.get_level_values(0).max()
if do_train:
    models_regions, df_results = test_in_interval((min_date, max_date))
else:
    path_results = "{}predictions/results_v3.csv".format(saved)
    df_results = pd.read_csv(path_results)
    df_results['date'] = pd.to_datetime(df_results['date'])
    df_results.set_index(['model', 'date', 'region', 'lag'], inplace=True)

TEST START
EncDecLSTM_DL for Trentino-Alto Adige with lag = (1-34): 2020-10-26 00:00:00 -> 2021-01-14 00:00:00, (81, 74)
EncDecLSTM_DL for Puglia with lag = (1-34): 2020-10-26 00:00:00 -> 2021-01-14 00:00:00, (81, 74)
EncDecLSTM_DL for Emilia-Romagna with lag = (1-34): 2020-10-26 00:00:00 -> 2021-01-14 00:00:00, (81, 74)
EncDecLSTM_DL for Veneto with lag = (1-34): 2020-10-26 00:00:00 -> 2021-01-14 00:00:00, (81, 74)
EncDecLSTM_DL for Lombardia with lag = (1-34): 2020-10-26 00:00:00 -> 2021-01-14 00:00:00, (81, 74)
EncDecLSTM_DL for Abruzzo with lag = (1-34): 2020-10-26 00:00:00 -> 2021-01-14 00:00:00, (81, 74)
EncDecLSTM_DL for Molise with lag = (1-34): 2020-10-26 00:00:00 -> 2021-01-14 00:00:00, (81, 74)
EncDecLSTM_DL for Sicilia with lag = (1-34): 2020-10-26 00:00:00 -> 2021-01-14 00:00:00, (81, 74)
EncDecLSTM_DL for Valle d'Aosta with lag = (1-34): 2020-10-26 00:00:00 -> 2021-01-14 00:00:00, (81, 74)
EncDecLSTM_DL for Calabria with lag = (1-34): 2020-10-26 00:00:00 -> 2021-01-14 00:

In [77]:
def build_df_results_groupped(df_results, col_prediction = 'prediction', col_error = 'error_2', col_rmse = 'rmse', col_r2 = 'r2', col_mape = 'mape'):
    try:
        groupped_df = df_results.groupby(level=['model', 'region', 'lag'])
    except:
        print("WARNING: not groupped")
        groupped_df = df_results
    df = pd.DataFrame()
    df[col_rmse] = np.sqrt(groupped_df[col_error].mean())
    df[col_r2]=groupped_df.apply(lambda g: r2_score( g[col_prediction], g['target'] ))
    df[col_mape] = groupped_df.apply(lambda g: np.mean(np.abs((g['target'] - g['prediction']) / g['target'])) * 100)
    return df[[col_rmse, col_r2, col_mape]]

df_results_mean = build_df_results_groupped(df_results)

In [78]:
name_current_save = 'results_v5_LSTM_both'
# v14: da controllare, è con poly ma dopo aver sistemato la new
# v15: senza Rt (poly)
# v16: solo Rt (Poly)
path_results_to_save = "{}predictions/{}".format(saved, name_current_save)
path_results_to_save_models = "{}models/{}".format(saved, name_current_save)
path_results_to_mean_save = "{}_mean".format(path_results_to_save)
path_results_to_model_save = "{}_models".format(path_results_to_save_models)
path_results_to_save_train = "{}_df_train".format(path_results_to_save)
path_results_to_save_test = "{}_df_test".format(path_results_to_save)
path_results_to_save_unseen = "{}_df_unseen".format(path_results_to_save)
print("Saving results...")
df_results.to_csv("{}.csv".format(path_results_to_save))
print("Saving RMSE, R2, MAPE...")
df_results_mean.to_csv("{}.csv".format(path_results_to_mean_save))
print("Saving models...")
for region_name in models_regions.keys():
    #for lag in models_regions[model_name][region_name].keys():
    current_models_dir = '{}/{}/{}/'.format(path_results_to_model_save, model_name, region_name)
    
    current_model = models_regions[region_name]
    try:
        os.makedirs(current_models_dir)
    except:
        pass
    current_model.save("{}".format(current_models_dir))
# TODO salvare i dataset train, test, unseen
print("Saving datasets...")
df_train_prediction.to_csv("{}.csv".format(path_results_to_save_train))
df_test_prediction.to_csv("{}.csv".format(path_results_to_save_test))
df_unseen_prediction.to_csv("{}.csv".format(path_results_to_save_unseen))

print("OK!")

Saving results...
Saving RMSE, R2, MAPE...
Saving models...
INFO:tensorflow:Assets written to: /Users/filipkrasniqi/Documents/Datasets.tmp/traffic-covid/saved/models/results_v5_LSTM_both_models/EncDecLSTM_DL/Trentino-Alto Adige/assets
INFO:tensorflow:Assets written to: /Users/filipkrasniqi/Documents/Datasets.tmp/traffic-covid/saved/models/results_v5_LSTM_both_models/EncDecLSTM_DL/Puglia/assets
INFO:tensorflow:Assets written to: /Users/filipkrasniqi/Documents/Datasets.tmp/traffic-covid/saved/models/results_v5_LSTM_both_models/EncDecLSTM_DL/Emilia-Romagna/assets
INFO:tensorflow:Assets written to: /Users/filipkrasniqi/Documents/Datasets.tmp/traffic-covid/saved/models/results_v5_LSTM_both_models/EncDecLSTM_DL/Veneto/assets
INFO:tensorflow:Assets written to: /Users/filipkrasniqi/Documents/Datasets.tmp/traffic-covid/saved/models/results_v5_LSTM_both_models/EncDecLSTM_DL/Lombardia/assets
INFO:tensorflow:Assets written to: /Users/filipkrasniqi/Documents/Datasets.tmp/traffic-covid/saved/models/

In [79]:
to_check = name_current_save
model_dir = name_current_save+"_models"#'results_v16_ma_poly_after_update_only_covid_models'


path_datasets = "{}predictions/{}".format(saved, to_check)
#path_train = "{}_df_train".format(path_datasets)
path_test = "{}_df_test.csv".format(path_datasets)
path_unseen = "{}_df_unseen.csv".format(path_datasets)

df_test_selected = pd.read_csv(path_test)
# TODO altro??
df_unseen_selected = pd.read_csv(path_unseen)



def init_models_dict(model_dir):
    path_results_dir = join(saved, 'models', model_dir)
    models = os.listdir(path_results_dir)
    models_dict = {}
    for model in models:
        models_dict[model] = {}
        model_path = join(path_results_dir, model)
        regions = os.listdir(model_path)
        for region in regions:
            models_dict[model][region]={}
            region_path = join(model_path, region)
            model_names = sorted([f for f in os.listdir(region_path) if isfile(join(region_path, f))], key=lambda f: int(f.split("_")[1].split(".")[0]))
            for model_name in model_names:
                n_lag = int(model_name.split("_")[1].split(".")[0])
                loaded_model = pickle.load(open(join(region_path, model_name), "rb" ))
                models_dict[model][region][n_lag]=loaded_model
    return models_dict

# reusing same model for prediction
# TODO usare dash per mostrare i 3 casi separati
region='Piemonte'
model_name = 'XGBoost'
is_xgb, is_poly3, is_poly2, is_rf, is_lstm = model_name=="XGBoost", model_name=="Poly3", model_name=="Poly2", model_name=="RandomForest", model_name=="EncDecLSTM"

current_models_regions = init_models_dict(model_dir)
model_per_target = current_models_regions[model_name][region]

farsightness = max(model_per_target.keys())

num_samples_retrain = 120
if is_xgb or is_rf:
    num_samples_retrain = 120
elif is_lstm:
    num_samples_retrain = 80
    
last_date_seen = df_test_selected.index.get_level_values('Date').max()
start_date_unseen = df_test_selected+datetime.timedelta(days=1)

current_region_df = df_unseen_selected.xs(region, level='Regione')
current_region_test_df = df_test_selected.xs(region, level='Regione')

current_train, current_unseen = current_region_test_df, current_region_df

# take only last num_samples_retrain
current_train = current_train.iloc[-num_samples_retrain:]

features_train = current_train[features_selected]
target_series_train = {int(col.split("_")[1]):current_train[col] for col in current_train.columns if "target" in col}
features_unseen = current_unseen[features_selected]
#target_series_unseen = {int(col.split("_")[0]):current_unseen[col] for col in current_unseen.columns if "target" in col}

num_features_t = len([col for col in features_selected if not any(char.isdigit() for char in col)])
n_timesteps = features_train.shape[1]//num_features_t


dates = features_unseen.index
num_dates= len(dates)
predictions, index_predictions = [], []
idx_dates = list(range(0, num_dates, farsightness))
dates_to_return = dates[idx_dates]

fig = go.Figure()

steps_ahead = [f for f in model_per_target.keys() if f <= farsightness]

n_timesteps = 8
num_features_t = len(trafficKPIs)+len(covidKPIs)

min_y, max_y = float("+inf"), float("-inf")

for idx, idx_d in enumerate(idx_dates):
    d = dates[idx_d]
    feature_current = features_unseen.loc[d:d]#.to_numpy()
    #print(feature_current)
    if 'LSTM' in model_name:
        feature_current = feature_current.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
    #print(feature_current.index)
    for step_ahead in steps_ahead:
        current_model = model_per_target[step_ahead]
        #print(feature_current.shape)
        current_prediction = current_model.predict(feature_current.values)
        #print(step_ahead)
        if 'LSTM' in model_name:
            try:
                current_prediction = current_prediction[-1][-1][-1]
            except:
                current_prediction = current_prediction[-1][-1]
            predictions.append(current_prediction)
        else:
            current_prediction = current_prediction[-1]
            predictions.append(current_prediction)
        #print("{}: {}".format(step_ahead, current_prediction))
        index_predictions.append(d + datetime.timedelta(days=step_ahead))
    # TODO retrain!!! Simulate that
    if idx < len(idx_dates)-1:
        print("RETRAIN!!!")
        # I can retrain
        next_d = d+datetime.timedelta(days=steps_ahead[-1])
        current_train_df = current_region_df.loc[:next_d]
        
        current_train_df = current_train_df.iloc[-num_samples_retrain:]
        current_features_train = current_train_df[features_selected]
        
        for key in model_per_target.keys():
            model = model_per_target[key]
            features_at_day_d = current_region_df.loc[:d]
            target_series_train = {int(col.split("_")[1]):current_train_df[col] for col in current_train_df.columns if "target" in col}
            current_features_train, target_series_train[key] = current_features_train.dropna(), target_series_train[key].dropna()
            common_idxs = current_features_train.index.intersection(target_series_train[key].index)
            current_features_train = current_features_train.loc[common_idxs]
            target_series_train[key] = target_series_train[key].loc[common_idxs]
            if is_lstm:
                
                size_int = int(current_features_train.shape[0]*0.7)
                current_features_train_1, current_features_train_2 = current_features_train.iloc[:size_int], current_features_train.iloc[size_int:]
                current_target_train, current_target_val = target_series_train[key].loc[current_features_train_1.index], target_series_train[key].loc[current_features_train_2.index]
                lstm_input_train = current_features_train_1.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
                lstm_input_val = current_features_train_2.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')

                history = model.fit(x=lstm_input_train, y=current_target_train, epochs=MAX_EPOCHS, verbose=0,
                                  validation_data=(lstm_input_val, current_target_val),
                                  callbacks=[early_stopping])
                
                #callback = EarlyStopping(monitor='loss', patience=3)
                #model.fit(lstm_input, target_series_train[key], epochs=100, batch_size=32, verbose=0, callbacks=[callback])
            else:
                model.fit(current_features_train.values, target_series_train[key])

series_predictions = pd.Series(data=predictions, index=index_predictions)
series_target = df_unseen_covid.xs(region, level='Regione').loc[start_date_unseen:]['R_mean']


min_y = min(series_predictions.min(), series_target.min())
max_y = max(series_predictions.max(), series_target.max())

for d in [dates[i] for i in idx_dates]:
    fig.add_trace(go.Scatter(
        line = dict(color="black", width=2, dash='dash'),
        marker_size=1,
        x=[d, d],
        y=[min_y, max_y],
        showlegend=False
    ))       

#series_target.plot()
#series_predictions.plot()


fig.add_trace(go.Scatter(
        line = dict(color="orange", width=1),
        marker_size=1,
        x=series_predictions.index,
        y=series_predictions,
        showlegend=False
    ))  

fig.add_trace(go.Scatter(
        line = dict(color="red", width=1),
        marker_size=1,
        x=series_target.index,
        y=series_target,
        showlegend=False
    ))

ValueError: invalid literal for int() with base 10: 'model'

In [None]:
# qui si visualizza il corrente
region='Lombardia'
model_name = 'Poly2'
is_xgb, is_poly3, is_poly2, is_rf, is_lstm = model_name=="XGBoost", model_name=="Poly3", model_name=="Poly2", model_name=="RandomForest", model_name=="EncDecLSTM"

num_samples_retrain = 45
if is_xgb or is_rf:
    num_samples_retrain = 60
elif is_lstm:
    num_samples_retrain = 80

last_date_seen = pd.to_datetime('2021-01-15')
start_date_unseen = last_date_seen+datetime.timedelta(days=1)

current_region_df = df_test_prediction.xs(region, level='Regione')

current_train, current_unseen = current_region_df.loc[:last_date_seen], current_region_df.loc[start_date_unseen:]

# take only last num_samples_retrain
current_train = current_train.iloc[-num_samples_retrain:]

features_train = current_train[features]
target_series_train = {int(col.split("_")[1]):current_train[col] for col in current_train.columns if "target" in col}
features_unseen = current_unseen[features]
#target_series_unseen = {int(col.split("_")[0]):current_unseen[col] for col in current_unseen.columns if "target" in col}

num_features_t = len(trafficKPIs)
n_timesteps = features_train.shape[1]//num_features_t

In [None]:
features_train

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min', restore_best_weights=True)

if is_poly2:
    model_per_target = {key: make_pipeline(PolynomialFeatures(2), Ridge()) for key in target_series_train.keys()}
elif is_poly3:
    model_per_target = {key: make_pipeline(PolynomialFeatures(3), Ridge()) for key in target_series_train.keys()}
elif is_rf:
    model_per_target = {key: RandomForestRegressor() for key in target_series_train.keys()}
elif is_xgb:
    model_per_target = {key: XGBRegressor() for key in target_series_train.keys()}
elif is_lstm:
    model_per_target = {key: build_lstm_3(n_timesteps, num_features_t) for key in target_series_train.keys()}
for key in model_per_target.keys():
    model = model_per_target[key]
    current_features_train = features_train.copy()
    current_features_train, target_series_train[key] = current_features_train.dropna(), target_series_train[key].dropna()
    current_features_train = current_features_train.loc[target_series_train[key].index]
    target_series_train[key] = target_series_train[key].loc[current_features_train.index]
    if is_lstm:
        size_int = int(current_features_train.shape[0]*0.7)
        current_features_train_1, current_features_train_2 = current_features_train.iloc[:size_int], current_features_train.iloc[size_int:]
        current_target_train, current_target_val = target_series_train[key].loc[current_features_train_1.index], target_series_train[key].loc[current_features_train_2.index]
        lstm_input_train = current_features_train_1.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
        lstm_input_val = current_features_train_2.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
        
        model.compile(loss=tf.losses.MeanSquaredError(),
                    optimizer=tf.optimizers.Adam(learning_rate=1e-3),
                    metrics=[tf.metrics.MeanAbsoluteError()])
        
        MAX_EPOCHS = 128
        
        history = model.fit(x=lstm_input_train, y=current_target_train, epochs=MAX_EPOCHS,
                          validation_data=(lstm_input_val, current_target_val),
                          callbacks=[early_stopping])
        
    else:
        model.fit(current_features_train, target_series_train[key])

In [None]:
farsightness = max(model_per_target.keys())

dates = features_unseen.index
num_dates= len(dates)
predictions, index_predictions = [], []
idx_dates = list(range(0, num_dates, farsightness))
dates_to_return = dates[idx_dates]

fig = go.Figure()

steps_ahead = [f for f in model_per_target.keys() if f <= farsightness]

n_timesteps = 8
num_features_t = len(trafficKPIs)

min_y, max_y = float("+inf"), float("-inf")

use_already_trained = True
if use_already_trained:
    model_per_target = models_regions['Poly1']['Lombardia']

for idx, idx_d in enumerate(idx_dates):
    d = dates[idx_d]
    feature_current = features_unseen.loc[d:d]#.to_numpy()
    #print(feature_current)
    if 'LSTM' in model_name:
        feature_current = feature_current.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
    #print(feature_current.index)
    for step_ahead in steps_ahead:
        current_model = model_per_target[step_ahead]
        #print(feature_current.shape)
        current_prediction = current_model.predict(feature_current)
        #print(step_ahead)
        if 'LSTM' in model_name:
            try:
                current_prediction = current_prediction[-1][-1][-1]
            except:
                current_prediction = current_prediction[-1][-1]
            predictions.append(current_prediction)
        else:
            current_prediction = current_prediction[-1]
            predictions.append(current_prediction)
        #print("{}: {}".format(step_ahead, current_prediction))
        index_predictions.append(d + datetime.timedelta(days=step_ahead))
    # TODO retrain!!! Simulate that
    if idx < len(idx_dates)-1:
        print("RETRAIN!!!")
        # I can retrain
        next_d = d+datetime.timedelta(days=steps_ahead[-1])
        current_train_df = current_region_df.loc[:next_d]
        
        current_train_df = current_train_df.iloc[-num_samples_retrain:]
        current_features_train = current_train_df[features]
        
        for key in model_per_target.keys():
            model = model_per_target[key]
            features_at_day_d = current_region_df.loc[:d]
            target_series_train = {int(col.split("_")[1]):current_train_df[col] for col in current_train_df.columns if "target" in col}
            current_features_train, target_series_train[key] = current_features_train.dropna(), target_series_train[key].dropna()
            current_features_train = current_features_train.loc[target_series_train[key].index]
            target_series_train[key] = target_series_train[key].loc[current_features_train.index]
            if is_lstm:
                
                size_int = int(current_features_train.shape[0]*0.7)
                current_features_train_1, current_features_train_2 = current_features_train.iloc[:size_int], current_features_train.iloc[size_int:]
                current_target_train, current_target_val = target_series_train[key].loc[current_features_train_1.index], target_series_train[key].loc[current_features_train_2.index]
                lstm_input_train = current_features_train_1.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')
                lstm_input_val = current_features_train_2.to_numpy().reshape(-1, n_timesteps, num_features_t, order='C')

                history = model.fit(x=lstm_input_train, y=current_target_train, epochs=MAX_EPOCHS, verbose=0,
                                  validation_data=(lstm_input_val, current_target_val),
                                  callbacks=[early_stopping])
                
                #callback = EarlyStopping(monitor='loss', patience=3)
                #model.fit(lstm_input, target_series_train[key], epochs=100, batch_size=32, verbose=0, callbacks=[callback])
            else:
                model.fit(features_train_df, target_series_train[key])     

series_predictions = pd.Series(data=predictions, index=index_predictions)
series_target = df_covid_predictions.xs(region, level='Regione').loc[start_date_unseen:]['R_mean']


min_y = min(series_predictions.min(), series_target.min())
max_y = max(series_predictions.max(), series_target.max())

for d in [dates[i] for i in idx_dates]:
    fig.add_trace(go.Scatter(
        line = dict(color="black", width=2, dash='dash'),
        marker_size=1,
        x=[d, d],
        y=[min_y, max_y],
        showlegend=False
    ))       

#series_target.plot()
#series_predictions.plot()

fig.add_trace(go.Scatter(
        line = dict(color="orange", width=1),
        marker_size=1,
        x=series_predictions.index,
        y=series_predictions,
        showlegend=False
    ))  

fig.add_trace(go.Scatter(
        line = dict(color="red", width=1),
        marker_size=1,
        x=series_target.index,
        y=series_target,
        showlegend=False
    ))