In [5]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from jupyter_dash import JupyterDash

import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import os
from plotly.subplots import make_subplots
from datetime import date, datetime
import geojson
import copy

# TODO 1) more KPIs for covid X (Rt index???)
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine with this layout.
data_path = "/Users/filipkrasniqi/Documents/Datasets.tmp/traffic-covid/"
# All paths below are built with str.format, so every directory string must end with "/".
saved = "{}saved/".format(data_path)
traffic_daily = "{}TS_1800_daily.pkl".format(saved)
covid = "{}covid/".format(data_path)
covid_daily = "{}covid_rt.csv".format(saved)#"{}covid_regioni.csv".format(covid)
region_traffic_daily = "{}all.pkl".format(saved)
by_region_path = "{}By_Region/".format(data_path)

# Column subsets for the COVID and traffic frames. `new_traffic_cols` gives, position by position,
# the display name that replaces each raw name in `traffic_cols`.
covid_cols = ['Date', 'Regione', 'terapia_intensiva', 'nuovi_positivi', 'tamponi_giornalieri', 'totale_casi', 'deceduti', 'totale_casi_giornalieri', 'terapia_intensiva_giornalieri', 'R_mean']
traffic_cols = ['Date', 'Regione', 'Hin_Succ', 'DL_VOL', 'UL_VOL', 'USERNUM_AVG']
new_traffic_cols = ['Date', 'Regione', 'Handover', 'Download vol.', 'Upload vol.', '#Users']
# Region spellings found in the traffic data (after str.capitalize()) -> spellings used in the rest of the notebook.
regions_rename_traffic = {'Abruzzi': 'Abruzzo', 'Emilia-romagna': 'Emilia Romagna', \
                          'Friuli-venezia giulia': 'Friuli Venezia Giulia', 'Valle d`aosta': "Valle d'Aosta",\
                         'Trentino-alto adige': "Trentino-Alto Adige"}

# KPIs that sumRegions() adds together when it merges two regions into one.
# NOTE(review): 'R_mean' is already part of to_sum_KPIs, so covidKPIs contains it twice — confirm this is intended.
to_sum_KPIs = ['totale_casi_giornalieri', 'terapia_intensiva_giornalieri', 'terapia_intensiva', 'nuovi_positivi', 'tamponi_giornalieri', 'R_mean']
covidKPIs = ['R_mean']+to_sum_KPIs
# Traffic KPIs offered in the dashboards (renamed column names).
trafficKPIs = ['Handover', 'Download vol.', 'Upload vol.', '#Users']

# Build the daily traffic frame.
#   NEW_DATA=True -> use the per-region "LTE_1800" pickles found in `by_region_path`;
#   FORCE=True    -> ignore the cached result (`region_traffic_daily`) and rebuild it.
NEW_DATA, FORCE = True, True
if NEW_DATA:
    if not FORCE and os.path.isfile(region_traffic_daily):
        df_traffic_daily = pd.read_pickle(region_traffic_daily)
    else:
        # Only the files directly inside by_region_path are considered (first os.walk entry).
        _, _, filenames = next(os.walk(by_region_path))
        dfs = []
        for filename in filenames:
            if "LTE_1800" in filename:
                try:
                    df = pd.read_pickle("{}{}".format(by_region_path, filename))
                    # The region name is the third "_"-separated token of the file name.
                    df['Regione'] = filename.split("_")[2]
                    df['USERNUM_AVG'] = pd.to_numeric(df['USERNUM_AVG'], errors='coerce')
                    dfs.append(df)
                except Exception as exc:
                    # Best effort: a broken file is skipped, but the reason is reported
                    # instead of being silently swallowed.
                    print("ERRORE: {} ({!r})".format(filename, exc))

        if not dfs:
            raise FileNotFoundError(
                "no readable LTE_1800 file found in {}".format(by_region_path))

        df_traffic_daily = pd.concat(dfs)
        df_traffic_daily['Regione'] = df_traffic_daily['Regione'].str.capitalize()

        # Raw column names -> display names, in a single rename.
        df_traffic_daily = df_traffic_daily.rename(columns=dict(zip(traffic_cols, new_traffic_cols)))

        # Harmonise region spellings with the ones used by the COVID data.
        df_traffic_daily['Regione'] = df_traffic_daily['Regione'].replace(regions_rename_traffic)

        df_traffic_daily.to_pickle(region_traffic_daily)
else:
    df_traffic_daily = pd.read_pickle(traffic_daily)
    map_city_regione = {"MILANO": "Lombardia", "BERGAMO": "Lombardia", "NAPOLI": "Campania", "ROMA": "Lazio", "TORINO": "Piemonte"}

    columns = ['data', 'denominazione_regione','denominazione_provincia','sigla_provincia','lat','long','totale_casi']
    if "Regione" not in df_traffic_daily.columns:
        # assign each city (COMUNE) to its region
        df_traffic_daily['Regione'] = df_traffic_daily['COMUNE'].apply(lambda comune: map_city_regione[comune])

df_covid = pd.read_csv(covid_daily)

# sums regions such as trento + bolzano
def sumRegions(df, dateCol = 'Date', regionCol='Regione', cols = to_sum_KPIs, region1 = "Bolzano", region2 = "Trento", regionNew = "Trentino-Alto Adige"):
    """Merge two regions into one by adding their KPIs date by date.

    Parameters
    ----------
    df : DataFrame holding at least `dateCol`, `regionCol` and every column in `cols`.
    dateCol : column used to align the two regions.
    regionCol : column holding the region name.
    cols : columns to add together (defaults to the module-level `to_sum_KPIs`).
    region1, region2 : names of the regions to merge.
    regionNew : name given to the merged region.

    Returns
    -------
    A new DataFrame without the rows of `region1`/`region2` and with the summed rows
    appended. The appended rows only carry `dateCol`, `cols` and `regionCol`; a date
    present for just one of the two regions yields NaN (index-aligned addition).
    `df` itself is not modified.
    """
    cols = list(cols)
    dfRegion1 = df.loc[df[regionCol] == region1].set_index(dateCol)
    dfRegion2 = df.loc[df[regionCol] == region2].set_index(dateCol)
    newVals = (dfRegion1[cols] + dfRegion2[cols]).reset_index()
    newVals[regionCol] = regionNew
    others = df.loc[(df[regionCol] != region1) & (df[regionCol] != region2)]
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    return pd.concat([others, newVals])

# adds italy as cumulative over days
def addItalyData(df, cols):
    """Append nation-wide rows labelled 'Italia' to `df`.

    The rows of `df` are summed per calendar day (resampling on the 'Date' column),
    tagged with Regione='Italia', restricted to `cols` and concatenated below the
    original rows. `df` is returned unchanged at the top of the result.
    """
    national = df.resample('D', on='Date').sum().reset_index()
    national['Regione'] = 'Italia'
    return pd.concat([df, national[cols]])

def map_geojson(region):
    """Translate a geojson region name into the spelling used by the data frames.

    Only the four names that differ are known; any other name raises KeyError.
    """
    translations = dict([
        ("Valle d'Aosta/Vallée d'Aoste", "Valle d'Aosta"),
        ("Trentino-Alto Adige/Südtirol", "Trentino-Alto Adige"),
        ("Friuli-Venezia Giulia", "Friuli Venezia Giulia"),
        ("Emilia-Romagna", "Emilia Romagna"),
    ])
    return translations[region]

# Convert the raw 'data' column from string to datetime and expose it as 'Date'.
# An explicit column check replaces the former bare `except:`, which also hid
# genuine parsing errors behind the "ALREADY OK" message.
if 'data' in df_covid.columns:
    df_covid['data'] = pd.to_datetime(df_covid['data'])
    df_covid = df_covid.rename(columns={'data': 'Date'})
else:
    print("ALREADY OK")

df_covid = sumRegions(df_covid)

# From here on the traffic frame uses the renamed columns.
traffic_cols = new_traffic_cols

ALREADY OK


In [6]:
# Re-export the pickled prediction frames as CSV, next to the pickles in `<saved>predictions/`.
# NOTE(review): read_pickle executes arbitrary code from the file — only load pickles you produced yourself.
path_traffic_predictions="{}predictions/traffic.pkl".format(saved)
path_traffic_predictions_csv="{}predictions/traffic.csv".format(saved)
df_traffic_predictions_temp = pd.read_pickle(path_traffic_predictions)
df_traffic_predictions_temp.to_csv(path_traffic_predictions_csv)

# Same round trip for the COVID predictions.
path_covid_predictions="{}predictions/covid.pkl".format(saved)
path_covid_predictions_csv="{}predictions/covid.csv".format(saved)
df_covid_predictions = pd.read_pickle(path_covid_predictions)
df_covid_predictions.to_csv(path_covid_predictions_csv)

In [9]:
# Preview of the traffic predictions frame (rich display; pandas truncates long frames).
df_traffic_predictions_temp

Unnamed: 0_level_0,Unnamed: 1_level_0,RRC_S_Succ,RRC_S_Att,RRC_S_SR,RRC_RE_Succ,RRC_RE_Att,IntraF_Hout_Succ,IntraF_Hout_Att,InterF_Hout_Succ,InterF_Hout_Att,Handover,...,InterR_HO_OUT_E2G_Att,Download vol.,Upload vol.,#Users,ERAB_S_Succ,ERAB_S_Att,Handover_MA,Download vol._MA,Upload vol._MA,#Users_MA
Date,Regione,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-01-01,Abruzzo,722914109.0,723559625.0,2399.867376,2109077.0,2755617.0,69768501.0,69410898.0,29058705.0,29152448.0,87240956.0,...,41672.0,2.892451e+15,2.471538e+14,3.184160e+06,,,,,,
2020-01-02,Abruzzo,718452155.0,718983916.0,2399.873855,2131711.0,2792486.0,84230949.0,83904966.0,28343453.0,28429081.0,103161979.0,...,54111.0,2.675922e+15,2.237421e+14,3.220258e+06,,,,,,
2020-01-03,Abruzzo,732998734.0,733541088.0,2399.825795,2155915.0,2846211.0,86249305.0,85912857.0,29106383.0,29189094.0,105738844.0,...,55830.0,2.691140e+15,2.226775e+14,3.271108e+06,,,,,,
2020-01-04,Abruzzo,729004305.0,729571151.0,2399.844823,2116294.0,2829792.0,84713582.0,84381441.0,29279339.0,29355363.0,104350340.0,...,47662.0,2.647715e+15,2.187153e+14,3.189687e+06,,,,,,
2020-01-05,Abruzzo,706629792.0,707240933.0,2399.768004,2272802.0,2936730.0,79952197.0,79642735.0,28033160.0,28114250.0,98488811.0,...,35454.0,2.735595e+15,2.249541e+14,3.108692e+06,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-24,Veneto,225596711.0,225693675.0,2400.000000,534582.0,768383.0,7189438.0,7223154.0,8989396.0,9086800.0,24560829.0,...,22754.0,1.227038e+12,1.239326e+11,1.148751e+06,406781583.0,406840893.0,2.263051e+07,1.268557e+12,1.193639e+11,1.108370e+06
2021-02-25,Veneto,226892106.0,226990072.0,2400.000000,544697.0,778281.0,7290843.0,7323795.0,9044861.0,9143584.0,25162624.0,...,22194.0,1.265358e+12,1.239708e+11,1.155531e+06,408781539.0,408834867.0,2.281185e+07,1.273790e+12,1.198096e+11,1.111068e+06
2021-02-26,Veneto,228933364.0,229031967.0,2400.000000,534915.0,771242.0,7482217.0,7517542.0,9205983.0,9307026.0,25735694.0,...,21704.0,1.294111e+12,1.227355e+11,1.155699e+06,412255006.0,412310286.0,2.297402e+07,1.281229e+12,1.200135e+11,1.113032e+06
2021-02-27,Veneto,214647857.0,214744605.0,2400.000000,462945.0,684452.0,6555173.0,6585317.0,8301996.0,8401086.0,21873387.0,...,13830.0,1.317346e+12,1.129490e+11,1.054649e+06,386189271.0,386234427.0,2.314916e+07,1.283160e+12,1.205004e+11,1.114277e+06


In [7]:
# One row per (Regione, calendar day): every column of the traffic frame is summed within the day.
df_td_regione = df_traffic_daily.groupby('Regione').resample('D', on='Date').sum().reset_index()

In [8]:
# Check the date column produced by the daily resampling.
df_td_regione['Date']

0      2020-01-01
1      2020-01-02
2      2020-01-03
3      2020-01-04
4      2020-01-05
          ...    
8495   2021-02-24
8496   2021-02-25
8497   2021-02-26
8498   2021-02-27
8499   2021-02-28
Name: Date, Length: 8500, dtype: datetime64[ns]

## Filter in the wanted range of dates

In [203]:
# "PO" window (presumably the first wave — confirm): traffic 2020-03-01..2020-07-31,
# COVID 2020-03-01..2020-08-01, bounds included.
df_traffic_daily_PO = df_td_regione.loc[df_td_regione.Date.between('2020-03-01', '2020-07-31')]
df_covid_PO = df_covid.loc[df_covid.Date.between('2020-03-01', '2020-08-01')]

In [204]:
# "SO" window (presumably the second wave — confirm): traffic 2020-09-01..2020-12-31,
# COVID 2020-09-01..2021-01-01, bounds included.
df_traffic_daily_SO = df_td_regione.loc[df_td_regione.Date.between('2020-09-01', '2020-12-31')]
df_covid_SO = df_covid.loc[df_covid.Date.between('2020-09-01', '2021-01-01')]

## Filter covid data for available traffic regions

In [226]:
# Regions that have traffic data in BOTH date windows.
regions = set(df_traffic_daily_SO.Regione.unique()).intersection(df_traffic_daily_PO.Regione.unique())
print(regions)
def filterByTrafficRegions(df_covid, regions):
    """Keep only the COVID rows whose region is listed in `regions`.

    Parameters
    ----------
    df_covid : DataFrame with the region name in 'Regione' or, for raw data,
        in 'denominazione_regione' (renamed to 'Regione' in the result).
    regions : iterable of region names to keep; an empty iterable yields an empty frame.

    Returns
    -------
    A new, independent DataFrame. The input frame is not modified.
    """
    renamed = df_covid.rename(columns={'denominazione_regione': 'Regione'})
    # isin() avoids building a query string, which broke on an empty region set
    # and depended on how quotes inside region names were parsed.
    return renamed.loc[renamed['Regione'].isin(list(regions))].copy()

# Restrict both COVID windows to the regions that also have traffic data.
df_covid_PO = filterByTrafficRegions(df_covid_PO, regions)
df_covid_SO = filterByTrafficRegions(df_covid_SO, regions)

{"Valle d'Aosta", 'Veneto', 'Toscana', 'Piemonte', 'Molise', 'Abruzzo', 'Puglia', 'Marche', 'Basilicata', 'Sardegna', 'Trentino-Alto Adige', 'Lombardia', 'Sicilia', 'Liguria', 'Calabria', 'Emilia Romagna', 'Campania', 'Umbria', 'Lazio', 'Friuli Venezia Giulia'}


In [206]:
# Make sure 'Date' is a datetime column in both COVID frames (the later resampling on 'Date' needs it).
df_covid_PO['Date'], df_covid_SO['Date'] = pd.to_datetime(df_covid_PO['Date']), pd.to_datetime(df_covid_SO['Date'])

## Selecting columns and adding cumulative data

In [207]:
# NOTE: this cell is disabled — the code lives inside a string literal, so nothing below runs.
# It selects the KPI columns and adds nation-wide 'Italia' rows (sums, with a mean for R_mean).
'''
df_traffic_daily_SO, df_covid_SO, df_traffic_daily_PO, df_covid_PO = df_traffic_daily_SO[traffic_cols],\
    df_covid_SO[covid_cols], df_traffic_daily_PO[traffic_cols], df_covid_PO[covid_cols]

def addItalyData(df, cols, traffic = False, aggr = None):
    if aggr is None:
        aggr = {col: 'sum' for col in cols if "mean" not in col}#, 'tamb': np.mean}
        if not traffic:
            aggr['R_mean'] = 'mean'
        
    dfTemp = df.resample('D', on='Date').agg(aggr).reset_index()
    dfTemp['Regione']='Italia'
    dfTemp = dfTemp[cols]
    df = df.reset_index()
    return pd.concat([df, dfTemp])

df_traffic_daily_PO, df_traffic_daily_SO = addItalyData(df_traffic_daily_PO, trafficKPIs, traffic=True), \
    addItalyData(df_traffic_daily_SO, trafficKPIs, traffic=True)
df_covid_PO, df_covid_SO = addItalyData(df_covid_PO, covidKPIs),\
    addItalyData(df_covid_SO, covidKPIs)
    '''

'\ndf_traffic_daily_SO, df_covid_SO, df_traffic_daily_PO, df_covid_PO = df_traffic_daily_SO[traffic_cols],    df_covid_SO[covid_cols], df_traffic_daily_PO[traffic_cols], df_covid_PO[covid_cols]\n\ndef addItalyData(df, cols, traffic = False, aggr = None):\n    if aggr is None:\n        aggr = {col: \'sum\' for col in cols if "mean" not in col}#, \'tamb\': np.mean}\n        if not traffic:\n            aggr[\'R_mean\'] = \'mean\'\n        \n    dfTemp = df.resample(\'D\', on=\'Date\').agg(aggr).reset_index()\n    dfTemp[\'Regione\']=\'Italia\'\n    dfTemp = dfTemp[cols]\n    df = df.reset_index()\n    return pd.concat([df, dfTemp])\n\ndf_traffic_daily_PO, df_traffic_daily_SO = addItalyData(df_traffic_daily_PO, trafficKPIs, traffic=True),     addItalyData(df_traffic_daily_SO, trafficKPIs, traffic=True)\ndf_covid_PO, df_covid_SO = addItalyData(df_covid_PO, covidKPIs),    addItalyData(df_covid_SO, covidKPIs)\n    '

In [208]:
# Positivity rate: new positives divided by the swabs of the same row.
# Rows with zero swabs give inf/NaN here; fixMissingCovid() treats values >= 1, < 0 or NaN as missing.
df_covid_PO['%pos'] = (df_covid_PO['nuovi_positivi']/df_covid_PO['tamponi_giornalieri'])
df_covid_SO['%pos'] = (df_covid_SO['nuovi_positivi']/df_covid_SO['tamponi_giornalieri'])

In [209]:
# handle wrong and missing values (NaN or >= 1) -> ((t-1)+(t+1))/2
def fixMissingCovid(df, col="%pos", groupCol="Regione"):
    """Replace missing or impossible values of `col` by linear interpolation.

    A value is treated as missing when it is NaN, >= 1 or < 0. Interpolation runs
    separately inside each `groupCol` group (when that column exists), so a gap is
    never filled from the rows of a different region. Rows are used in their existing
    order, which is assumed to be chronological within each group.

    Parameters
    ----------
    df : input DataFrame; it is not modified.
    col : column to repair.
    groupCol : grouping column; pass None (or a name that is not a column) to
        interpolate over the whole frame as a single series.

    Returns
    -------
    A new DataFrame without the rows that are still NaN after interpolation
    (leading gaps, which have no earlier value to interpolate from).
    """
    fixed = df.copy()
    # Work positionally: the frame may carry duplicate index labels after concatenations.
    values = fixed[col].reset_index(drop=True)
    invalid = values.isna() | (values >= 1) | (values < 0)
    values = values.where(~invalid)

    if groupCol is not None and groupCol in fixed.columns:
        groups = fixed[groupCol].reset_index(drop=True)
        values = (
            values.groupby(groups, sort=False)
            .transform(lambda s: s.interpolate(method='linear'))
            .reindex(values.index)
        )
    else:
        values = values.interpolate(method='linear')

    fixed[col] = values.to_numpy()
    return fixed.loc[fixed[col].notna()]
    
# Repair the positivity rate in both COVID windows.
df_covid_PO = fixMissingCovid(df_covid_PO)
df_covid_SO = fixMissingCovid(df_covid_SO)

In [210]:
# Dashboard 1: traffic KPI vs COVID KPI over time (or as a scatter plot), per region.
# The component ids declared here are the ones the callback of this app listens to.
regionsToShow = list(regions)
#regionsToShow.append("Italia")

# `name` doubles as the app name and as the id of the dcc.Graph that receives the figure.
name = "TRAFFIC_VS_COVID"
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app_timeseries = JupyterDash(name, external_stylesheets=external_stylesheets)

app_timeseries.layout = html.Div([
# Daily values or weekly aggregation.
html.Label(
    
[
    "Day/Week",
    dcc.Dropdown(id="dayOrWeek",
                 options=[{"label": x, "value": x} for x in ['Day', 'Week']],
                value='Day',
                clearable=False)
]),
    # NOTE(review): the initial end_date (2021-12-31) lies after max_date_allowed (2020-12-31) — confirm.
    html.Label(
        [
            "Date Picker: ",
            dcc.DatePickerRange(
                id='date-picker-range',
                min_date_allowed=date(2020, 3, 1),
                max_date_allowed=date(2020, 12, 31),
                start_date=date(2020, 3, 1),
                end_date=date(2021, 12, 31)
            ),
        ]
    ),
html.Label(
    [
        "TRAFFIC",
        dcc.Dropdown(id="kpi1",
                     options=[{"label": x, "value": x} for x in trafficKPIs],
                    value=trafficKPIs[0],
                    clearable=False)
    ]),
html.Label(
    [
        "COVID",
        dcc.Dropdown(id="kpi2",
                     options=[{"label": x, "value": x} for x in covidKPIs],
                    value=covidKPIs[0],
                    clearable=False)
    ]),
# Multi-select: the initial value is a single string, later selections arrive as a list.
html.Label(
    [
        "Regione",
        dcc.Dropdown(id="regions",
                     options=[{"label": x, "value": x} for x in regionsToShow],
                    value="Lombardia",
                    multi=True,
                    clearable=True)
    ],
),
    # Left empty by default, so the callback receives None until the user types a number.
    html.Label(
                ["Rolling average window ",
                html.Br(),
                dcc.Input(
                id='rolling_avg',
                type='number',
#                 value=1
                )]
    ),
    # Shift applied to the COVID series only; also None until filled in.
    html.Label(
                ["Shift COVID",
                html.Br(),
                dcc.Input(
                id='shift',
                type='number',
                )]
    ),
    html.Br(),
html.Label(
    [
        "Scatterplot",
        dcc.Dropdown(id="scatterplot",
                     options=[{"label": x, "value": x} for x in ["Yes", "No"]],
                    value="No",
                    clearable=False)
    ]),
html.Div(dcc.Graph(id=name))])

@app_timeseries.callback(
Output(name, "figure"), 
[
     Input('date-picker-range', 'start_date'),
     Input('date-picker-range', 'end_date'),Input("dayOrWeek", "value"), Input("kpi1", "value"), Input("kpi2", "value"), Input("regions", "value"), Input("rolling_avg", "value"), Input("shift", "value"), Input("scatterplot", "value")])
def display_map_period(start_date, end_date, dayOrWeek, trafficKPI, covidKPI, regions, roll_avg, shift_amount, scatter):
    """Build the traffic-vs-COVID figure for the selected regions.

    Parameters
    ----------
    start_date, end_date : ISO date strings from the date picker, or None when cleared
        (a missing bound is simply not applied).
    dayOrWeek : "Day" for daily values, anything else for weekly (W-Mon) sums.
    trafficKPI, covidKPI : column names to plot.
    regions : a region name, a list of names, or None (falls back to ['Italia']).
    roll_avg : rolling-mean window; None or <= 0 means no smoothing.
    shift_amount : number of days the COVID series is shifted; None means 0.
    scatter : "Yes" for a normalised traffic-vs-COVID scatter, otherwise two time
        series on separate y axes.

    Returns
    -------
    plotly Figure. The module-level data frames are never modified.
    """
    start_date = pd.to_datetime(start_date) if start_date is not None else None
    end_date = pd.to_datetime(end_date) if end_date is not None else None

    if dayOrWeek != "Day":
        # NOTE(review): weekly values are sums, including ratio-like KPIs such as R_mean or %pos.
        def weekly(df):
            return df.groupby('Regione').resample('W-Mon', on='Date').sum().reset_index()

        df_covid_1, df_covid_2 = weekly(df_covid_PO), weekly(df_covid_SO)
        df_traffic_1, df_traffic_2 = weekly(df_traffic_daily_PO), weekly(df_traffic_daily_SO)
    else:
        df_covid_1, df_covid_2 = df_covid_PO, df_covid_SO
        df_traffic_1, df_traffic_2 = df_traffic_daily_PO, df_traffic_daily_SO

    df_covid_2 = df_covid_2.loc[df_covid_2.Date > df_traffic_2.Date.min()]

    # Discard non-positive / missing traffic values by filtering. The shared frames are
    # left untouched, so choosing one KPI no longer removes rows for every later request.
    df_traffic_1 = df_traffic_1.loc[df_traffic_1[trafficKPI] > 0]
    df_traffic_2 = df_traffic_2.loc[df_traffic_2[trafficKPI] > 0]

    if isinstance(regions, str):
        regions = [regions]

    if regions is None:
        regions = ['Italia']

    roll_avg = 1 if roll_avg is None or roll_avg <= 0 else int(roll_avg)
    shift_amount = 0 if shift_amount is None else int(shift_amount)

    if scatter == "Yes":
        fig = make_subplots(specs=[[{"secondary_y": False}]])
        fig.update_xaxes(title_text=trafficKPI)
        fig.update_yaxes(title_text=covidKPI)
    else:
        fig = make_subplots(specs=[[{"secondary_y": True}]])

        # Set x-axis title
        fig.update_xaxes(title_text="Day of year")

        # Set y-axes titles
        fig.update_yaxes(title_text=trafficKPI, secondary_y=False)
        fig.update_yaxes(title_text=covidKPI, secondary_y=True)

    traffic_colors = ['#FF0000', '#f56342', '#f57e42', '#f59642', '#f5a742']
    covid_colors = ['#0000FF', '#0062ff', '#008cff', '#00b3ff', '#00b7e0']

    def in_range(df):
        """Boolean mask of the rows whose Date lies inside the selected window."""
        mask = np.ones(len(df), dtype=bool)
        if start_date is not None:
            mask &= (df['Date'] >= start_date).to_numpy()
        if end_date is not None:
            mask &= (df['Date'] <= end_date).to_numpy()
        return mask

    def select(df, region):
        """Rows of `df` for `region` inside the date window, as an independent copy."""
        subset = df.loc[df.Regione == region]
        return subset.loc[in_range(subset)].copy()

    def lag(df):
        """Shift the COVID rows by `shift_amount` days, trimming what leaves the window."""
        if shift_amount == 0:
            return df
        lagged = df.shift(shift_amount)
        # add N days to every date
        lagged['Date'] += pd.DateOffset(shift_amount)
        return lagged

    def normalize(df, col):
        return (df[col]-df[col].mean())/df[col].std()

    def by_day(df, col):
        """(Date, col) pairs with the date truncated to midnight, for date-based pairing."""
        out = df[['Date', col]].dropna(subset=['Date'])
        out['Date'] = pd.to_datetime(out['Date']).dt.normalize()
        return out

    covidKPI2, trafficKPI2 = covidKPI, trafficKPI
    if roll_avg > 1:
        covidKPI2, trafficKPI2 = "{}_roll".format(covidKPI), "{}_roll".format(trafficKPI)

    for i, r in enumerate(regions):
        traffic_parts = [select(df_traffic_1, r), select(df_traffic_2, r)]
        covid_parts = [lag(select(df_covid_1, r)), lag(select(df_covid_2, r))]

        if roll_avg > 1:
            for part in traffic_parts:
                part[trafficKPI2] = part[trafficKPI].rolling(roll_avg).mean()
            for part in covid_parts:
                part[covidKPI2] = part[covidKPI].rolling(roll_avg).mean()

        if scatter == "Yes":
            traffic_all, covid_all = pd.concat(traffic_parts), pd.concat(covid_parts)
            covid_all[covidKPI2] = normalize(covid_all, covidKPI2)
            traffic_all[trafficKPI2] = normalize(traffic_all, trafficKPI2)
            # Pair the two KPIs by date: the frames have different lengths and row orders,
            # so pairing them by position would plot values of unrelated days.
            paired = pd.merge(by_day(traffic_all, trafficKPI2), by_day(covid_all, covidKPI2),
                              on='Date', how='inner')
            fig.add_trace(
                    go.Scatter(
                    x=paired[trafficKPI2],
                    y=paired[covidKPI2],
                    mode='markers+text',
                    name="{} vs {} - {}".format(trafficKPI, covidKPI, r)
                )
            )
        else:
            # Cycle through the palettes so more than five regions do not raise IndexError.
            traffic_color = traffic_colors[i % len(traffic_colors)]
            covid_color = covid_colors[i % len(covid_colors)]

            # Same trace order as before: both traffic periods, then both COVID periods.
            # Only the first period of each series gets a legend entry.
            for part_idx, part in enumerate(traffic_parts):
                legend_kwargs = {} if part_idx == 0 else {"showlegend": False}
                fig.add_trace(
                    go.Scatter(
                        x=part.Date,
                        y=part[trafficKPI2],
                        marker=dict(color=traffic_color),
                        name="{} - {}".format(trafficKPI, r),
                        **legend_kwargs
                    ),
                    secondary_y=False,
                )
            for part_idx, part in enumerate(covid_parts):
                legend_kwargs = {} if part_idx == 0 else {"showlegend": False}
                fig.add_trace(
                    go.Scatter(
                        x=part.Date,
                        y=part[covidKPI2],
                        marker=dict(color=covid_color),
                        name="{} - {}".format(covidKPI, r),
                        **legend_kwargs
                    ),
                    secondary_y=True,
                )

    # Add figure title
    fig.update_layout(
        title_text="{} vs {}".format(trafficKPI, covidKPI)
    )

    return fig

#app_timeseries = build_app_timeseries(df_traffic_daily_SO, df_covid_SO)
# Serve the dashboard inside the notebook output on a fixed local port.
app_timeseries.run_server(mode='inline', port=26000) # debug=True, use_reloader=False

In [211]:
# SECURITY: an access token is committed in this notebook. Export MAPBOX_TOKEN to
# override it; the literal is kept only as a fallback so existing runs keep working
# and should be rotated and removed.
map_token = os.environ.get(
    'MAPBOX_TOKEN',
    'pk.eyJ1IjoiZnJhbmNpZ2plY2kiLCJhIjoiY2tpazZveWhmMDZ5MzMxcWp4bzIxbm0wYyJ9.J_qWOJqADI6tZfle2bbZFg',
)
# Both date windows stacked into a single frame per data source.
df_covid_all = pd.concat([df_covid_PO, df_covid_SO])
df_traffic_all = pd.concat([df_traffic_daily_PO, df_traffic_daily_SO])

In [212]:
# Region borders. The file contains non-ASCII region names (e.g. "Südtirol"), so it is
# read explicitly as UTF-8 instead of the platform default encoding.
path ='{}{}'.format(data_path, 'covid/regioni.geojson')
with open(path, encoding='utf-8') as f:
    json_data = geojson.load(f)

In [213]:
# Regions that actually have COVID rows (the nation-wide 'Italia' label excluded).
regions_to_filter = df_covid_all.loc[df_covid_all.Regione != 'Italia'].Regione.unique()

In [214]:
# Display the regions with COVID data.
regions_to_filter

array(['Puglia', 'Emilia Romagna', 'Veneto', 'Lombardia', 'Abruzzo',
       'Molise', 'Friuli Venezia Giulia', 'Basilicata', 'Calabria',
       'Sicilia', 'Marche', 'Liguria', 'Campania', 'Sardegna', 'Lazio',
       'Umbria', 'Piemonte', 'Toscana'], dtype=object)

In [215]:
# Display the regions with traffic data, for comparison with the COVID ones.
df_traffic_all.loc[df_traffic_all.Regione != 'Italia'].Regione.unique()

array(['Abruzzo', 'Basilicata', 'Calabria', 'Campania', 'Emilia Romagna',
       'Friuli Venezia Giulia', 'Lazio', 'Liguria', 'Lombardia', 'Marche',
       'Molise', 'Piemonte', 'Puglia', 'Sardegna', 'Sicilia', 'Toscana',
       'Trentino-Alto Adige', 'Umbria', "Valle d'Aosta", 'Veneto'],
      dtype=object)

In [216]:
# Collect the region names ("reg_name" property) present in the geojson file.
# NOTE(review): this rebinds `regions`, which an earlier cell set to the regions with traffic
# data; cells run after this one see the geojson spellings instead — confirm this is intended.
regions = set()
for f in json_data.features:
    regions.add(f["properties"]["reg_name"])

In [217]:
# Copy of the geojson whose region names match the data-frame spellings.
# A feature is kept when its name is already known or can be translated by map_geojson();
# otherwise it is reported and dropped.
json_data_filtered = copy.deepcopy(json_data)
kept_features = []
for jd in json_data_filtered.features:
    region = jd["properties"]["reg_name"]
    if region not in regions_to_filter:
        # try to map it to the spelling used in the data
        try:
            jd["properties"]["reg_name"] = map_geojson(region)
        except KeyError:
            print("MANCA {}".format(region))
            continue
    kept_features.append(jd)
# Replace the contents in one go: deleting entries while iterating over the same list
# made the loop skip the feature that followed every deleted one.
json_data_filtered.features[:] = kept_features

In [225]:
# Display the regions present in the stacked COVID frame.
df_covid_all.Regione.unique()

array(['Puglia', 'Emilia Romagna', 'Veneto', 'Lombardia', 'Abruzzo',
       'Molise', 'Friuli Venezia Giulia', 'Basilicata', 'Calabria',
       'Sicilia', 'Marche', 'Liguria', 'Campania', 'Sardegna', 'Lazio',
       'Umbria', 'Piemonte', 'Toscana'], dtype=object)

In [218]:
# Dashboard 2: choropleth map of the per-region correlation between a COVID KPI and a traffic KPI.
column_regione = 'Regione'

app_map = JupyterDash("CovidMap", external_stylesheets=external_stylesheets)

data_column, column_regione = 'Date', 'Regione'

app_map.layout = html.Div([
    # NOTE(review): the initial end_date (2021-12-31) lies after max_date_allowed (2020-12-31) — confirm.
    html.Label(
        [
            "Date Picker: ",
            dcc.DatePickerRange(
                id='date-picker-range',
                min_date_allowed=date(2020, 3, 1),
                max_date_allowed=date(2020, 12, 31),
                start_date=date(2020, 3, 1),
                end_date=date(2021, 12, 31)
            ),
        ]
    ),
    html.Label([
            "Covid KPI",
            dcc.Dropdown(id="covidKPI",
                         options=[{"label": x, "value": x} for x in covidKPIs],
                        value=covidKPIs[0],
                        clearable=False,),
        ]
    ),
    html.Label([
            "Traffic KPI",
            dcc.Dropdown(id="trafficKPI",
                         options=[{"label": x, "value": x} for x in trafficKPIs],
                        value=trafficKPIs[0],
                        clearable=False,),
        ]
    ),
    # Left empty by default, so the callback receives None until the user types a number.
    html.Label(
                ["Rolling average window ",
                html.Br(),
                dcc.Input(
                id='rolling_avg',
                type='number',
#                 value=1
                )]
    ),
    # Shift applied to the COVID series only; also None until filled in.
    html.Label(
                ["Shift COVID",
                html.Br(),
                dcc.Input(
                id='shift',
                type='number',
                )]
    ),
    dcc.Graph(id="CovidMap")]) # , animate = True

# TODO decide how to finish this plot: add lag, moving average and a date picker to configure the correlation.
# TODO possibly build another plot where only one of the two KPIs is selected, together with a slider

@app_map.callback(
[Output("CovidMap", "figure")], 
    [
         Input('date-picker-range', 'start_date'),
         Input('date-picker-range', 'end_date'),
        Input("covidKPI", "value"), Input("trafficKPI", "value"),Input("rolling_avg", "value"), Input("shift", "value")]
)  
def display_covid_period(start_date, end_date, covidKPI, trafficKPI, roll_avg, shift_amount):
    """Choropleth of the per-region Pearson correlation between a COVID and a traffic KPI.

    Parameters
    ----------
    start_date, end_date : ISO date strings from the date picker, or None when cleared
        (a missing bound is simply not applied).
    covidKPI, trafficKPI : column names to correlate.
    roll_avg : rolling-mean window applied to both series, region by region;
        None or <= 0 means no smoothing.
    shift_amount : number of days the COVID rows are shifted; None means 0.

    Returns
    -------
    A one-element list holding the plotly Figure (the callback declares its outputs
    as a list). The module-level data frames are never modified.
    """
    layout = go.Layout(# width = 770, height=650,
                       margin={"r":5,"t":5,"l":5,"b":5},
                      mapbox = dict(center= {"lat": 41.892770, "lon": 12.483667},
                                    accesstoken=map_token,
                                    zoom=4))

    start_date = pd.to_datetime(start_date) if start_date is not None else None
    end_date = pd.to_datetime(end_date) if end_date is not None else None

    roll_avg = 1 if roll_avg is None or roll_avg <= 0 else int(roll_avg)
    shift_amount = 0 if shift_amount is None else int(shift_amount)

    def in_range(df):
        """Boolean mask of the rows whose Date lies inside the selected window."""
        mask = np.ones(len(df), dtype=bool)
        if start_date is not None:
            mask &= (df['Date'] >= start_date).to_numpy()
        if end_date is not None:
            mask &= (df['Date'] <= end_date).to_numpy()
        return mask

    def daily_series(df, kpi):
        """`kpi` indexed by calendar day, in date order, smoothed over `roll_avg` rows."""
        series = df.dropna(subset=['Date']).set_index('Date')[kpi].sort_index()
        series.index = pd.to_datetime(series.index).normalize()
        if roll_avg > 1:
            series = series.rolling(roll_avg).mean()
        return series

    # Discard non-positive / missing traffic values by filtering; the shared frame is
    # left untouched, so one request can no longer remove rows for the following ones.
    traffic_valid = df_traffic_all.loc[df_traffic_all[trafficKPI] > 0]

    df_covid_regions = df_covid_all.loc[df_covid_all.Regione != 'Italia']
    df_traffic_regions = traffic_valid.loc[traffic_valid.Regione != 'Italia']

    df_traffic_regions = df_traffic_regions.loc[in_range(df_traffic_regions)]
    df_covid_regions = df_covid_regions.loc[in_range(df_covid_regions)]
    if shift_amount != 0:
        df_covid_regions = df_covid_regions.shift(shift_amount)
        df_covid_regions['Date'] += pd.DateOffset(shift_amount)

    map_vals = []
    for region in df_traffic_regions.Regione.unique():
        df_c = df_covid_regions.loc[df_covid_regions.Regione == region]
        df_t = df_traffic_regions.loc[df_traffic_regions.Regione == region]
        # Smoothing is done per region, so a window never mixes rows of different regions.
        # Series.corr pairs the two series on their shared dates and ignores NaN pairs.
        correlation = daily_series(df_c, covidKPI).corr(daily_series(df_t, trafficKPI))
        map_vals.append({"Regione": region, "Correlation": correlation})
    # Explicit columns keep the lookups below valid when no region is left.
    df_corr = pd.DataFrame(map_vals, columns=["Regione", "Correlation"])

    data = [go.Choroplethmapbox( 
                                 locations = df_corr[column_regione],
                                 z = df_corr["Correlation"],
                                 colorscale = 'inferno',
                                 zmin=-1, zmax=1,
    #                              text =regions,
                                 featureidkey="properties.reg_name",
                                 colorbar = dict(thickness=20, ticklen=3),
                                 geojson = json_data_filtered,
                                 marker_line_width=0, marker_opacity=0.7)]

    # Plot the figure 
    fig=go.Figure(data=data, layout=layout)
    fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds="locations")

    return [fig]
    

# Serve the map dashboard inside the notebook output on a fixed local port, with Dash debug mode on.
app_map.run_server(mode='inline', port=36000, debug=True) #  use_reloader=False, mode='jupyterlab',debug=True,

In [18]:
# TODO what if we added the percentage of occupied intensive-care beds?

In [219]:
import itertools

# One output row per (rolling window, COVID shift, region); one column per (COVID KPI, traffic KPI) pair.
rolls, shifts = [1, 5, 10], [0, 10, 20, 30, 40, 50]
df_vals = []

# Data to correlate: both windows, or only the "SO" one.
df_covid_corr, df_traffic_corr = df_covid_all, df_traffic_all
only_SO = True
if only_SO:
    df_covid_corr, df_traffic_corr = df_covid_SO, df_traffic_daily_SO

df_covid_regions = df_covid_corr.loc[df_covid_corr.Regione != 'Italia']
df_traffic_regions = df_traffic_corr.loc[df_traffic_corr.Regione != 'Italia']
regions_corr = df_covid_regions.Regione.unique()

# TODO only these for now
corrCovidKPIs, corrTrafficKPIs = ["%pos", "terapia_intensiva", "R_mean"], ["Handover"] # covidKPIs, trafficKPIs


def _daily_rolling_series(df, kpi, window):
    """`kpi` of one region indexed by calendar day, in date order, with a rolling mean over `window` rows."""
    series = df.dropna(subset=['Date']).set_index('Date')[kpi].sort_index()
    series.index = pd.to_datetime(series.index).normalize()
    return series.rolling(window).mean()


for roll_avg, shift_covid in itertools.product(rolls, shifts):
    # Move the COVID rows `shift_covid` days back in time.
    shifted_df_covid_regions = df_covid_regions.shift(-1*shift_covid)
    shifted_df_covid_regions['Date'] += pd.DateOffset(-1*shift_covid)

    for region in regions_corr:
        row = {"roll": roll_avg, "shift": shift_covid, "Regione": region}
        df_c = shifted_df_covid_regions.loc[shifted_df_covid_regions.Regione == region]
        df_t = df_traffic_regions.loc[df_traffic_regions.Regione == region]
        for covidKPI, trafficKPI in itertools.product(corrCovidKPIs, corrTrafficKPIs):
            # Smoothing is done per region, so a window never mixes rows of different regions,
            # and no helper column is written into the shared frames.
            # Series.corr pairs the two series on their shared dates and ignores NaN pairs.
            covid_series = _daily_rolling_series(df_c, covidKPI, roll_avg)
            traffic_series = _daily_rolling_series(df_t, trafficKPI, roll_avg)
            row["{}_vs_{}".format(covidKPI, trafficKPI)] = covid_series.corr(traffic_series)
        df_vals.append(row)

In [220]:
# TODO for every (covid, traffic) pair, compute the distribution of the coefficient for
# rolling avg = {1, 3, 5, 7} and covid shift = {5, 10, 15, 20}
# One row per (roll, shift, region) collected in df_vals.
df_correlations = pd.DataFrame(df_vals)

In [221]:
# TODO build a chart with a column selector and a boxplot inside
# Dashboard 3: box plots of the per-region correlations.
# NOTE(review): `regions` may hold the geojson spellings at this point if the cell that
# rebinds it ran before this one — confirm which set is expected here.
regionsToShow = list(regions)
regionsToShow.append("Italia")
# Candidate COVID shifts, in days; this rebinds the `shifts` list used by the correlation-grid cell.
shifts = range(-50, 50)

# `name` doubles as the app name and as the id of the dcc.Graph that receives the figure.
name = "BOXPLOT CORRELATIONS per REGION"
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app_boxplot = JupyterDash(name, external_stylesheets=external_stylesheets)

app_boxplot.layout = html.Div([
    # NOTE(review): the initial end_date (2021-12-31) lies after max_date_allowed (2020-12-31) — confirm.
    html.Label(
        [
            "Date Picker: ",
            dcc.DatePickerRange(
                id='date-picker-range',
                min_date_allowed=date(2020, 3, 1),
                max_date_allowed=date(2020, 12, 31),
                start_date=date(2020, 3, 1),
                end_date=date(2021, 12, 31)
            ),
        ]
    ),
html.Label(
    [
        "TRAFFIC",
        dcc.Dropdown(id="kpi1",
                     options=[{"label": x, "value": x} for x in trafficKPIs],
                    value=trafficKPIs[0],
                    clearable=False)
    ]),
html.Label(
    [
        "COVID",
        dcc.Dropdown(id="kpi2",
                     options=[{"label": x, "value": x} for x in covidKPIs],
                    value=covidKPIs[0],
                    clearable=False)
    ]),
    # Left empty by default, so the callback receives None until the user types a number.
    html.Label(
                ["Rolling average window ",
                html.Br(),
                dcc.Input(
                id='rolling_avg',
                type='number',
#                 value=1
                )]
    ),
html.Div(dcc.Graph(id=name))])

@app_boxplot.callback(
    Output(name, "figure"),
    [
        Input('date-picker-range', 'start_date'),
        Input('date-picker-range', 'end_date'),
        Input("kpi1", "value"),
        Input("kpi2", "value"),
        Input("rolling_avg", "value"),
    ])
def display_boxplot(start_date, end_date, trafficKPI, covidKPI, roll_avg):
    """Build the box plot of per-region COVID/traffic correlations for each lag.

    For every lag in the module-level ``shifts`` and every region in
    ``regions_corr``, the COVID KPI series has its dates moved back by ``lag``
    days and is correlated (Pearson, date-aligned) with the traffic KPI series
    of the same region. The figure shows one box per lag plus the mean and
    median across regions.

    Parameters
    ----------
    start_date, end_date : str or None
        ISO date strings from the date picker. ``None`` (picker cleared)
        means "no bound on that side".
    trafficKPI, covidKPI : str
        Column names to read from ``df_traffic_all`` and ``df_covid_all``.
    roll_avg : int, float or None
        Rolling-mean window in rows. ``None``, non-numeric values and values
        below 1 mean no smoothing; fractional values are truncated.

    Returns
    -------
    plotly.graph_objects.Figure

    Notes
    -----
    Relies on ``df_covid_all`` / ``df_traffic_all`` having a datetime ``Date``
    column and a ``Regione`` column, as the code below reads both.
    """
    # pd.to_datetime accepts both 'YYYY-MM-DD' and full ISO timestamps.
    lower = pd.to_datetime(start_date) if start_date is not None else None
    upper = pd.to_datetime(end_date) if end_date is not None else None

    try:
        window = int(roll_avg)
    except (TypeError, ValueError):
        window = 1
    window = max(window, 1)

    def restrict_to_range(df):
        """Keep the rows whose Date lies inside the selected (possibly open) range."""
        if lower is not None:
            df = df.loc[df['Date'] >= lower]
        if upper is not None:
            df = df.loc[df['Date'] <= upper]
        return df

    def daily_series(df, region, kpi):
        """Return one region's KPI as a Series indexed by normalized, sorted Date."""
        region_rows = df.loc[df['Regione'] == region, ['Date', kpi]]
        series = region_rows.set_index('Date')[kpi]
        series.index = series.index.normalize()
        series = series.sort_index()
        if window > 1:
            # Smooth inside the region only, in chronological order, so the
            # window never mixes values that belong to different regions.
            series = series.rolling(window).mean()
        return series

    df_covid_regions = restrict_to_range(df_covid_all)
    df_traffic_regions = restrict_to_range(df_traffic_all)

    col = "{}_vs_{}".format(covidKPI, trafficKPI)

    # The per-region series do not depend on the lag: build them once.
    covid_by_region = {
        region: daily_series(df_covid_regions, region, covidKPI)
        for region in regions_corr
    }
    traffic_by_region = {
        region: daily_series(df_traffic_regions, region, trafficKPI)
        for region in regions_corr
    }

    records = []
    for shift_covid in shifts:
        lag = pd.Timedelta(days=shift_covid)
        for region in regions_corr:
            covid = covid_by_region[region]
            # Only the dates move; the values stay attached to their own rows,
            # so no observation is dropped by the lag itself.
            shifted_covid = covid.copy()
            shifted_covid.index = covid.index - lag
            records.append({
                "roll": window,
                "shift": shift_covid,
                "Regione": region,
                # Series.corr aligns both series on their common dates and
                # ignores missing pairs; it yields NaN when nothing overlaps.
                col: shifted_covid.corr(traffic_by_region[region]),
            })

    # Explicit columns keep the frame usable even when there are no records.
    df_correlations = pd.DataFrame(records, columns=["roll", "shift", "Regione", col])

    fig = px.box(df_correlations, x="shift", y=col)

    # Aggregate only the correlation column: the frame also holds the string
    # column 'Regione', which cannot be averaged.
    by_shift = df_correlations.groupby(by=["shift"])[col]
    means = by_shift.mean()
    medians = by_shift.median()

    fig.add_trace(go.Scatter(x=means.index, y=means,
                             mode='lines',
                             name='Mean'))

    fig.add_trace(go.Scatter(x=medians.index, y=medians,
                             mode='lines',
                             name='Median'))

    # Title and a fixed y range covering every possible correlation value.
    fig.update_layout(
        title_text="{} vs {}".format(trafficKPI, covidKPI),
        yaxis=dict(
            range=[-1, 1]
        )
    )

    return fig

#app_timeseries = build_app_timeseries(df_traffic_daily_SO, df_covid_SO)
# Start the box-plot app on a fixed port, using JupyterDash's 'inline' mode.
app_boxplot.run_server(port=46001, mode='inline')  # debug=True, use_reloader=False

In [None]:
# PO: 01/03 -> 31/05 -> 90 -> min #samples = 50
# SO: 01/10 -> 31/12 -> 90 -> min #samples = 50
# ASSESTAMENTO = A = dopo quanti giorni non cresce più; 
# m(R, t) = mediana dopo t giorni con RA = R; var(R, t) = MAX - MIN
# A* = A s.t. max min
# OSS: aumentando RA diminuisce var(R, t); la media risulta simile.
# tenendo RA = 1 è meglio
# SO: ci sono regioni con correlazione molto bassa
# HANDOVER
# vs % pos, [PO + SO]: A = 30; m(1, 30) = 0.4
# vs % pos, PO: A = 26; m(1, 27) = 0.7 -> luminoso un po' ovunque. A* = 20
# vs % pos, SO: A = 31; m(1, 31) = 0.6; oscilla in (31-38); A* = 28; m(A) = 0, m(A*) = 0.13; però i q1 sono simili

# vs nuovi_positivi, [PO+SO]: A = 30, m(1, 30) = 0.4
# vs nuovi_positivi, PO: A = 30, m(1, 30) = 0.6
# vs nuovi_positivi, SO: A = A* = 36, m(1, 36) = 0.65; m(A*) = 0.24; oscilla in (29-36)

# vs TI, [PO+SO]: A = 38, m(1, 38) = 0.3
# vs TI, PO: A = 40, m(1, 40) = 0.8 -> anche 
# vs TI, SO: A = 40, m(1, 38) = 0.5; m(A*) = -0.5 -> regioni che non seguono trend

# CONCLUSIONI HANDOVER:
# 1) PO risulta essere molto più preciso; questo perché durante la PO non c'era residuo di positività che, invece, ad esempio nel caso del Veneto, è chiarissimo durante la SO


# Upload volume
# % pos: negativo -> più persone aumentano nell'utilizzo dell'upload, più si è in lockdown -> informazione di come la positività influisce su ...
# vs % pos: PO+SO: A = 29, m(1, 29) = -0.5
# vs % pos, PO: A = 30, m(1, 30) = -0.77; A* = 17, min(A*) = -0.2, min(A) = -0.07
# vs % pos, SO: A = 29; m(1, 29) = -0.6

# vs % n_p: PO+SO: A = 29, m(1, 29) = -0.5
# vs % n_p, PO: A = 27, m(1, 28) = -0.6; A* = 27, min(A*) = -0.4
# vs % n_p, SO: A = 30; m(3, 30) = -0.5; A* = 26, m(A*) = 0 (B)

# vs % TI: PO+SO: A = 40, m(1, 40) = -0.5
# vs % TI, PO: A = 40, m(1, 40) = -0.8; min(A*) = -0.56
# vs % TI, SO: A = 41; m(1, 41) = -0.7; min(A) = 0.6 (B)

In [None]:
# soa di vecchi lavori con questi
# predizione su RT -> un dato a settimana da 
# predizione su TI
#0) DL e UL sono sproporzionati (es: roma - lazio)
#1) timeseries per spiegare correlazione con lag che varia + differenze tra regioni + differenze prima - seconda ondata
#2) boxplots per trovare lags per coppie: Hin vs {covid_features}, upload vs {covid features}
#3) regioni: esempi di correlazione buona in più regioni

In [None]:
# NOTE(review): pd.read_pickle can execute arbitrary code while loading;
# only point these paths at trusted, locally produced files.
# Both directory strings already end with '/', so plain concatenation is enough.
df_lazio_luglio = pd.read_pickle(by_region_path + "LTE_1800_Lazio_1wave_07.pkl")
df_citta = pd.read_pickle(data_path + "LTE_1800_Roma.pkl")

# LTE_1800_Lombardia_2wave_10.pkl

In [None]:
# Daily sums of the Lazio extract for rows dated after 2020-01-01 (first rows only).
(
    df_lazio_luglio[df_lazio_luglio['Date'] > pd.Timestamp('2020-01-01')]
    .resample('D', on='Date')
    .sum()
    .head()
)

In [None]:
# Rows of the city-level frame whose COMUNE is exactly "ROMA".
df_roma = df_citta[df_citta['COMUNE'] == "ROMA"]

In [None]:
# Daily sums for ROMA, restricted to rows dated after 2020-01-01 (first rows only).
(
    df_roma[df_roma['Date'] > pd.Timestamp('2020-01-01')]
    .resample('D', on='Date')
    .sum()
    .head()
)

In [None]:
# Rows of the city-level frame whose COMUNE is exactly "NAPOLI".
df_napoli = df_citta[df_citta['COMUNE'] == "NAPOLI"]

In [None]:
# Daily sums for NAPOLI, restricted to rows dated after 2020-01-01.
# Show only the first rows, like the Lazio and ROMA previews, instead of
# dumping the whole resampled frame into the notebook output.
df_napoli.loc[df_napoli.Date > pd.to_datetime('2020-01-01')].resample('D', on='Date').sum().head()

Exception in thread Thread-40:
Traceback (most recent call last):
  File "/opt/local/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/opt/local/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/filipkrasniqi/.local/share/virtualenvs/traffic-covid-andFrCfE/lib/python3.8/site-packages/retrying.py", line 49, in wrapped_f
    return Retrying(*dargs, **dkw).call(f, *args, **kw)
  File "/Users/filipkrasniqi/.local/share/virtualenvs/traffic-covid-andFrCfE/lib/python3.8/site-packages/retrying.py", line 212, in call
    raise attempt.get()
  File "/Users/filipkrasniqi/.local/share/virtualenvs/traffic-covid-andFrCfE/lib/python3.8/site-packages/retrying.py", line 247, in get
    six.reraise(self.value[0], self.value[1], self.value[2])
  File "/Users/filipkrasniqi/.local/share/virtualenvs/traffic-covid-andFrCfE/