In [66]:
%load_ext autoreload
%autoreload 2

# standard imports
import numpy as np
import os
import pandas as pd
import pickle

from datetime import timedelta

from bokeh.io import output_notebook, export_svgs
from bokeh.layouts import gridplot
from bokeh.models import Span
from bokeh.plotting import figure, show, ColumnDataSource
output_notebook()

# lib
import sys
sys.path.append('../')
from metrics import compute_metrics, _compute_metrics
from analysis import load_backfill, plot_accuracy, plot_cases, plot_prediction_interval, plot_metric_for_dates

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data and UPC Detections

In [80]:
# Outbreaks identified by UPC, one (region, month, day) tuple per detection.
# A region may appear more than once when several outbreaks were flagged.
outbreaks = [
    ("Asti", 5, 26),
    ("Bergamo", 5, 11),
    ("Campobasso", 5, 7),
    ("Como", 5, 22),
    ("Cosenza", 4, 14),
    ("Foggia", 4, 14),
    ("Genova", 5, 15),
    ("Imperia", 5, 10),
    ("Lecco", 4, 26),
    ("Lodi", 4, 24),
    ("Lodi", 5, 27),
    ("Pescara", 5, 2),
    ("Prato", 4, 12),
    ("Rieti", 4, 22),
    ("Rieti", 5, 10),
    ("Rovigo", 4, 20),
    ("Treviso", 4, 16),
    ("Varese", 4, 30),
    ("Verbano-Cusio-Ossola", 5, 2),
    ("Vercelli", 5, 3),
]

# Load ground truth data. The CSV is indexed by region with one column per
# date, so it is flipped to get dates on the rows and regions on the columns.
df_regions = pd.read_csv('../data/italy/data-upc.csv', index_col='region').T
df_regions.index = pd.to_datetime(df_regions.index).set_names('date')
print('Days = %d, Regions = %d' % df_regions.shape)

Days = 96, Regions = 106


### Load Backfill and Configs

In [114]:
# cases
job = "it/2020_06_09_14_07"
fs, cfgs = load_backfill(job)
# Preview the per-date configs, leaving the 'fdat', 'fpop' and 'job' columns out of the display
cfgs.drop(['fdat', 'fpop', 'job'], axis=1).head()

Unnamed: 0_level_0,activation,decay,loss,lr,momentum,niters,no_cross_correlation,t0,temp_smoothing,test_on,weight_decay,window
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-04-01,sigmoid,latent2_1,nb,0.001,0.9,30000,False,5,0,7,0.1,20
2020-04-02,sigmoid,latent2_1,nb,0.001,0.9,30000,False,5,0,7,0.2,15
2020-04-03,sigmoid,latent2_1,nb,0.001,0.9,30000,False,5,0,7,0.2,15
2020-04-04,sigmoid,latent2_1,nb,0.001,0.9,30000,False,5,0,7,0.2,15
2020-04-05,sigmoid,latent2_1,nb,0.001,0.9,30000,False,5,0,7,0.2,15


In [111]:
# Accumulators filled below, and the (lower, upper) interval labels that are
# looked up in the forecast files.
ps, accs = [], []
plevel = (0.05, 0.99)

def select_piv(df, interval):
    """Return the rows of `df` whose 'piv' value equals `interval`.

    The result is indexed by the 'date' column and no longer carries the
    'piv' column; `df` itself is left untouched.
    """
    matching = df.loc[df["piv"] == interval]
    return matching.drop(columns=["piv"]).set_index("date")

# For every backfill date, measure the share of regions whose observed count
# lies strictly between the lower and upper forecast bounds. Rows 6, 13 and 20
# of the overlapping dates are scored and recorded as day 7, 14 and 21.
for date in cfgs.index:
    jobdir = cfgs.loc[date, 'job']
    df_piv = pd.read_csv(f'{jobdir}/../forecasts/piv_best_rmse.csv', parse_dates=['date'])
    lower, upper, mean = (
        select_piv(df_piv, label)
        for label in (str(plevel[0]), str(plevel[1]), "mean")
    )

    # keep only the dates present in both the forecast and the ground truth
    ix = np.intersect1d(mean.index, df_regions.index)
    df_gt, mean, lower, upper = (
        frame.loc[ix] for frame in (df_regions, mean, lower, upper)
    )

    for d in (6, 13, 20):
        if d < len(df_gt):
            observed = df_gt.iloc[d]
            z = np.logical_and(observed < upper.iloc[d], observed > lower.iloc[d])
            acc = sum(z) / len(z)
            accs.append((date, d + 1, acc))

# plot accuracies
accs = pd.DataFrame(accs, columns=['date', 'days', 'acc'])
p = plot_accuracy(accs, plevel, 'Confirmed Cases, Italy', {'2020-04-19'})
show(p)

### Detected deviations

In [138]:
plevel = (.05, .80)

# plot cases over time
# One figure per region. `outbreaks` may list the same region several times
# (e.g. Lodi, Rieti), so the figure is built only the first time a region is
# seen and every outbreak date adds its own dotted marker to that figure.
# Rebuilding the figure per entry would drop all but the last marker.
ps = {}
for region, month, day in outbreaks:
    if region not in ps:
        p = plot_cases(
            df_regions,
            f'Confirmed cases in {region} (p={plevel[1]})',
            regions=[region],
            line_width=3,
            alpha=1,
            backend='svg'
        )
        p.height = 350
        p.width = 420
        ps[region] = p
    x = pd.to_datetime(f'2020-{month}-{day}')
    vline = Span(location=x, dimension='height', line_color='black', line_width=3, line_dash='dotted')
    ps[region].add_layout(vline)
regions = list(ps.keys())

# Label of the upper bound as it is matched against the 'piv' column.
# NOTE(review): str(.80) is '0.8' -- assumes the forecast CSV spells the level
# the same way; confirm against the files.
upper_label = str(plevel[1])

# Mark every backfill date on which the observed count exceeded the upper
# forecast bound two days later.
for cfg_date, jobdir in cfgs['job'].items():
    date = pd.to_datetime(cfg_date)
    target = date + timedelta(2)
    # No ground truth for the target day: nothing to compare for any region,
    # so skip before touching the forecast file.
    if target not in df_regions.index:
        continue

    # Read and filter the forecast once per date rather than once per region.
    df_piv = pd.read_csv(f'{jobdir}/../forecasts/piv_best_rmse.csv', parse_dates=['date'])
    upper_all = select_piv(df_piv, upper_label)
    observed = df_regions.loc[target]

    for region in regions:
        # NOTE(review): the bound is taken positionally (second forecast row)
        # while the observation is taken at date + 2 days -- presumably the
        # forecast starts the day after `date`; confirm.
        if observed[region] > upper_all[region].iloc[1]:
            x = date + timedelta(1)
            vline = Span(location=x, dimension='height', line_color='LightGray', line_width=3)
            # draw detections beneath the case curve and outbreak markers
            vline.level = "underlay"
            ps[region].add_layout(vline)


grid = gridplot(list(ps.values()), ncols=2)
show(grid)
_ = export_svgs(grid, filename=f'/tmp/it-detection_p{plevel[1]}.svg')