In [1]:
%load_ext autoreload
%autoreload 2

#standard imports
import numpy as np
import os
import pandas as pd

from collections import defaultdict as ddict
from datetime import timedelta

# bokeh
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import FactorRange
from bokeh.transform import factor_cmap
from bokeh import palettes
output_notebook()

# lib
import sys
sys.path.append('../')
from metrics import _compute_metrics
from analysis import load_backfill, show

def grouped_bar_chart(x, counts, palette, labels, models):
    """Build one grouped bar chart and append it to the notebook-level list ``ps``.

    NOTE: this function does not create its own list. It appends to the
    global ``ps`` (which the calling cell must define first) and returns
    that same list.

    Args:
        x: Bar factors, used both as the categorical x range and as the
            ``x`` column of the data source. Callers in this notebook pass
            ``(date label, model)`` tuples.
        counts: Bar heights, parallel to ``x``.
        palette: Colours handed to ``factor_cmap``.
        labels: ``(title, y-axis label)`` pair.
        models: Factor values that ``factor_cmap`` maps onto ``palette``.

    Returns:
        The global list ``ps`` after the new figure was appended.
    """
    chart_title, y_label = labels
    bar_data = ColumnDataSource(data={'x': x, 'counts': counts})
    # start=1, end=2 slices each x factor down to its element at index 1
    # before the colour lookup.
    bar_colors = factor_cmap('x', palette=palette, factors=models, start=1, end=2)

    fig = figure(
        x_range=FactorRange(*x),
        plot_height=250,
        plot_width=850,
        title=chart_title,
        tools="save",
        x_axis_label='Projection date',
        y_axis_label=y_label,
    )
    fig.vbar(
        x='x',
        top='counts',
        width=0.9,
        source=bar_data,
        line_color="black",
        fill_color=bar_colors,
    )

    # Axis cosmetics: bars start at zero, slanted tick labels, no vertical grid.
    fig.y_range.start = 0
    fig.x_range.range_padding = 0.02
    fig.xaxis.major_label_orientation = 1
    fig.xgrid.grid_line_color = None
    fig.output_backend = 'svg'

    ps.append(fig)
    return ps

def compute_metrics(ground_truth, dfs):
    """Summarise forecast errors over the first 7 and 14 entries per forecast date.

    Args:
        ground_truth: Passed through unchanged to ``_compute_metrics``.
        dfs: Mapping of forecast date -> forecast frame, or ``None`` when no
            forecast is available for that date.

    Returns:
        A DataFrame with columns ``Date`` (converted with ``pd.to_datetime``),
        ``MAE 7d``, ``MAE 14d``, ``RMSE 7d`` and ``RMSE 14d``, one row per
        entry of ``dfs``. Dates whose forecast is ``None`` get NaN metrics.
    """
    columns = ['Date', 'MAE 7d', 'MAE 14d', 'RMSE 7d', 'RMSE 14d']
    horizons = (('7d', 7), ('14d', 14))
    table = {col: [] for col in columns}

    for date, forecast in dfs.items():
        table['Date'].append(date)
        if forecast is None:
            # Keep the row so every requested date shows up, but with no scores.
            for col in columns[1:]:
                table[col].append(np.nan)
            continue

        scores = _compute_metrics(ground_truth, forecast)
        for stat in ('MAE', 'RMSE'):
            stat_row = scores.loc[stat]
            for suffix, n_days in horizons:
                # Mean of the first `n_days` entries of this metric's row.
                table[f'{stat} {suffix}'].append(stat_row.iloc[:n_days].mean())

    summary = pd.DataFrame(table)
    summary['Date'] = pd.to_datetime(summary['Date'])
    return summary

In [2]:
# (ground-truth file suffix, forecast sub-directory)
case_type = ('cases', 'infections')
region = 'US'
region_short = 'usa'
f_ground_truth = '../data/usa/data_{}.csv'.format(case_type[0])

# Columbia scenario forecasts, keyed by display name. Each value is
# (path template with a `{}` slot for the forecast date, colour, line style, flag).
# NOTE(review): the meaning of the trailing `True` is not visible in this notebook.
_columbia_root = '/checkpoint/mattle/covid19/csvs/{}'.format(case_type[1])
forecasts = {
    f'Columbia {scenario}': (
        f'{_columbia_root}/columbia_{scenario}/counts_{{}}.csv',
        '#f29111',
        'solid',
        True,
    )
    for scenario in ('80contact', 'nochange', 'season4')
}

# metric column -> (plot title, y-axis label)
labels = {
    f'{stat} {days}d': (f'US Counties - {days} day projection', f'Avg. {stat}')
    for stat in ('MAE', 'RMSE')
    for days in (7, 14)
}

# Google's published results, one row per forecast date.
_google_table = [
    # date,        MAE 7d, MAE 14d, RMSE 7d, RMSE 14d
    ('2020-05-11', 26.01, 66.75, 78.41, 272.87),
    ('2020-05-18', 18.82, 46.48, 46.82, 152.81),
    ('2020-05-25', 20.83, 40.66, 85.96, 203.96),
    ('2020-05-30', 21.19, 41.49, 79.05, 174.16),
    ('2020-06-08', 20.77, 39.10, 79.46, 186.96),
    ('2020-06-20', 23.72, 60.08, 114.29, 375.98),
    ('2020-06-27', 34.46, 73.25, 163.31, 464.14),
]
_google_columns = [list(col) for col in zip(*_google_table)]
google_results = {
    'Date': pd.to_datetime(_google_columns[0]),
    'MAE 7d': _google_columns[1],
    'MAE 14d': _google_columns[2],
    'RMSE 7d': _google_columns[3],
    'RMSE 14d': _google_columns[4],
}
# Wide table first (one row per date), shown as-is for reference.
google_results = pd.DataFrame(google_results)
display(google_results)

# Every column except the leading 'Date' is a metric; later cells reuse `metrics`.
metrics = google_results.columns[1:]

# Long format: one row per (Date, metric) with Google's value, ordered by date.
google_results = google_results.melt(
    id_vars='Date',
    value_vars=metrics,
    var_name='metric',
    value_name='Google',
)
google_results = google_results.sort_values(by='Date')

Unnamed: 0,Date,MAE 7d,MAE 14d,RMSE 7d,RMSE 14d
0,2020-05-11,26.01,66.75,78.41,272.87
1,2020-05-18,18.82,46.48,46.82,152.81
2,2020-05-25,20.83,40.66,85.96,203.96
3,2020-05-30,21.19,41.49,79.05,174.16
4,2020-06-08,20.77,39.1,79.46,186.96
5,2020-06-20,23.72,60.08,114.29,375.98
6,2020-06-27,34.46,73.25,163.31,464.14


### Progression of Cases

In [4]:
# Load ground truth data
# The CSV is indexed by region; transpose so rows are dates and columns are regions.
df_region = pd.read_csv(f_ground_truth, index_col='region').T
# Parse the date labels and name the index 'date'.
df_region.index = pd.to_datetime(df_region.index).rename('date')
n_days, n_regions = df_region.shape
print('Days = {}, Regions = {}'.format(n_days, n_regions))

# plot cases over time 
#p = plot_cases(df_region, f"Confirmed cases in {region}", show_hover=False)
#show(p)

Days = 213, Regions = 3120


### Load Backfill and Configs

In [5]:
# Earlier backfill runs, kept for reference only (each one used to be assigned
# to `job` and was immediately overwritten by the next):
#   us/2020_08_04_20_41_23
#   us/2020_08_04_20_44_49
#   us/2020_08_05_06_49_30
#   us/2020_08_05_11_18_55
#   us/2020_08_24_06_30_44   (was already commented out)
job = "us/2020_08_14_19_20_30"  # best

fs, cfgs = load_backfill(job, model='bar', forecast="best_rmse")
# Display the configs without the 'fdat', 'fpop' and 'job' columns;
# `cfgs` itself is left unmodified.
cfgs.drop(columns=['fdat', 'fpop', 'job'])

Unnamed: 0_level_0,activation,decay,dropout,eta,granger,loss,lr,momentum,n_models,niters,no_cross_correlation,t0,temporal,test_on,time_features,weight_decay,window
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-05-11,sigmoid,lstm2_2,0.2,0.1,0.5,nb,0.001,0.9,1,30000,False,0,100,21,[/checkpoint/maxn/covid19/forecasts/us/2020_08...,0.1,15
2020-05-18,sigmoid,lstm2_2,0.2,0.1,0.5,nb,0.001,0.9,1,30000,False,0,100,21,[/checkpoint/maxn/covid19/forecasts/us/2020_08...,0.1,20
2020-05-25,sigmoid,lstm2_2,0.1,0.1,0.5,nb,0.001,0.9,1,30000,False,0,1000,21,[/checkpoint/maxn/covid19/forecasts/us/2020_08...,0.1,20
2020-05-30,sigmoid,lstm2_2,0.1,0.2,0.5,nb,0.001,0.9,1,30000,False,0,100,21,[/checkpoint/maxn/covid19/forecasts/us/2020_08...,0.1,20
2020-06-08,sigmoid,lstm2_2,0.1,0.2,0.5,nb,0.001,0.9,1,30000,False,0,100,21,[/checkpoint/maxn/covid19/forecasts/us/2020_08...,0.2,15
2020-06-20,sigmoid,lstm2_2,0.1,0.2,0.5,nb,0.001,0.9,1,30000,False,0,100,21,[/checkpoint/maxn/covid19/forecasts/us/2020_08...,0.1,25
2020-06-27,sigmoid,lstm2_2,0.1,0.1,0.5,nb,0.001,0.9,1,30000,False,0,100,21,[/checkpoint/maxn/covid19/forecasts/us/2020_08...,0.1,20


### Forecast Comparison

Compare our forecasts to those published by:
- Google
- Columbia

In [6]:
models = ['bAR', 'Google', 'Columbia']
# Columbia forecast dates. They do not coincide with the Google/bAR dates:
# each is one day before or one day after the corresponding Google date.
columbia_dates = ['2020-05-10', '2020-05-17', '2020-05-24', '2020-05-31', '2020-06-07', '2020-06-21', '2020-06-28']

# Start from Google's long-format table and attach one value column per model.
mets = google_results
dfs = ddict(dict)
for date in cfgs.index:
    dfs['bAR'][date] = pd.read_csv(fs[date], index_col="date", parse_dates=["date"])
_mets = compute_metrics(df_region, dfs['bAR'])
_mets = pd.melt(_mets, id_vars='Date', value_vars=metrics, var_name='metric', value_name='bAR').sort_values(by='Date')
mets = pd.merge_asof(mets, _mets, on='Date', by='metric')

# Load every Columbia scenario for every Columbia date; a missing file is
# recorded as None so compute_metrics emits NaN for that date.
for date in columbia_dates:
    for key, f in forecasts.items():
        fin = f[0].format(date)
        if os.path.exists(fin):
            dfs[key][date] = pd.read_csv(fin, index_col="date", parse_dates=["date"])
        else:
            dfs[key][date] = None
for key in forecasts.keys():
    _mets = compute_metrics(df_region, dfs[key])
    _mets = pd.melt(_mets, id_vars='Date', value_vars=metrics, var_name='metric', value_name=key).sort_values(by='Date')
    # merge_asof defaults to direction='backward', which only looks at Columbia
    # dates <= the Google date. For 05-30, 06-20 and 06-27 the matching Columbia
    # forecast is dated one day *later*, so the default silently picked the
    # previous (week-or-more-older) forecast instead. Match on the nearest date
    # and refuse anything further than one day away.
    mets = pd.merge_asof(
        mets,
        _mets,
        on='Date',
        by='metric',
        direction='nearest',
        tolerance=pd.Timedelta(days=1),
    )

# Summarise Columbia by the best (lowest-error) scenario for each row.
columbia = [x for x in mets.columns if x.startswith('Columbia')]
mets['Columbia'] = mets[columbia].min(axis=1)
display(mets)

Unnamed: 0,Date,metric,Google,bAR,Columbia 80contact,Columbia nochange,Columbia season4,Columbia
0,2020-05-11,MAE 7d,26.01,11.807091,19.972699,,,19.972699
1,2020-05-11,RMSE 14d,272.87,91.981844,122.148083,,,122.148083
2,2020-05-11,RMSE 7d,78.41,51.466525,98.255049,,,98.255049
3,2020-05-11,MAE 14d,66.75,21.115837,28.565697,,,28.565697
4,2020-05-18,MAE 7d,18.82,11.342453,19.077068,,,19.077068
5,2020-05-18,RMSE 7d,46.82,47.647272,102.912351,,,102.912351
6,2020-05-18,RMSE 14d,152.81,93.978391,154.620296,,,154.620296
7,2020-05-18,MAE 14d,46.48,21.006642,30.611137,,,30.611137
8,2020-05-25,RMSE 7d,85.96,64.491719,124.217943,,,124.217943
9,2020-05-25,RMSE 14d,203.96,120.872532,177.655142,,,177.655142


In [7]:
# Long format: one row per (Date, metric, model) with the score in 'value'.
ms = mets.melt(id_vars=['Date', 'metric'], value_vars=models, var_name='model')
# metric -> (x factors, bar heights), filled in parallel below.
sources = {m: ([], []) for m in metrics}

for (date, model), vals in ms.groupby(by=['Date', 'model']):
    by_metric = vals.set_index('metric').round(2)
    factor = (date.strftime('%m/%d'), model)
    for m in metrics:
        factors, heights = sources[m]
        factors.append(factor)
        heights.append(by_metric.loc[m]['value'])

# grouped_bar_chart appends to (and returns) this global list.
ps = []
palette = palettes.RdYlBu[3]
for metric in sources:
    x, counts = sources[metric]
    ps = grouped_bar_chart(x, counts, palette, labels[metric], models)

# Stack the per-metric charts in a single column.
plot = gridplot(ps, ncols=1)
show(plot, 'img/comparison_goog_columb.png')

## Ablation Model

In [8]:
# Backfill runs for the ablation variants.
job_no_cross = "us/2020_08_25_19_47_35"
job_no_granger = "us/2020_08_25_14_04_23"
job_time_features = "us/2020_08_12_07_48_46"

# Same model / forecast selection for every ablation run.
backfill_kwargs = {'model': 'bar', 'forecast': "best_rmse"}
fs_no_cross, cfgs_no_cross = load_backfill(job_no_cross, **backfill_kwargs)
fs_no_granger, cfgs_no_granger = load_backfill(job_no_granger, **backfill_kwargs)
# fs_tf, cfgs_tf = load_backfill(job_time_features, model='bar_time_features', forecast="best_rmse")

In [9]:
# Ablation variant -> {date: forecast csv path}, in display order.
# ('Time features' is disabled: fs_tf is not loaded above.)
ablation_files = {
    'Full': fs,
    'No Granger': fs_no_granger,
    'No Cross': fs_no_cross,
}

dfs = ddict(dict)
for date in cfgs.index:
    for mode, files in ablation_files.items():
        dfs[mode][date] = pd.read_csv(files[date], index_col="date", parse_dates=["date"])

# Build one long table with a value column per variant; the first variant
# seeds the table, the rest are joined onto it by (Date, metric).
mets = None
for mode in ablation_files:
    _mets = compute_metrics(df_region, dfs[mode])
    _mets = _mets.melt(id_vars='Date', value_vars=metrics, var_name='metric', value_name=mode)
    _mets = _mets.sort_values(by='Date')
    if mets is None:
        mets = _mets
    else:
        mets = pd.merge_asof(mets, _mets, on='Date', by='metric')
mets

Unnamed: 0,Date,metric,Full,No Granger,No Cross
0,2020-05-11,MAE 7d,11.807091,12.412893,248.411
1,2020-05-11,RMSE 14d,91.981844,90.479965,974327600.0
2,2020-05-11,RMSE 7d,51.466525,52.768128,6898.928
3,2020-05-11,MAE 14d,21.115837,21.826175,31418820.0
4,2020-05-18,MAE 7d,11.342453,13.04592,9972771.0
5,2020-05-18,RMSE 7d,47.647272,54.913894,269119700.0
6,2020-05-18,RMSE 14d,93.978391,107.999367,6.637675e+18
7,2020-05-18,MAE 14d,21.006642,25.484661,2.459664e+17
8,2020-05-25,RMSE 7d,64.491719,87.607335,149496900.0
9,2020-05-25,RMSE 14d,120.872532,166.883936,2.908015e+18


In [10]:
# Only these two variants are plotted; 'No Cross' is left out of the chart.
models = ['Full', 'No Granger']
ms = mets.melt(id_vars=['Date', 'metric'], value_vars=models, var_name='model')
# metric -> (x factors, bar heights), filled in parallel below.
sources = {m: ([], []) for m in metrics}

for (date, model), vals in ms.groupby(by=['Date', 'model']):
    by_metric = vals.set_index('metric').round(2)
    factor = (date.strftime('%m/%d'), model)
    for m in metrics:
        factors, heights = sources[m]
        factors.append(factor)
        heights.append(by_metric.loc[m]['value'])

# grouped_bar_chart appends to (and returns) this global list.
ps = []
palette = palettes.RdYlBu[3]
#palette = ["#30a2da", "#fc4f30", "#e5ae38", "#6d904f", "#8b8b8b",]
for metric in sources:
    x, counts = sources[metric]
    ps = grouped_bar_chart(x, counts, palette, labels[metric], models)

# Show each metric's chart on its own rather than in a grid.
for plot in ps:
    show(plot)

## Ablation Features

In [11]:
# Feature-ablation runs, keyed by the feature that was removed. The job ids
# are still placeholders (a single empty string each).
_ablated_features = (
    'no_testing',
    'no_fb_mobility',
    'no_goog_mobility',
    'no_weather',
    'no_symptom_survey',
    'no_doctor_visits',
)
# The comprehension builds a fresh list per key, so entries can be filled in independently.
jobs = {feature: [""] for feature in _ablated_features}