In [1]:
%load_ext autoreload
%autoreload 2

#standard imports
import h5py
import numpy as np
import os
import pandas as pd
import pickle
import torch as th
import yaml

from datetime import timedelta
from pathlib import Path

# bokeh
from bokeh.io import output_notebook, export_png, export_svgs
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, Title, FactorRange, LinearAxis, Legend, Band, Range1d
from bokeh.palettes import Blues
from bokeh.plotting import figure, show, output_file, ColumnDataSource
from bokeh.transform import factor_cmap
from bokeh.sampledata.us_counties import data as counties
output_notebook()

# lib
import sys
sys.path.append('../')
from metrics import compute_metrics, _compute_metrics
from analysis import load_backfill, plot_cases

In [214]:
def aggregate_states(df):
    df = df.transpose()
    df["state"] = [r.split(', ')[-1] for r in df.index]
    df.reset_index(drop=True, inplace=True)
    df = df.groupby('state').sum()
    df = df.transpose()
    df.index.set_names(['date'], inplace=True)
    return df
        
def load_predictions(path):
    df = pd.read_csv(path, index_col='date', parse_dates=['date'])
    return df

def plot_metric(mets, counts, days, title, metric):
    p = figure(
        x_axis_type='datetime', 
        plot_height=350, 
        plot_width=450, 
        title=f"Forecast Quality {title}", 
        tools="save,hover",
        x_axis_label='Day', 
        y_axis_label=metric,
        tooltips=[("Model", "$name"), (metric, "$y")]
     )

    lines = []
    for (name, color, dash, df) in mets:
        l = p.line(x=days, y=df.loc[metric][days], line_width=3, color=color, line_dash=dash, name=name, legend_label=name)
        lines.append(l)
    counts = counts.loc[days].values
    source = ColumnDataSource({'day': days, 'counts': counts})
    p.extra_y_ranges = {"counts": Range1d(start=counts.min(), end=counts.max())}
    p.add_layout(LinearAxis(y_range_name="counts", axis_label='Deaths'), 'right')
    #print(counts.min(), counts.max())
    #p.line(x=days, y=counts, line_width=1, color="LightGray", line_alpha=0.2, y_range_name='counts')
    #band = Band(base='day', upper='count', source=source, level='underlay', fill_alpha=0.5, fill_color='LightGray', y_range_name='counts')
    #p.add_layout(band)
    p.y_range.renderers = lines

    p.legend.location = 'top_left'
    p.output_backend = 'svg'
    p.background_fill_color = 'white'
    p.border_fill_color = 'white'
    p.outline_line_color = 'white'
    p.title.text_font = 'Montserrat'
    p.title.text_font_style = 'normal'
    p.title.text_color = '#677b8c'

    return p

def metrics_for_date(fs, gt, date, model, state=None):
    df_other = load_predictions(f'/checkpoint/mattle/covid19/csvs/deaths/{model[0]}/counts_{date}.csv')#.iloc[1:]
    df_yyg = load_predictions(f'/checkpoint/mattle/covid19/csvs/deaths/yyg/counts_{date}.csv')#.iloc[1:]
    df_ar = aggregate_states(pd.read_csv(fs[date], index_col='date', parse_dates=['date']))
    if state is not None:
        df_other = df_other[state].to_frame()
        df_ar = df_ar[state].to_frame()
        df_yyg = df_yyg[state].to_frame()
        df_gt = df_gt[state].to_frame()

    met_ar =  _compute_metrics(gt, df_ar)
    met_naive = met_ar.copy()
    met_naive.loc['MAE'] = met_naive.loc['MAE_NAIVE']
    days = met_ar.columns
    mets = [
        (model[1], '#009ed7', 'solid', _compute_metrics(gt, df_other)),
        ('FAIR-AR', 'black', 'solid', met_ar),
        ('YYG', '#f29111', 'solid',  _compute_metrics(gt, df_yyg)),
        ('Naive', "#009ed7", 'dotted', met_naive),
    ]
    return mets, days

def plot_metric_for_dates(fs, gt, dates, other, model, metric='MAE', state=None):
    ps = []
    for date in dates:
        mets, days = metrics_for_date(fs, gt, date, (model, other), state)
        counts = gt.sum(axis=1)

        region = 'US' if state is None else state
        p = plot_metric(mets, counts, days, f'{region} {date}', metric)
        ps.append(p)
    return ps

### Progression in the US

In [208]:
# Load ground truth data
df_states = aggregate_states(pd.read_csv('../data/usa/data_deaths.csv', index_col='region').transpose())
df_states.index = pd.to_datetime(df_states.index)

# plot deaths over time 
print(df_states.shape)
p = plot_cases(df_states, "Morbidity per US State")
show(p)

(134, 51)


In [209]:
from scipy.fft import fft, ifft
#source = ColumnDataSource(df_states.iloc[60:])
p = figure(
    x_axis_type='datetime', 
    plot_height=350, 
    plot_width=500, 
    title=f"FFT per US State", 
    tools="save,hover",
    x_axis_label='Day', 
    y_axis_label='Deaths',
)
states = df_states.columns
#states = ['Hawaii']
states = ['New York', 'New Jersey', 'California']
for state in states:
    c = df_states[state]
    y = np.diff(c.values)
    p.line(x=c.index[1:], y=y, line_width=2, color='#009ed7')
p.output_backend = 'svg'
show(p)

### Load Backfill and Configs

In [279]:
#job = "us/2020_05_08_18_10"
#job = "us/2020_05_09_06_25"
# job = "us/2020_05_09_07_50"
#job = "us/2020_05_09_08_06"
#job = "us/2020_05_09_12_58"
#job = "us/2020_05_09_13_08"
#job = "us/2020_05_09_15_06"
#job = "us/2020_05_09_15_29"
#job = "us/2020_05_09_17_28"
#job = "us/2020_05_09_18_49"
#job = "us/2020_05_12_16_02"
#job = "us/2020_05_17_16_27" ## ok
job = "us/2020_05_19_20_28" ## ok
#job = "us/2020_05_23_08_43"
#job = "us/2020_05_25_19_41"
#job = "us/2020_05_25_20_08"
#job = "us/2020_05_25_20_43"
job = "us/2020_05_25_20_55"
#job = "us/2020_05_25_21_20"
#job = "us/2020_05_25_21_30"
job = "us/2020_05_30_18_35"
job = "us/2020_05_30_20_10"
job = "us/2020_05_30_20_48"
#job = "us/2020_05_30_21_21"
job = "us/2020_05_31_14_08"
job = "us/2020_06_01_20_15"
#job = "us/2020_06_02_17_00"
job = "us/2020_06_02_21_36"
job = "us/2020_06_03_13_08"
#job = "us/2020_06_03_13_52"
#job = "us/2020_06_03_14_46"
job = "us/2020_06_04_08_45"
job = "us/2020_06_05_15_06"
job = "us/2020_06_06_18_26"
job = ""

#fs, cfgs = load_backfill(job, model='ar', indicator="*_forecast.csv", forecast="final_model_best_mae_forecast.csv") 
fs, cfgs = load_backfill(job, model='car', forecast="../forecasts/forecast_best_rmse.csv") 
cfgs.drop(columns=['fdat', 'fpop', 'job'])

Unnamed: 0_level_0,blocks,kernel_size,layers,loss,lr,momentum,niters,t0,test_on,weight_decay
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-04-15,2,2,10,nb,0.001,0.9,30000,10,21,0.2
2020-04-19,3,2,10,nb,0.001,0.9,30000,10,21,0.3
2020-04-22,3,2,10,nb,0.001,0.9,30000,10,21,0.2
2020-04-26,3,2,10,nb,0.001,0.9,30000,10,21,0.2
2020-04-29,2,2,6,nb,0.001,0.9,30000,10,21,0.3
2020-05-03,3,2,6,nb,0.001,0.9,20000,10,21,0.1
2020-05-06,3,2,6,nb,0.001,0.9,20000,10,21,0.1
2020-05-10,2,2,6,nb,0.001,0.9,20000,10,21,0.1


### Los Alamos

Compare our forecasts to published data by [Los Alamos National Laboratory](https://covid-19.bsvgateway.org/)

In [280]:
dates_los_alamos = [
    #'2020-04-05', 
    #'2020-04-08', 
    #'2020-04-12',
    '2020-04-15',
    '2020-04-19',
    '2020-04-22',
    '2020-04-26', 
    '2020-04-29',
    '2020-05-03',
    '2020-05-06',
    '2020-05-10',
]
ps = plot_metric_for_dates(fs, df_states, dates_los_alamos, 'Los Alamos', 'los_alamos', 'MAE', None)
grid = gridplot(ps, ncols=2, plot_width=430)
show(grid)
_ = export_svgs(grid, filename='/tmp/lanl.svg')

In [51]:
ps = plot_metric_for_dates(fs, df_states, dates_los_alamos, 'Los Alamos', 'los_alamos', 'MAE', 'Hawaii')
show(ps[0])

UnboundLocalError: local variable 'df_gt' referenced before assignment

In [283]:
def plot_error(fs, gt, date, title, regions=None, height=400, width=600, backend='svg'):
    df_ar = aggregate_states(pd.read_csv(fs[date], index_col='date', parse_dates=['date']))
    ix = np.intersect1d(pd.to_datetime(df_ar.index), pd.to_datetime(gt.index))
    df_gt = gt.loc[ix]
    df_ar = df_ar.loc[ix]
    df = df_ar - df_gt
    source = ColumnDataSource(df)
    p = figure(
        x_axis_type="datetime",
        plot_height=height,
        plot_width=width,
        title=title,
        tools="save,hover",
        x_axis_label="Day",
        y_axis_label="Error",
        tooltips=[("State","$name"), ("Error", "$y")]
    )
    if regions is None:
        regions = df.columns
    for region in regions:
        p.line(x="date", y=region, source=source, line_width=2, color="#009ed7", alpha=0.5, name=region)
    p.output_backend = backend
    return p

p = plot_error(fs, df_states, '2020-05-03', 'Prediction Error - US States')
show(p)    

In [53]:
df_r = aggregate_states(pd.read_csv(fs['2020-05-03'], index_col='date', parse_dates=['date']))['New York']
pd.concat([df_r, df_states.loc[df_r.index]['New York']], axis=1)

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'

### Model

In [236]:
import importlib
from argparse import Namespace
def load_model(cfgs, date, model='ar'):
    mod = importlib.import_module(f'{model}')
    jobdir = cfgs['job'].loc[date]
    modcfg = yaml.load(open(f'{jobdir}/{model}.yml'), Loader=yaml.FullLoader)
    modelcv = mod.CV_CLS()
    modelcv.initialize(Namespace(**modcfg['train']))
    modelcv.func.load_state_dict(th.load(f'{jobdir}/{model}_model.bin'))
    return modelcv.func
model = load_model(cfgs, '2020-05-06')
x = np.arange(86) + 1
betas = model.beta(th.from_numpy(x).cuda()).detach().cpu().numpy()
h0 = model.beta.h0.detach().cpu().numpy()[0]

#regions = ['New York City, New York', 'Bergen, New Jersey', model.regions[0]]
regions = model.regions

p = figure(
    plot_height=400,
    plot_width=500,
    title='Beta',
    tools="save,hover",
    x_axis_label="Day",
    y_axis_label="Beta",
    tooltips=[("Region","$name"), ("Beta", "$y")]
)
for region in regions:
    ix = np.where(model.regions == region)[0]
    p.line(x=x, y=betas[ix].flatten(), line_width=1, color="#009ed7", name=region)
    # print(h0[ix])
p.output_backend = 'svg'
show(p)

Timeseries length 96
torch.Size([96, 1799, 6]) torch.Size([1799, 96])


In [166]:
for date in ['2020-04-26', '2020-05-03', '2020-05-06']:
    cfg = cfgs.loc[date]
    forecast = pd.read_csv(cfg['job'] + '/final_model_best_mae_forecast.csv', index_col='date')
    p = forecast.loc['2020-05-16'].sum()
    print(date, p, 88751 - p)

2020-04-26 77669.34785351716 11081.65214648284
2020-05-03 88658.62278424855 92.37721575144678
2020-05-06 89029.5102108107 -278.51021081069484


### IHME

In [166]:
dates_ihme = [
    "2020-04-15",
    "2020-04-19",
    '2020-04-26',
    '2020-05-01',
    '2020-05-10'
]
ps = plot_metric_for_dates(fs, df_states, dates_ihme, 'IHME', 'IHME')
grid = gridplot(ps, ncols=2, plot_width=430)
show(grid)
_ = export_svgs(grid, filename='/tmp/ihme.svg')

### Northeastern GLEAM

In [556]:
dates_gleam = [
    #'2020-04-12',
    #'2020-04-19', 
    '2020-04-27',
    #'2020-05-05',
]
p = plot_metric_for_dates(fs, dates_gleam, 'GLEAM', '../data/gleam/predictions_{}.csv', 'MAE')
show(p)
_ = export_png(p, filename='/tmp/gleam.png')

2020-04-28     21.156863
2020-04-29     43.372549
2020-04-30     61.980392
2020-05-01     80.078431
2020-05-02     95.921569
2020-05-03    106.549020
2020-05-04    113.156863
2020-05-05    136.843137
2020-05-06    145.705882
2020-05-07    167.647059
2020-05-08    186.156863
Name: MAE_NAIVE, dtype: float64
2020-04-28     14.625000
2020-04-29     28.500000
2020-04-30     38.062500
2020-05-01     52.395833
2020-05-02     65.833333
2020-05-03     76.791667
2020-05-04     85.645833
2020-05-05    100.583333
2020-05-06    103.770833
2020-05-07    120.291667
2020-05-08    137.416667
Name: MAE_NAIVE, dtype: float64


### MIT

In [573]:
dates_mit = [
    '2020-04-23',
    '2020-04-28', 
    '2020-05-01',
    #'2020-05-03',
]
p = plot_metric_for_dates(fs, dates_mit, 'MIT', '../data/mit/predictions_{}.csv', 'MAE')
show(p)
_ = export_png(p, filename='/tmp/mit.png')

2020-04-24    118.784314
2020-04-25    112.313725
2020-04-26     96.235294
2020-04-27     91.627451
2020-04-28     91.098039
2020-04-29     98.333333
2020-04-30    100.725490
2020-05-01     95.137255
2020-05-02     87.901961
2020-05-03     79.941176
2020-05-04     66.568627
2020-05-05     73.960784
2020-05-06     85.764706
2020-05-07     92.843137
2020-05-08    108.098039
Name: MAE, dtype: float64
2020-04-29    103.686275
2020-04-30    105.607843
2020-05-01    101.784314
2020-05-02     93.058824
2020-05-03     88.901961
2020-05-04     87.686275
2020-05-05     90.019608
2020-05-06    103.568627
2020-05-07    105.980392
2020-05-08    109.960784
Name: MAE, dtype: float64
2020-05-02    110.764706
2020-05-03    107.019608
2020-05-04    100.862745
2020-05-05    108.450980
2020-05-06    126.392157
2020-05-07    130.941176
2020-05-08    133.294118
Name: MAE, dtype: float64
