# Delay distribution analysis

In [None]:
# standard
import pickle
from datetime import timedelta, date

# third party
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.dates import DateFormatter

# first party
from config import Config

In [None]:
# Directories the pickled delay distributions are read from: one for the
# naive estimates and one for the KM-adjusted estimates.
naive_dir, km_dir = (
    "../data/naive_delay_distributions",
    "../data/km_delay_distributions",
)

In [None]:
# Reference ("uncensored") delay distributions, indexed by date further down.
# NOTE: pickle.load can execute arbitrary code; only point this at files
# produced by this project.
with open(f'{naive_dir}/uncensored_delay_distribution.p', 'rb') as fh:
    truth_pmfs = pickle.load(fh)

In [None]:
# Build a long-format table of absolute errors between the reference delay
# distribution and the naive / KM-adjusted estimates, for every as-of date
# and every lag in Config.distribution_support.
all_errors = []
d = Config.max_delay_days
days_back = Config.distribution_support
as_of_range = [ts.date() for ts in pd.date_range(date(2020, 10, 1), date(2021, 2, 1))]


def _load_pickle(path):
    """Unpickle ``path`` and close the file handle afterwards."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Materialise the lags once so they can be walked for every as-of date.
lags = list(days_back)

# One (lag, as-of date, delay) cube per method; rows start as NaN and are
# overwritten below. Each PMF is expected to have d + 1 entries.
errors_by_method = {
    'KM-adjusted': np.full((len(lags), len(as_of_range), d + 1), np.nan),
    'Naive': np.full((len(lags), len(as_of_range), d + 1), np.nan),
}
pmf_dirs = {'KM-adjusted': km_dir, 'Naive': naive_dir}

# The as-of date is the outer loop so that each pickle is read exactly once
# instead of once per lag.
for i, as_of in enumerate(as_of_range):
    estimates = {
        method: _load_pickle(f'{directory}/delay_distribution_as_of_{as_of}.p')
        for method, directory in pmf_dirs.items()
    }
    for j, lag in enumerate(lags):
        onset_date = as_of - timedelta(int(lag))  # 0 for the date with the most truncation
        truth_pmf = truth_pmfs[onset_date]
        for method, pmfs in estimates.items():
            errors_by_method[method][j, i, :] = np.abs(truth_pmf - pmfs[onset_date])

# Reshape to long format: one row per (as-of date, delay_length, method, lag).
for j, lag in enumerate(lags):
    err_df = []
    for method, cube in errors_by_method.items():
        tmp = pd.DataFrame(cube[j], index=as_of_range).reset_index()
        tmp = tmp.melt(id_vars=['index'],
                       var_name='delay_length',
                       value_name='abs_err')
        tmp['method'] = method
        err_df.append(tmp)

    err_df = pd.concat(err_df, ignore_index=True)
    err_df['days_back'] = lag
    all_errors.append(err_df)

In [None]:
# Stack the per-lag frames into a single long table. This rebinds
# `all_errors` from a list of frames to one DataFrame.
all_errors = pd.concat(all_errors, ignore_index=True)

# Summing within each (lag, as-of date, method) group adds up abs_err across
# delay lengths; the non-key delay_length column is summed along with it.
grouped_errors = all_errors.groupby(['days_back', 'index', 'method'])
l1_summary = grouped_errors.sum().reset_index()

In [None]:
# L1 distance to the reference distribution as a function of lag, one line
# per method. There are several rows per (days_back, method), one per as-of
# date, which seaborn aggregates at each x value.
plt.figure(figsize=(5, 5))
sns.lineplot(data=l1_summary,
             x='days_back',
             y='abs_err',
             hue='method',
             hue_order=['Naive', 'KM-adjusted'],
             marker='.')
plt.xlabel('Days back from nowcast time')
# Raw string: '\e' is not a valid escape sequence in a normal string literal.
plt.ylabel(r'Mean $\ell_1$ distance')
plt.xticks([1, 10, 20, 30, 40])
plt.legend(title=None)
plt.tight_layout()
plt.savefig('./figures/delay_dist_error/l1_summary.pdf')
plt.show()

In [None]:
# Same L1-distance plot, restricted to lags below 12 days.
plt.figure(figsize=(5, 5))
sns.lineplot(data=l1_summary[l1_summary.days_back.lt(12)],
             x='days_back',
             y='abs_err',
             hue='method',
             hue_order=['Naive', 'KM-adjusted'],
             marker='.')
plt.xlabel('Days back from nowcast time')
# Raw string: '\e' is not a valid escape sequence in a normal string literal.
plt.ylabel(r'Mean $\ell_1$ distance')
plt.xticks(range(1, 12, 2))
plt.legend(title=None)
plt.tight_layout()
# Saving is left disabled: this path is the one the full-range figure is
# written to, so enabling it would overwrite that file.
# plt.savefig(f'./figures/delay_dist_error/l1_summary.pdf')
plt.show()

In [None]:
# Absolute error per delay length, one panel per lag (wrapped six panels
# per row) and one line per method. Rows with delay_length 0 are left out.
positive_delay_errors = all_errors[all_errors['delay_length'] > 0]
g = sns.FacetGrid(
    positive_delay_errors,
    col='days_back',
    col_wrap=6,
    hue='method',
    hue_order=['Naive', 'KM-adjusted'],
)
g.map(sns.lineplot, 'delay_length', 'abs_err', marker='.')
g.set_axis_labels(
    'Delay from onset to\nreport (days)',
    'Mean absolute error',
    clear_inner=False,
    size=14,
)
g.set_titles("{col_name} days back", size=14)
# Only the first panel carries the legend.
g.axes[0].legend(title=None, fontsize=14)
plt.tight_layout()
plt.savefig('./figures/delay_dist_error/delay_dist_error_all_lags.pdf')
plt.show()

In [None]:
# Overlay, for every as-of date in November 2020, the reference, naive and
# KM-adjusted delay distributions for that same date (lag 0), plus the
# pointwise median of each family.
as_of_range = [ts.date() for ts in pd.date_range(date(2020, 11, 1), date(2020, 12, 1))]


def _plot_from_delay_one(pmf_tail, **plot_kwargs):
    """Plot a PMF whose first entry is delay 1 against x = 1, 2, ...

    Without explicit x values matplotlib would start at x = 0 and every
    curve would sit one day to the left of the tick labels.
    """
    delays = np.arange(1, len(pmf_tail) + 1)
    plt.plot(delays, pmf_tail, **plot_kwargs)


plt.figure(figsize=(5, 5))
naive_pmfs = []
km_pmfs = []
ref_pmfs = []
for as_of in as_of_range:
    with open(f'{naive_dir}/delay_distribution_as_of_{as_of}.p', 'rb') as fh:
        naive = pickle.load(fh)[as_of]
    with open(f'{km_dir}/delay_distribution_as_of_{as_of}.p', 'rb') as fh:
        km = pickle.load(fh)[as_of]
    ref_pmf = truth_pmfs[as_of]

    # Drop the delay-0 entry so the curves start at a delay of one day.
    naive_pmfs.append(naive[1:])
    km_pmfs.append(km[1:])
    ref_pmfs.append(ref_pmf[1:])
    _plot_from_delay_one(ref_pmfs[-1], color='gray', alpha=0.3, lw=0.85)
    _plot_from_delay_one(naive_pmfs[-1], color='tab:blue', alpha=0.2, lw=0.85)
    _plot_from_delay_one(km_pmfs[-1], color='tab:orange', alpha=0.2, lw=0.85)

_plot_from_delay_one(np.median(ref_pmfs, axis=0),
                     label='Finalized', color='black', lw=2)
_plot_from_delay_one(np.median(naive_pmfs, axis=0), color='tab:blue',
                     label='Naive', lw=1.5)
_plot_from_delay_one(np.median(km_pmfs, axis=0), color='tab:orange',
                     label='KM-adjusted', lw=1.5)
plt.xlabel('Delay from onset to report (days)')
plt.ylabel('Density')
plt.xticks([1, 10, 20, 30, 40])
plt.legend()
plt.tight_layout()
plt.savefig('./figures/delay_dist_error/range_overlay.pdf')
plt.show()

## Plot quantiles

In [None]:
def get_quantile(delay_list, tau):
    """Return the smallest index at which the cumulative sum reaches ``tau``.

    Parameters
    ----------
    delay_list : array-like of float
        Probability masses; position ``i`` is treated as delay ``i``.
    tau : float
        Target cumulative probability.

    Returns
    -------
    numpy integer
        First index ``i`` with ``cumsum(delay_list)[i] >= tau``. If the
        cumulative sum never reaches ``tau`` (for example because the masses
        sum to slightly less than one), the last index is returned instead
        of the misleading 0 that ``argmax`` gives for an all-False mask.

    Raises
    ------
    ValueError
        If ``delay_list`` is empty.
    """
    reached = np.cumsum(delay_list) >= tau
    if reached.size == 0:
        raise ValueError("delay_list must contain at least one entry")
    if not reached.any():
        return np.intp(reached.size - 1)
    return np.argmax(reached)

# Median, 75th and 95th percentile of the reference delay distribution for
# each day from 2020-06-01 through 2021-06-01.
run_date_range = [ts.date() for ts in pd.date_range(date(2020, 6, 1), date(2021, 6, 1))]
q50, q75, q95 = [
    [get_quantile(truth_pmfs[day], level) for day in run_date_range]
    for level in (0.5, 0.75, 0.95)
]

In [None]:
# Time series of the three delay quantiles on one shared axis.
fig, ax = plt.subplots(nrows=1, figsize=(5, 5))
quantile_lines = (
    (q50, '$q_{50}$'),
    (q75, '$q_{75}$'),
    (q95, '$q_{95}$'),
)
for quantile_series, legend_label in quantile_lines:
    sns.lineplot(x=run_date_range, y=quantile_series, ax=ax, label=legend_label)
ax.legend()
ax.set_xlabel("Date")
ax.set_ylabel('Delay from onset to report (days)')
# At most four major ticks, labelled as year-month.
date_form = DateFormatter("%Y-%m")
ax.xaxis.set_major_locator(plt.MaxNLocator(4))
ax.xaxis.set_major_formatter(date_form)
plt.tight_layout()
plt.savefig('./figures/finalized_delay_quantiles.pdf')

In [None]:
# Reference delay distributions at three dates, stacked into one long frame.
# After reset_index the 'index' column holds each entry's position within
# its PMF; it is copied to 'days_back' and used as the x axis.
snapshot_dates = (date(2020, 7, 1), date(2020, 12, 1), date(2021, 5, 1))
df = pd.concat(
    [
        pd.DataFrame({'Density': truth_pmfs[day], 'Nowcast date': day})
        for day in snapshot_dates
    ]
).reset_index()
df['days_back'] = df['index']

fig, ax = plt.subplots(nrows=1, figsize=(5, 5))
# Position 0 is left out of the plot.
sns.lineplot(
    data=df[df['days_back'] >= 1],
    x='days_back',
    y='Density',
    hue='Nowcast date',
    ax=ax,
)
ax.set_xlabel('Delay from onset to report (days)')
ax.set_xticks([1, 10, 20, 30, 40])
plt.tight_layout()
plt.savefig('./figures/overlay_finalized_delay_dist.pdf')