# Generate naive delay distributions

The data were downloaded from the public CDC line list on September 9, 2021.

https://data.cdc.gov/Case-Surveillance/COVID-19-Case-Surveillance-Public-Use-Data/vbim-akqf


In [None]:
# standard
import pickle
from datetime import timedelta, date

# third party
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from pandas import read_csv, date_range
from tqdm.notebook import tqdm

# first party
from config import Config

## Read in linelist data.

In [None]:
# Load only the report-date and onset-date columns of the CDC line list,
# parsing both as datetimes.
surveil_df = read_csv(
    "../data/COVID-19_Case_Surveillance_Public_Use_Data_20210909.csv",
    usecols=["cdc_report_dt", "onset_dt"],
    parse_dates=["cdc_report_dt", "onset_dt"],
)

# Reduce both timestamp columns to plain `datetime.date` values.
surveil_df["onset_dt"] = surveil_df["onset_dt"].dt.date
surveil_df["cdc_report_dt"] = surveil_df["cdc_report_dt"].dt.date

# Keep rows that have an onset date, then drop rows whose report date is
# earlier than Config.first_data_date (our assumed first reliable day of data).
linelist = surveil_df.loc[surveil_df["onset_dt"].notna()]
linelist = linelist.loc[linelist["cdc_report_dt"] >= Config.first_data_date]

## Calculate reporting delays.

In [None]:
# Reporting delay in whole days between symptom onset and the CDC report date.
# `assign` builds a new frame, so the column is not written onto a filtered
# slice of `surveil_df` (which triggers pandas' SettingWithCopyWarning).
linelist = linelist.assign(
    report_delay=(linelist.cdc_report_dt - linelist.onset_dt).dt.days
)

# Keep strictly positive delays no longer than Config.max_delay_days.
linelist = linelist[linelist.report_delay.gt(0) & linelist.report_delay.le(Config.max_delay_days)]

In [None]:
# Directory that receives the pickled delay distributions.
storage_dir = "../data/naive_delay_distributions"

# Delay values over which the empirical and fitted pmfs are evaluated.
support = Config.distribution_support

# Maximum reporting delay in days; each fit uses a trailing window of
# twice that many onset dates.
d = Config.max_delay_days
window_size = d * 2

In [None]:
# For every as-of date, fit a gamma delay distribution for each of the most
# recent onset dates using only the reports available before that as-of date,
# then pickle the resulting {onset date: pmf} dict.
for as_of in tqdm(Config.as_of_range):
    # Working onset dates run from as_of - (max_delay_days + 1) through as_of.
    last_truncated = as_of - timedelta(Config.max_delay_days+1)
    truncated_dates = [ts.date() for ts in date_range(last_truncated, as_of)]
    # Only rows reported strictly before the as-of date are usable.
    fair_df = linelist[linelist.cdc_report_dt.lt(as_of)]

    delay_dists = {}
    for working_onset_date in truncated_dates:
        # Trailing window of `window_size` onset dates ending at the working date.
        min_date = working_onset_date - timedelta(window_size) + timedelta(1)
        max_date = working_onset_date
        delay_df = fair_df[fair_df.onset_dt.ge(min_date) & fair_df.onset_dt.le(max_date)]

        # Calculate empirical distribution
        # NOTE: if no rows fall in the window, emp_dist.sum() is 0 and every
        # value computed below becomes NaN.
        emp_dist = delay_df.groupby('report_delay').onset_dt.count()
        emp_dist = emp_dist.reindex(support, fill_value=0)
        emp_dist /= emp_dist.sum()

        # Method-of-moments gamma fit: shape = mu^2 / var, scale = var / mu.
        mu = (emp_dist*support).sum()
        var = (emp_dist*(support**2)).sum() - mu**2
        gam = stats.gamma(mu**2 / var, loc=0, scale=(var / mu))

        # Discretize the gamma onto `support` and renormalize to sum to 1.
        delay_dist = np.array([gam.cdf(i+1) - gam.cdf(i) for i in support])
        delay_dist /= delay_dist.sum()
        delay_dists[working_onset_date] = np.r_[0, delay_dist] # Add pr 0 at lag=0

    # Use a context manager so the file handle is closed deterministically.
    with open(f'{storage_dir}/delay_distribution_as_of_{as_of}.p', 'wb') as f:
        pickle.dump(delay_dists, f)

In [None]:
# Sanity check on the final iteration of the loop above: reload the pickle for
# the last `as_of`, then compare the empirical pmf with the fitted gamma pmf
# for the last `working_onset_date`. The plot uses the in-memory values.
with open(f'{storage_dir}/delay_distribution_as_of_{as_of}.p', 'rb') as f:
    a = pickle.load(f)
plt.plot(np.r_[0, emp_dist], label="empirical")
plt.plot(delay_dists[working_onset_date], label="gam from pmf")
plt.legend()
plt.show()

### Add extra past

- For each working date `s` that is more than `d` days old, every possible reporting date has already been observed, so there is no need to truncate by report date. Hence, we can simply take the rows whose symptom onset date falls in `[s - 2*d + 1, s]`. We first construct all of these pmfs and then use them to fill in the extra past for the training kernels.

In [None]:
# Accumulator for the pmfs fitted below, keyed by working date.
fully_observed_pmfs = dict()
# Restrict the line list to rows reported strictly before Config.ground_truth_date.
fair_df = linelist.loc[linelist.cdc_report_dt < Config.ground_truth_date]

In [None]:
# Fit one gamma delay pmf per working date from 2020-05-01 through
# Config.ground_truth_date, skipping dates that already have an entry.
for run_date in tqdm(date_range(date(2020, 5, 1), Config.ground_truth_date)):
    t = run_date.date()
    if t in fully_observed_pmfs:
        continue

    # Rows whose onset date lies in the trailing window [t - window_size + 1, t].
    min_date = t - timedelta(window_size - 1)
    max_date = t
    in_window = fair_df.onset_dt.between(min_date, max_date)
    delay_df = fair_df[in_window]
    # Every report in the window must fall within `d` days after t.
    assert delay_df.cdc_report_dt.max() <= t + timedelta(d)

    # Empirical pmf of the reporting delay over `support`.
    counts = delay_df.groupby('report_delay')['onset_dt'].count()
    emp_dist = counts.reindex(support, fill_value=0)
    emp_dist = emp_dist / emp_dist.sum()

    # Method-of-moments gamma fit: shape = mu^2 / var, scale = var / mu.
    mu = (emp_dist * support).sum()
    var = (emp_dist * (support ** 2)).sum() - mu ** 2
    shape, scale = mu ** 2 / var, var / mu
    gam = stats.gamma(shape, loc=0, scale=scale)

    # Discretize onto `support`, renormalize, and prepend probability 0 at lag 0.
    delay_dist = np.array([gam.cdf(k + 1) - gam.cdf(k) for k in support])
    delay_dist = delay_dist / delay_dist.sum()
    fully_observed_pmfs[t] = np.r_[0, delay_dist]

In [None]:
# Fill in delay distribution pickles
# Each as-of pickle initially holds only the most recent working dates; extend
# it backwards to `first_data_date` with the pmfs in `fully_observed_pmfs`.
first_data_date = date(2020, 5, 1)
for run_date in tqdm(Config.as_of_range):
    pickle_path = f'{storage_dir}/delay_distribution_as_of_{run_date}.p'
    try:
        with open(pickle_path, 'rb') as f:
            pmfs = pickle.load(f)
    except FileNotFoundError:
        # Only a genuinely absent file is reported as missing; any other
        # failure (e.g. a corrupt pickle) is allowed to surface.
        print(run_date, "missing")
        continue

    # Despite the name, this is the LAST date filled from the fully observed
    # pmfs: the day before the earliest working date already in the pickle.
    first_uncensored_date = run_date - timedelta(d+1) - timedelta(1)
    # NOTE(review): with `<=`, a run whose only date to fill is exactly
    # `first_data_date` is skipped, leaving that date out of the pickle;
    # `<` may have been intended -- confirm before changing.
    if first_uncensored_date <= first_data_date:
        print(run_date)
        continue

    uncensored_range = [ts.date() for ts in date_range(first_data_date, first_uncensored_date)]
    existing_working_dates = sorted(pmfs)
    # The pickle must end at its own as-of date.
    assert run_date == existing_working_dates[-1]

    for working_onset_date in uncensored_range:
        pmfs[working_onset_date] = fully_observed_pmfs[working_onset_date]

    # After filling, there is exactly one pmf per day in [first_data_date, run_date].
    assert len(pmfs) == (run_date - first_data_date).days + 1
    with open(pickle_path, 'wb') as f:
        pickle.dump(pmfs, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the fully observed pmfs; the context manager guarantees the file is
# flushed and closed.
with open(f'{storage_dir}/uncensored_delay_distribution.p', 'wb') as f:
    pickle.dump(fully_observed_pmfs, f)