### Compile Final Results

Given that we've selected the optimal $\sigma$, let's grab the precalculated posterior corresponding to that value of $\sigma$ for each state. Let's also calculate the 90% and 50% highest density intervals (this takes a little while) and also the most likely value.

In [1]:

def get_posteriors(sr, sigma=0.15):

    # (1) Calculate Lambda
    lam = sr[:-1].values * np.exp(GAMMA * (r_t_range[:, None] - 1))

    
    # (2) Calculate each day's likelihood
    likelihoods = pd.DataFrame(
        data = sps.poisson.pmf(sr[1:].values, lam),
        index = r_t_range,
        columns = sr.index[1:])
    
    # (3) Create the Gaussian Matrix
    process_matrix = sps.norm(loc=r_t_range,
                              scale=sigma
                             ).pdf(r_t_range[:, None]) 

    # (3a) Normalize all rows to sum to 1
    process_matrix /= process_matrix.sum(axis=0)
    
    # (4) Calculate the initial prior
    prior0 = sps.gamma(a=4).pdf(r_t_range)
    prior0 /= prior0.sum()

    # Create a DataFrame that will hold our posteriors for each day
    # Insert our prior as the first posterior.
    posteriors_df = pd.DataFrame(
        index=r_t_range,
        columns=sr.index,
        data={sr.index[0]: prior0}
    )
    
    # We said we'd keep track of the sum of the log of the probability
    # of the data for maximum likelihood calculation.
    log_likelihood = 0.0

    # (5) Iteratively apply Bayes' rule
    for previous_day, current_day in zip(sr.index[:-1], sr.index[1:]):

        #(5a) Calculate the new prior
        current_prior = process_matrix @ posteriors_df[previous_day]
        
        #(5b) Calculate the numerator of Bayes' Rule: P(k|R_t)P(R_t)
        numerator = likelihoods[current_day] * current_prior
        
        #(5c) Calcluate the denominator of Bayes' Rule P(k)
        denominator = np.sum(numerator)
        
        # Execute full Bayes' Rule
        posteriors_df[current_day] = numerator/denominator
        
        # Add to the running sum of log likelihoods
        log_likelihood += np.log(denominator)
    
    return posteriors_df, log_likelihood

In [2]:

def prepare_cases(cases, cutoff=25):
    new_cases = cases.diff()

    smoothed = new_cases.rolling(7,
        win_type='gaussian',
        min_periods=1,
        center=True).mean(std=2).round()
    
    idx_start = np.searchsorted(smoothed, cutoff)
    
    smoothed = smoothed.iloc[idx_start:]
    original = new_cases.loc[smoothed.index]
    
    return original, smoothed

In [4]:

import numpy as np
import pandas as pd

# We create an array for every possible value of Rt
R_T_MAX = 12
r_t_range = np.linspace(0, R_T_MAX, R_T_MAX*100+1)

# Gamma is 1/serial interval
# https://wwwnc.cdc.gov/eid/article/26/7/20-0282_article
# https://www.nejm.org/doi/full/10.1056/NEJMoa2001316
GAMMA = 1/7

FILTERED_REGION_CODES = ['AS', 'GU', 'PR', 'VI', 'MP']
url = 'https://covidtracking.com/api/v1/states/daily.csv'
states = pd.read_csv(url,
                     usecols=['date', 'state', 'positive'],
                     parse_dates=['date'],
                     index_col=['state', 'date'],
                     squeeze=True).sort_index()

In [5]:

from scipy import stats as sps
from IPython.display import clear_output

sigmas = np.linspace(1/20, 1, 20)
targets = ~states.index.get_level_values('state').isin(FILTERED_REGION_CODES)
states_to_process = states.loc[targets]
results = {}
for state_name, cases in states_to_process.groupby(level='state'):
    print(state_name)
    new, smoothed = prepare_cases(cases, cutoff=25)
    if len(smoothed) == 0:
        new, smoothed = prepare_cases(cases, cutoff=10)
    result_dict = {}
    
    # Holds all posteriors with every given value of sigma
    result_dict['posteriors'] = []
    
    # Holds the log likelihood across all k for each value of sigma
    result_dict['log_likelihoods'] = []
    
    for sigma in sigmas:
        posteriors, log_likelihood = get_posteriors(smoothed, sigma=sigma)
        result_dict['posteriors'].append(posteriors)
        result_dict['log_likelihoods'].append(log_likelihood)
    
    # Store all results keyed off of state name
    results[state_name] = result_dict
    clear_output(wait=True)

print('Done.')

Done.


In [6]:

# Each index of this array holds the total of the log likelihoods for
# the corresponding index of the sigmas array.
total_log_likelihoods = np.zeros_like(sigmas)

# Loop through each state's results and add the log likelihoods to the running total.
for state_name, result_series in results.items():
    total_log_likelihoods += result_series['log_likelihoods']

# Select the index with the largest log likelihood total
max_likelihood_index = total_log_likelihoods.argmax()

In [7]:

def highest_density_interval(pmf, p=.9):
    
    # If we pass a DataFrame, just call this recursively on the columns
    if(isinstance(pmf, pd.DataFrame)):
        
        return pd.DataFrame([highest_density_interval(pmf[col], p=p) for col in pmf],
                            index=pmf.columns)
    
    cumsum = np.cumsum(pmf.values)
    
    # N x N matrix of total probability mass for each low, high
    total_p = cumsum - cumsum[:, None]
    
    # Return all indices with total_p > p
    lows, highs = (total_p > p).nonzero()
    if (lows.size == 0) or (highs.size == 0):
    
        return pd.Series([np.nan, np.nan], index=[f'Low_{p*100:.0f}', f'High_{p*100:.0f}'], dtype='float64')
    
    else:
        
        # Find the smallest range (highest density)
        best = (highs - lows).argmin()

        low = pmf.index[lows[best]]
        high = pmf.index[highs[best]]

        return pd.Series([low, high],
                         index=[f'Low_{p*100:.0f}',
                                f'High_{p*100:.0f}'])

In [8]:

final_results = None

for state_name, result in results.items():
    print(state_name)
    posteriors = result['posteriors'][max_likelihood_index]
    hdis_90 = highest_density_interval(posteriors, p=.9)
    hdis_50 = highest_density_interval(posteriors, p=.5)
    most_likely = posteriors.idxmax().rename('ML')
    result_df = pd.concat([most_likely, hdis_90, hdis_50], axis=1)
    mask_series = result_df.Low_90.isnull() | result_df.High_90.isnull()
    result_df = result_df[~mask_series]
    if final_results is None:
        final_results = result_df
    else:
        final_results = pd.concat([final_results, result_df])
    clear_output(wait=True)

print('Done.')

Done.


In [11]:

%run ../../load_magic/storage.py
s = Storage()
states_stats_df = s.load_object('states_stats_df')
HERD_IMMUNITY_THRESHOLD = 0.6

In [12]:

final_results['DuFI'] = np.nan
for index_tuple, row_series in final_results.iterrows():
    state_name = index_tuple[0]
    state_mask_series = (states.index.get_level_values('state') == state_name)
    date_mask_series = (states.index.get_level_values('date') == index_tuple[1])
    mask_series = state_mask_series & date_mask_series
    if states[mask_series].shape[0] > 0:
        case_rate = int(states[mask_series].squeeze())
        date_mask_series = (states.index.get_level_values('date') <= index_tuple[1])
        mask_series = state_mask_series & date_mask_series
        total_cases = int(states[mask_series].cumsum()[-1])
        if state_name == 'DC':
            population = 601_723
        else:
            mask_series = (states_stats_df.State_Abbreviation == state_name)
            population = states_stats_df[mask_series].Census_Population_2010.squeeze()
        days_until_full_infection = int((population-total_cases)/case_rate * HERD_IMMUNITY_THRESHOLD)
        final_results.loc[index_tuple, 'DuFI'] = days_until_full_infection

In [13]:

s.store_objects(final_results=final_results)

Pickling to D:\Documents\Repositories\notebooks\covid19\saves\pickle\final_results.pickle


In [14]:

final_results

Unnamed: 0_level_0,Unnamed: 1_level_0,ML,Low_90,High_90,Low_50,High_50,DuFI
state,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AK,2020-03-25,3.00,0.86,6.84,1.86,4.27,10143.0
AK,2020-03-26,2.72,0.99,4.77,1.87,3.46,7220.0
AK,2020-03-27,2.39,0.90,3.98,1.66,2.95,6173.0
AK,2020-03-28,2.09,0.75,3.46,1.44,2.57,5010.0
AK,2020-03-29,1.90,0.62,3.14,1.35,2.39,4174.0
...,...,...,...,...,...,...,...
WY,2020-05-06,0.51,0.00,1.47,0.19,0.85,547.0
WY,2020-05-07,0.53,0.00,1.52,0.14,0.83,523.0
WY,2020-05-08,0.61,0.00,1.60,0.21,0.94,519.0
WY,2020-05-09,0.63,0.00,1.63,0.25,0.99,511.0
