In [None]:
from simple_growth_rate_estimation import *
from linear_reg_nb import *

import pandas as pd
import numpy as np
from patsy import dmatrices
from ast import literal_eval
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import time
import pickle
import warnings
warnings.filterwarnings("ignore")
# warnings.resetwarnings()

# autoreload the imported modules
%load_ext autoreload
%autoreload 2
# put this in the cell: %reload_ext autoreload

# send notification once done running
import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)
# put this in the cell: %%notify

For US state level and county level

Try the simple estimation method:

For a curve,

1. cut it up manually on 2020-05-01 and 2020-09-01 (They seem to be good cut-off dates that separate the peaks)

2. take the increasing part in each segment (the orange)

3. run both OLS and negative binomial regression on each orange piece, and decide whether it’s linear or exponential based on likelihood

This is pretty much going back to the old method. Our problems with that method before were:

1. the estimated growth rates depend on the duration of the fitted segment, and the newer methods (recursive partitioning or dynamic programming) still don’t solve this
2. some growth phases are linear and some are exponential, and we were mixing them together; now we fit both OLS and negative binomial regressions and choose the more suitable one.

# Data processing

In [None]:
# county data: NYT cumulative cases/deaths, one row per county per day
county_cumsum = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")

# give every (state, county) pair a running integer id and attach it to each row
county_lst = county_cumsum.groupby(["state","county"]).size().reset_index().rename(columns={0:'county_idx'})
county_lst["county_idx"] = county_lst.index
county_cumsum = pd.merge(county_cumsum,county_lst, on=["state","county"])

counties = {}            # name -> raw daily-increment frame
counties_smoothed = {}   # name -> same frame with 7-day rolling means
arrival_by_county = {}   # name -> first date with a non-zero daily case count (NaN if none)

# one pass over the groups instead of re-filtering the full frame for every county
for i, temp in county_cumsum.groupby("county_idx"):
    name = county_lst["state"][i] + " " + county_lst["county"][i].lower()
    # work on an explicit copy so the column assignments below never touch a slice
    temp = temp.copy()
    temp["date"] = pd.to_datetime(temp['date'])
    temp = temp[temp["date"]<=pd.Timestamp("2020-12-31")].copy()
    temp["diff_time"] = (temp["date"]-pd.Timestamp("2020-01-01")).dt.days
    # daily increments from the cumulative counts; negative corrections are floored at 0
    temp["di_cases"] = temp["cases"].diff()
    temp["di_deaths"] = temp["deaths"].diff()
    temp.loc[temp["di_cases"]<0,"di_cases"] = 0
    temp.loc[temp["di_deaths"]<0,"di_deaths"] = 0
    # the first row has no increment (diff() yields NaN there)
    temp = temp[1:].reset_index(drop=True)
    # transform to 7-day average curve
    temp_smoothed = temp.copy()
    temp_smoothed["di_cases"] = temp_smoothed["di_cases"].rolling(7, min_periods=1).mean().round()
    temp_smoothed["di_deaths"] = temp_smoothed["di_deaths"].rolling(7, min_periods=1).mean().round()
    # arrival time: first day with a non-zero increment; some counties have none in the window
    infection_dates = temp.loc[temp["di_cases"]!=0,'date']
    if len(infection_dates) > 0:
        arrival_date = infection_dates.iloc[0]
    else:
        arrival_date = np.nan
    arrival_by_county[name] = arrival_date
    counties[name] = temp
    counties_smoothed[name] = temp_smoothed

# build the Series once instead of enlarging it label by label
arrival_dates = pd.Series(arrival_by_county)

In [None]:
# import state data
# https://github.com/nytimes/covid-19-data
state_cumsum = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")

states = {}            # name -> raw daily-increment frame, starting at the first day with cases
states_smoothed = {}   # name -> same frame with 7-day rolling means
arrival_by_state = {}  # name -> first date with a non-zero daily case count (NaN if none)

state_lst = pd.unique(state_cumsum["state"])
for i in range(len(state_lst)):
    name = state_lst[i]
    # territories are left out of the state-level analysis
    if name in ["Virgin Islands","Guam","Northern Mariana Islands","Puerto Rico"]:
        continue
    # explicit copy so the column assignments below never touch a slice of state_cumsum
    temp = state_cumsum[state_cumsum["state"]==name].copy()
    temp["date"] = pd.to_datetime(temp['date'])
    temp["diff_time"] = (temp["date"]-pd.Timestamp("2020-01-01")).dt.days
    # daily increments from the cumulative counts; negative corrections are floored at 0
    temp["di_cases"] = temp["cases"].diff()
    temp["di_deaths"] = temp["deaths"].diff()
    temp.loc[temp["di_cases"]<0,"di_cases"] = 0
    temp.loc[temp["di_deaths"]<0,"di_deaths"] = 0
    # remove first row because of the diff() function
    temp = temp[1:].reset_index(drop=True)
    # start from arrival date (case > 0)
    days_with_cases = temp.loc[temp["cases"]!=0,"diff_time"]
    if len(days_with_cases) == 0:
        # no recorded cases for this state: nothing to analyse
        continue
    # positional access: label 0 only exists when the very first row has cases
    temp = temp[temp["diff_time"] >= days_with_cases.iloc[0]]
    temp = temp.reset_index(drop=True)
    # transform to 7-day average curve
    temp_smoothed = temp.copy()
    temp_smoothed["di_cases"] = temp_smoothed["di_cases"].rolling(7, min_periods=1).mean().round()
    temp_smoothed["di_deaths"] = temp_smoothed["di_deaths"].rolling(7, min_periods=1).mean().round()
    # arrival time: first day with a non-zero daily increment
    infection_dates = temp.loc[temp["di_cases"]!=0,'date']
    if len(infection_dates) > 0:
        arrival_date = infection_dates.iloc[0]
    else:
        arrival_date = np.nan
    arrival_by_state[name] = arrival_date
    states[name] = temp
    states_smoothed[name] = temp_smoothed

# build the Series once instead of enlarging it label by label
arrival_dates_s = pd.Series(arrival_by_state)

In [None]:
# # plot the original curve
# for k,v in states.items():
#     plt.figure(figsize=[12,10])
#     plt.plot(v["date"],v["di_cases"],color="blue")
#     plt.axvline(pd.Timestamp("2020-05-01"),color="black",alpha=0.5)
#     plt.axvline(pd.Timestamp("2020-09-01"),color="black",alpha=0.5)
#     plt.title(k)
#     plt.show()
    
# # seems like these 2 dates are good cut_off for state data

In [None]:
# import nation data
nation = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv")
nation["date"] = pd.to_datetime(nation['date'])
# keep data only up to 2021-01-27; filtering on the date (rather than a fixed row
# count) stays correct if the upstream file gains or loses rows
nation = nation[nation["date"] <= pd.Timestamp("2021-01-27")].copy()
nation["diff_time"] = (nation["date"]-pd.Timestamp("2020-01-01")).dt.days
# daily increments from the cumulative counts; negative corrections are floored at 0
nation["di_cases"] = nation["cases"].diff()
nation["di_deaths"] = nation["deaths"].diff()
nation.loc[nation["di_cases"]<0,"di_cases"] = 0
nation.loc[nation["di_deaths"]<0,"di_deaths"] = 0
# remove first row because of the diff() function
nation = nation[1:].reset_index(drop=True)

In [None]:
# # plot nation data
# plt.figure(figsize=[12,10])
# plt.plot(nation["date"],nation["di_cases"],color="blue")
# plt.axvline(pd.Timestamp("2020-05-01"),color="black",alpha=0.5)
# plt.axvline(pd.Timestamp("2020-09-01"),color="black",alpha=0.5)
# plt.title("nation")
# plt.show()

## Cut curves into 3 parts

In [None]:
# county: split every curve at the two manual cut-off dates
# piece 1: before 2020-05-01, piece 2: 2020-05-01 .. 2020-08-31, piece 3: from 2020-09-01
first_cut = pd.Timestamp("2020-05-01")
second_cut = pd.Timestamp("2020-09-01")

counties_3parts = {}
for k,v in counties.items():
    counties_3parts[k] = {
        1: v[v["date"] < first_cut],
        2: v[(v["date"] >= first_cut) & (v["date"] < second_cut)],
        3: v[v["date"] >= second_cut],
    }

In [None]:
# cut the entire curve into 3 parts at the two manual cut-off dates
# piece 1: before 2020-05-01, piece 2: 2020-05-01 .. 2020-08-31, piece 3: from 2020-09-01
first_cut = pd.Timestamp("2020-05-01")
second_cut = pd.Timestamp("2020-09-01")

states_3parts = {}
for k,v in states.items():
    states_3parts[k] = {
        1: v[v["date"] < first_cut],
        2: v[(v["date"] >= first_cut) & (v["date"] < second_cut)],
        3: v[v["date"] >= second_cut],
    }

In [None]:
# Manual adjustments for two states whose default pieces end too late.

# Delaware, 2nd piece: the minimum occurs too late, so the piece is not detected;
# end it on 2020-08-01 to drop the valley around 09-01.
delaware_second = states_3parts["Delaware"][2]
states_3parts["Delaware"][2] = delaware_second[delaware_second["date"] < pd.Timestamp("2020-08-01")]

# North Dakota, 3rd piece: same problem; end the piece on 2020-12-01.
north_dakota_third = states_3parts["North Dakota"][3]
states_3parts["North Dakota"][3] = north_dakota_third[north_dakota_third["date"] < pd.Timestamp("2020-12-01")]


In [None]:
# nation - same cut-off dates as for states and counties
may_first = pd.Timestamp("2020-05-01")
sep_first = pd.Timestamp("2020-09-01")

nation_3parts = {
    1: nation[nation["date"] < may_first],
    2: nation[(nation["date"] >= may_first) & (nation["date"] < sep_first)],
    3: nation[nation["date"] >= sep_first],
}

## Shift curve for exp estimation

In [None]:
def curve_shift(curve, var, smoothing=True, n_day_avg=7, min_length=20):
    """
    For a given piece of curve, move the origin to the minimum point of the curve.

    The piece is cut so that it starts at the row where `var` (or its moving
    average) is smallest, and `var` is offset so that this starting row equals 1.
    Values that end up below zero are set to zero. The original piece (blue) and
    the shifted piece (orange) are plotted.

    input: curve - a single pd dataframe with a "date" column and the column `var`
           var - variable (column name in pd dataframe) to be regressed on. eg "di_cases","di_deaths","infections"
           smoothing - whether to smooth the curve with n-day moving average before finding the min and argmin
           n_day_avg - only applied when smoothing=True
           min_length - minimum number of rows the shifted piece must have
           * NOTE: the actual returned shifted curve (the one to be used in regression) is still the original raw data.
                   Only the location of the minimum comes from the smoothed series; the offset uses the raw value there.
    output: - `curve` itself, untouched, when it is empty (no figure is drawn)
            - None when the shifted piece has fewer than `min_length` rows
            - otherwise the shifted dataframe; it keeps the row labels of the re-indexed
              piece (so it starts at the argmin label) and, when smoothing=True,
              carries an extra "smoothed" column
    """
    # nothing to plot or shift: return before opening a figure that would never be shown
    if len(curve) == 0:
        return curve

    plt.figure(figsize=[12,10])
    plt.plot(curve["date"],curve[var],color="blue")

    # fresh frame with a 0..n-1 index so the positional argmin can be used as a label
    df = curve.reset_index(drop=True)

    # find argmin (on the smoothed series if requested)
    if smoothing:
        df["smoothed"] = df[var].rolling(n_day_avg, min_periods=1).mean().round()
        new_origin_x = df["smoothed"].argmin()
    else:
        new_origin_x = df[var].argmin()
    # the vertical offset is always taken from the raw data
    new_origin_y = df.loc[new_origin_x,var]

    # construct new df; copy so the assignments below do not write into a slice
    df = df.loc[new_origin_x:,:].copy()
    df[var] = df[var].add(-new_origin_y+1)

    # plot
    plt.plot(df["date"],df[var],color="orange")
    plt.axhline(y=0,color="black")
    plt.show()

    # if anything is below zero, then we make it zero
    df.loc[df[var]<0,var] = 0

    # if the curve is too short
    if len(df) < min_length:
        print("Resulted curve smaller than {} data points!".format(min_length))
        return None

    return df

In [None]:
# county
# for each piece, shift the curve so it starts from the min point
# NOTE(review): all three pieces are shifted here (range(1,4)), whereas the nation
# cell keeps piece 1 unshifted - confirm which treatment of piece 1 is intended.
new_counties_3parts = {}
for k,dic in counties_3parts.items():
    print(k)
    new_counties_3parts[k] = {}
    for i in range(1,4):
        new_counties_3parts[k][i] = curve_shift(dic[i],"di_cases")

# if the curve is too flat, we wouldn't want to estimate it.
# Can be tuned by min_infections:
#     1st - 100 min infections
#     2nd - larger
# can also set a threshold of standard deviation for a piece of curve

In [None]:
# state
# for each piece, shift the curve so it starts from the min point
# NOTE(review): all three pieces are shifted here (range(1,4)), whereas the nation
# cell keeps piece 1 unshifted - confirm which treatment of piece 1 is intended.
new_curves_3parts = {}
for k,dic in states_3parts.items():
    print(k)
    new_curves_3parts[k] = {}
    for i in range(1,4):
        new_curves_3parts[k][i] = curve_shift(dic[i],"di_cases")
        

In [None]:
# nation: the first piece is used as is, the 2nd and 3rd are shifted to their minimum
new_nation_3parts = {1: nation_3parts[1]}
for piece_no in (2, 3):
    new_nation_3parts[piece_no] = curve_shift(nation_3parts[piece_no],"di_cases")

# Estimate growth rate

In [None]:
# trial with one state

%reload_ext autoreload

exp_res = growth_rate(new_curves_3parts["Washington"][3],"di_cases")
# take only the part that results in a best exp model - from the start to the cut off
cut_off = exp_res["cut_off"]
best_exp_piece = new_curves_3parts["Washington"][3][new_curves_3parts["Washington"][3]["date"]<=cut_off]
# see if a linear OLS model can do better than the best exp model
lin_res = linear(best_exp_piece,"di_cases",False)
lin_res["cut_off"] = cut_off
if lin_res["max_ll"] > exp_res["max_ll"]:
    print("Tis piece is linear")
    lin_res["cut_off"] = cut_off


## OLS for linear reg.

In [None]:
# estimate growth rate for each state each part, OLS for linear regression
%reload_ext autoreload

growth_rates = {}
for k,dic in new_curves_3parts.items():
    print(k)
    
    plt.figure(figsize=[12,10])
    plt.plot(states[k]["date"],states[k]["di_cases"],color="blue",label="original data")
    
    for i in range(1,4):
        if new_curves_3parts[k][i] is None:
            continue
            
        # estimate the best-fit exponential model
        exp_res = growth_rate(new_curves_3parts[k][i],"di_cases")
        
        # take only the part that results in a best exp model - from the start to the cut off
        cut_off = exp_res["cut_off"]
        best_exp_piece = new_curves_3parts[k][i][new_curves_3parts[k][i]["date"] <= cut_off]
        
        # plot
        plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],new_curves_3parts[k][i]["di_cases"][new_curves_3parts[k][i]["date"] <= cut_off],color="orange",label="shifted data")
        
        # see if a linear OLS model can do better than the best exp model
        lin_res = linear_OLS(best_exp_piece,"di_cases",False)
        lin_res["cut_off"] = cut_off
        if lin_res["max_ll"] > exp_res["max_ll"]:
            print("This piece is linear:",k,i,"exp ll=",exp_res["max_ll"],"linear ll=",lin_res["max_ll"])
            
            # growth_rates[(k,i)] = lin_res
        else:
            print("This piece is exp:",k,i,"exp ll=",exp_res["max_ll"],"linear ll=",lin_res["max_ll"])
        growth_rates[(k,i)] = exp_res
        plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],lin_res["prediction"],color="red",label="linear prediction")
        plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],exp_res["prediction"],color="red",label="exp prediction")
        
    plt.legend()
    plt.show()

## Self-defined linear regression with NB distribution

GenericLikelihoodModel (statsmodels.base.model)

In [None]:
# estimate growth rate for each piece, self-defined linear_NBin regresison
# linear regression version for negative binomial distributed data
%reload_ext autoreload

growth_rates = {}
for k,dic in new_curves_3parts.items():
    print(k)
    
    plt.figure(figsize=[12,10])
    plt.plot(states[k]["date"],states[k]["di_cases"],color="blue",label="original data")
    
    for i in range(1,4):
        if new_curves_3parts[k][i] is None:
            continue
            
        # estimate the best-fit exponential model
        exp_res = growth_rate(new_curves_3parts[k][i],"di_cases")
        
        # take only the part that results in a best exp model - from the start to the cut off
        cut_off = exp_res["cut_off"]
        best_exp_piece = new_curves_3parts[k][i][new_curves_3parts[k][i]["date"] <= cut_off]
        
        # plot
        plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],new_curves_3parts[k][i]["di_cases"][new_curves_3parts[k][i]["date"] <= cut_off],color="orange",label="shifted data")
        
        # see if a linear OLS model can do better than the best exp model
        lin_res = linear_NB_selfdefined(best_exp_piece,"di_cases",False)
        lin_res["cut_off"] = cut_off
        if lin_res["max_ll"] > exp_res["max_ll"]:
            print("This piece is linear:",k,i,"exp ll=",exp_res["max_ll"],"linear ll=",lin_res["max_ll"])
            
            # growth_rates[(k,i)] = lin_res
            plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],lin_res["prediction"],color="red",label="linear prediction")
        
        else:
            print("This piece is exp:",k,i,"exp ll=",exp_res["max_ll"],"linear ll=",lin_res["max_ll"])
        growth_rates[(k,i)] = exp_res
        plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],exp_res["prediction"],color="red",label="exp prediction")
        
    plt.legend()
    plt.show()

## sm pre-defined linear regression with NB distribution

use sm.GLM(y,x,family = sm.families.NegativeBinomial(link = identity, alpha = alpha))

(the classic NB uses Log link)

this way we can self-input alpha

In [None]:
# estimate growth rate for each piece, self-defined linear_NBin regresison
# linear regression version for negative binomial distributed data
%reload_ext autoreload

growth_rates = {}
for k,dic in new_curves_3parts.items():
    print(k)
    
    plt.figure(figsize=[12,10])
    plt.plot(states[k]["date"],states[k]["di_cases"],color="blue",label="original data")
    
    for i in range(1,4):
        if new_curves_3parts[k][i] is None:
            continue
            
        # estimate the best-fit exponential model
        exp_res = growth_rate(new_curves_3parts[k][i],"di_cases")
        
        # take only the part that results in a best exp model - from the start to the cut off
        cut_off = exp_res["cut_off"]
        best_exp_piece = new_curves_3parts[k][i][new_curves_3parts[k][i]["date"] <= cut_off]
        
        # plot
        plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],new_curves_3parts[k][i]["di_cases"][new_curves_3parts[k][i]["date"] <= cut_off],color="orange",label="shifted data")
        
        # see if a linear OLS model can do better than the best exp model
        
        # do this only for 2nd and 3rd piece. we always want exp for the initial piece
        if i > 1:
            alpha = exp_res["alpha"]
            lin_res = linear_NB(best_exp_piece,"di_cases",alpha,False)
            lin_res["cut_off"] = cut_off
            if lin_res["max_ll"] > exp_res["max_ll"]:
                print("This piece is linear:",k,i,"exp ll=",exp_res["max_ll"],"linear ll=",lin_res["max_ll"])
                growth_rates[(k,i)] = lin_res
                plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],lin_res["prediction"],color="red",label="linear prediction")

            else:
                print("This piece is exp:",k,i,"exp ll=",exp_res["max_ll"],"linear ll=",lin_res["max_ll"])
                growth_rates[(k,i)] = exp_res
            plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],exp_res["prediction"],color="red",label="exp prediction")
        else:
            print("This piece is exp:",k,i,"exp ll=",exp_res["max_ll"])
            growth_rates[(k,i)] = exp_res
            plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],exp_res["prediction"],color="red",label="exp prediction")
    plt.legend()
    plt.show()

In [None]:
## DON'T use ##
# try to raise the treshold for switching to linear
%reload_ext autoreload

growth_rates = {}
for k,dic in new_curves_3parts.items():
    print(k)
    
    plt.figure(figsize=[12,10])
    plt.plot(states[k]["date"],states[k]["di_cases"],color="blue",label="original data")
    
    for i in range(1,4):
        if new_curves_3parts[k][i] is None:
            continue
            
        # estimate the best-fit exponential model
        exp_res = growth_rate(new_curves_3parts[k][i],"di_cases")
        
        # take only the part that results in a best exp model - from the start to the cut off
        cut_off = exp_res["cut_off"]
        best_exp_piece = new_curves_3parts[k][i][new_curves_3parts[k][i]["date"] <= cut_off]
        
        # plot
        plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],new_curves_3parts[k][i]["di_cases"][new_curves_3parts[k][i]["date"] <= cut_off],color="orange",label="shifted data")
        
        # see if a linear OLS model can do better than the best exp model
        
        # do this only for 2nd and 3rd piece. we always want exp for the initial piece
        if i > 1:
            alpha = exp_res["alpha"]
            lin_res = linear_NB(best_exp_piece,"di_cases",alpha,False)
            lin_res["cut_off"] = cut_off
            if lin_res["max_ll"]*1.01 > exp_res["max_ll"]: # 5% higher
                print("This piece is linear:",k,i,"exp ll=",exp_res["max_ll"],"linear ll=",lin_res["max_ll"])
                growth_rates[(k,i)] = lin_res
                plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],lin_res["prediction"],color="red",label="linear prediction")

            else:
                print("This piece is exp:",k,i,"exp ll=",exp_res["max_ll"],"linear ll=",lin_res["max_ll"])
                growth_rates[(k,i)] = exp_res
            plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],exp_res["prediction"],color="red",label="exp prediction")
        else:
            print("This piece is exp:",k,i,"exp ll=",exp_res["max_ll"])
            growth_rates[(k,i)] = exp_res
            plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],exp_res["prediction"],color="red",label="exp prediction")
    plt.legend()
    plt.show()

In [None]:
# save results
# NOTE: the file holds a pickle of `growth_rates`, despite the .csv extension
with open("simple_growth_rate_results_20210128.csv","wb") as results_file:
    pickle.dump(growth_rates,results_file)

In [None]:
# nation: exponential fit for every piece; for the 2nd and 3rd piece linear_NB
# (with the alpha from the exponential fit) competes on log-likelihood
plt.figure(figsize=[12,10])
plt.plot(nation["date"],nation["di_cases"],color="blue",label="original data")

nation_growth_rates = {}
for i in range(1,4):
    piece = new_nation_3parts[i]
    if piece is None:
        continue

    # best-fit exponential model; its cut-off marks the end of the fitted window
    exp_res = growth_rate(piece,"di_cases")
    cut_off = exp_res["cut_off"]
    in_window = piece["date"] <= cut_off
    best_exp_piece = piece[in_window]
    window_dates = piece["date"][in_window]

    # data actually used for the fit
    plt.plot(window_dates,
             piece["di_cases"][in_window],
             color="orange",label="data used for regression")

    if i == 1:
        # we always want exp for the initial piece
        print("This piece is exp:",i,"exp ll=",exp_res["max_ll"])
        nation_growth_rates[i] = exp_res
        exp_label = "fitting"
    else:
        alpha = exp_res["alpha"]
        lin_res = linear_NB(best_exp_piece,"di_cases",alpha,False)
        lin_res["cut_off"] = cut_off
        if lin_res["max_ll"] > exp_res["max_ll"]:
            print("This piece is linear:",i,"exp ll=",exp_res["max_ll"],"linear ll=",lin_res["max_ll"])
            nation_growth_rates[i] = lin_res
            plt.plot(window_dates,lin_res["prediction"],color="red",label="linear prediction")
        else:
            print("This piece is exp:",i,"exp ll=",exp_res["max_ll"],"linear ll=",lin_res["max_ll"])
            nation_growth_rates[i] = exp_res
        exp_label = "exp prediction"

    # the exponential fit is drawn for every piece
    plt.plot(window_dates,exp_res["prediction"],color="red",label=exp_label)
plt.legend()
plt.show()

## Only using exp NB reg - USE THIS ONE

In [None]:
# counties
%reload_ext autoreload

growth_rates_c = {}
for k,dic in new_counties_3parts.items():
    print(k)
    
    plt.figure(figsize=[12,10])
    plt.plot(counties[k]["date"],counties[k]["di_cases"],color="blue",label="original data")
    
    for i in range(1,4):
        if new_counties_3parts[k][i] is None:
            continue
        if len(new_counties_3parts[k][i]) == 0:
            continue
            
        # estimate the best-fit exponential model
        exp_res = growth_rate(new_counties_3parts[k][i],"di_cases",min_total_infected = i*20)
        if exp_res is None:
            continue
        
        # take only the part that results in a best exp model - from the start to the cut off
        cut_off = exp_res["cut_off"]
        best_exp_piece = new_counties_3parts[k][i][new_counties_3parts[k][i]["date"] <= cut_off]
        
        # plot
        plt.plot(new_counties_3parts[k][i]["date"][new_counties_3parts[k][i]["date"] <= cut_off],
                 new_counties_3parts[k][i]["di_cases"][new_counties_3parts[k][i]["date"] <= cut_off],
                 color="orange",label="shifted data")
        
        growth_rates_c[(k,i)] = exp_res
        plt.plot(new_counties_3parts[k][i]["date"][new_counties_3parts[k][i]["date"] <= cut_off],exp_res["prediction"],color="red",label="exp prediction")
    plt.legend()
    plt.show()

In [None]:
# states

%reload_ext autoreload

growth_rates = {}
for k,dic in new_curves_3parts.items():
    print(k)
    
    plt.figure(figsize=[12,10])
    plt.plot(states[k]["date"],states[k]["di_cases"],color="blue",label="original data")
    
    for i in range(1,4):
        if new_curves_3parts[k][i] is None:
            continue
            
        # estimate the best-fit exponential model
        exp_res = growth_rate(new_curves_3parts[k][i],"di_cases")
        
        # take only the part that results in a best exp model - from the start to the cut off
        cut_off = exp_res["cut_off"]
        best_exp_piece = new_curves_3parts[k][i][new_curves_3parts[k][i]["date"] <= cut_off]
        
        # plot
        plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],new_curves_3parts[k][i]["di_cases"][new_curves_3parts[k][i]["date"] <= cut_off],color="orange",label="shifted data")
        
        growth_rates[(k,i)] = exp_res
        plt.plot(new_curves_3parts[k][i]["date"][new_curves_3parts[k][i]["date"] <= cut_off],exp_res["prediction"],color="red",label="exp prediction")
    plt.legend()
    plt.show()

In [None]:
# nation
%reload_ext autoreload

plt.rcParams.update({'font.size': 16})

fig,ax = plt.subplots(figsize=[15,6])
l1 = ax.plot(nation["date"],nation["di_cases"],color="blue",label="original data")
nation_growth_rates = {}
for i in range(1,2):
    # estimate the best-fit exponential model
    exp_res = growth_rate(new_nation_3parts[i],"di_cases",min_total_infected = i*20)
    # take only the part that results in a best exp model - from the start to the cut off
    cut_off = exp_res["cut_off"]
    best_exp_piece = new_nation_3parts[i][new_nation_3parts[i]["date"] <= cut_off]

    # plot
    l2 = ax.plot(new_nation_3parts[i]["date"][new_nation_3parts[i]["date"] <= cut_off],
             new_nation_3parts[i]["di_cases"][new_nation_3parts[i]["date"] <= cut_off],
             color="orange",label="data for regression")
    
    nation_growth_rates[i] = exp_res
    l3 = ax.plot(new_nation_3parts[i]["date"][new_nation_3parts[i]["date"] <= cut_off],exp_res["prediction"],color="red",label="fitting")
# ax.legend()
ax.set_xlim((pd.Timestamp("2020-02-20"),pd.Timestamp("2021-01-01")))
ax.set_xticklabels(["Mar.","Apr.","May","Jun.","Jul.","Aug.","Sep.","Oct.","Nov.","Dec."])
ax.set_yticklabels([5,10,15,20,25,30])
ax.set_ylabel("Daily New Infections in USA ($10^4$)")
plt.savefig("demo.png",transparent=True)

In [None]:
# make the new results into a pd dataframe
# county: one row per county, one column per surge; cells stay NaN when no
# exponential estimate is available for that piece
exp_growth_rates_table = pd.DataFrame(index=list(new_counties_3parts.keys()),columns=["1st","2nd","3rd"])
piece_columns = {1: "1st", 2: "2nd", 3: "3rd"}
for k in new_counties_3parts.keys():
    for i, column in piece_columns.items():
        try:
            res = growth_rates_c[(k,i)]
            # the first piece is always taken; later pieces only when the model is exponential
            if i == 1 or res["model_type"] == "exponential":
                exp_growth_rates_table.loc[k,column] = res["growth_rate"]
        except KeyError:
            # no result stored for this piece (or an expected key is missing): leave NaN.
            # Only KeyError is tolerated so that any other failure still surfaces.
            continue
exp_growth_rates_table

In [None]:
exp_growth_rates_table.to_csv("exp_growth_rates_counties_20210407.csv")
# number of non-missing growth rates per surge (shown as the cell output)
exp_growth_rates_table.count()

In [None]:
# states
# make the new results into a pd dataframe: growth rate plus start/end date of the
# fitted window for each of the three pieces; cells stay NaN when a piece has no
# exponential estimate
exp_growth_rates_table = pd.DataFrame(index=list(new_curves_3parts.keys()),columns=["start_date_1","end_date_1","start_date_2","end_date_2","start_date_3","end_date_3","GR_1","GR_2","GR_3"])
for k in new_curves_3parts.keys():
    for i in (1, 2, 3):
        suffix = str(i)
        try:
            res = growth_rates[(k,i)]
            # the first piece is always taken; later pieces only when the model is exponential
            if i > 1 and res["model_type"] != "exponential":
                continue
            exp_growth_rates_table.loc[k,"GR_"+suffix] = res["growth_rate"]
            exp_growth_rates_table.loc[k,"start_date_"+suffix] = res["start_date"]
            exp_growth_rates_table.loc[k,"end_date_"+suffix] = res["cut_off"]
        except KeyError:
            # no result stored for this piece (or an expected key is missing): leave NaN.
            # Only KeyError is tolerated so that any other failure still surfaces.
            continue
exp_growth_rates_table

In [None]:
# save
# Writes whatever `exp_growth_rates_table` currently holds. Both the county table cell
# and the state table cell assign that name, so run this right after the state table cell.
exp_growth_rates_table.to_csv("exp_growth_rates_states_20210210.csv")

In [None]:
# save
# NOTE: the file holds a pickle of `nation_growth_rates`, despite the .csv extension
with open("exp_growth_rates_nation_20210422.csv","wb") as nation_file:
    pickle.dump(nation_growth_rates,nation_file)

# Compare with old method

In [None]:
# # old results
# s_i_1_old = pd.read_csv("s_i_1_growth_rates.csv")
# s_i_2_old = pd.read_csv("s_i_2_growth_rates.csv")
# s_i_1_old.set_index("Unnamed: 0",inplace=True)
# s_i_2_old.set_index("Unnamed: 0",inplace=True)

In [None]:
# # states
# # make the new results into a pd dataframe
# exp_growth_rates_s = pd.DataFrame(index=list(new_curves_3parts.keys()),columns=["old_1","old_2","new_1","new_2","new_3"])
# for k in new_curves_3parts.keys():
#     exp_growth_rates_s.loc[k,"old_1"] = s_i_1_old.loc[k,"Growth Rate"]
#     exp_growth_rates_s.loc[k,"new_1"] = growth_rates[(k,1)]["growth_rate"]
#     try:
#         exp_growth_rates_s.loc[k,"old_2"] = s_i_2_old.loc[k,"Growth Rate"]
#     except:
#         pass
#     try:
#         if growth_rates[(k,2)]["model_type"] == "exponential":
#             exp_growth_rates_s.loc[k,"new_2"] = growth_rates[(k,2)]["growth_rate"]
#     except:
#         pass
#     try:
#         if growth_rates[(k,3)]["model_type"] == "exponential":
#             exp_growth_rates_s.loc[k,"new_3"] = growth_rates[(k,3)]["growth_rate"]
#     except:
#         pass
# exp_growth_rates_s  

In [None]:
# # plot old and new estimations
# fig, ax = plt.subplots(figsize=[6.5*2.5,4.5*2.5])
# ax.scatter(exp_growth_rates_table["old_1"],exp_growth_rates_table["new_1"],color="green",label="1st surge")
# ax.scatter(exp_growth_rates_table["old_2"],exp_growth_rates_table["new_2"],color="red",label="2nd surge")
# for i, txt in enumerate(exp_growth_rates_table.index):
#     ax.annotate(txt, (exp_growth_rates_table["old_1"].values[i],exp_growth_rates_table["new_1"].values[i]),fontsize=16,color="green")
# for i, txt in enumerate(exp_growth_rates_table.index):
#     ax.annotate(txt, (exp_growth_rates_table["old_2"].values[i],exp_growth_rates_table["new_2"].values[i]),fontsize=16,color="red")
# plt.plot([0,0.1,0.2,0.4],[0,0.1,0.2,0.4],color="black",label="equal growth rate")
# plt.title("Compare old and new exp estimates",fontsize=24)
# plt.xlabel("Old method",fontsize=24)
# plt.ylabel("New method",fontsize=24)
# plt.legend(fontsize=16,loc="upper left")

# 1st vs 2nd vs 3rd surges

In [None]:
# read from record
GR_s = pd.read_csv("exp_growth_rates_states_20210210.csv",index_col = 0)
# Floor negative growth rates at 0. Only the GR_* columns are touched: the saved state
# table also carries start/end date columns, and comparing those strings with 0 raises.
gr_columns = ["GR_1","GR_2","GR_3"]
GR_s[gr_columns] = GR_s[gr_columns].clip(lower=0)

GR_c = pd.read_csv("exp_growth_rates_counties_20210407.csv",index_col=0)
# the county table only holds growth rates, so the whole frame can be floored (NaN stays NaN)
GR_c = GR_c.clip(lower=0)
GR_c

In [None]:
# reload the nation results written by the save cell
# NOTE: pickle.load executes arbitrary code from the file - only load files you created yourself
with open("exp_growth_rates_nation_20210422.csv","rb") as nation_file:
    nation_growth_rates = pickle.load(nation_file)

In [None]:
# display the nation-level results loaded from the pickle file in the previous cell
nation_growth_rates

In [None]:
# counties
# scatter plots of county growth rates: 1st vs 2nd surge (top), 2nd vs 3rd surge (bottom)
# The rates come from GR_c (the county table read back from disk): the name
# exp_growth_rates_table is reused by the state table cell, so by the time this cell
# runs it no longer has the "1st"/"2nd"/"3rd" columns.
plt.rcParams.update({'font.size': 22})

fig, (ax1,ax2) = plt.subplots(2,figsize=[15,20])

ax1.scatter(GR_c["1st"],GR_c["2nd"],color="blue")
ax2.scatter(GR_c["2nd"],GR_c["3rd"],color="orange")

# one title for the whole figure (plt.title would only label the last axes)
fig.suptitle("Compare exp growth rates between different surges",fontsize=24)
ax1.set_xlabel("1st wave exponential",fontsize=24)
ax1.set_ylabel("2nd wave exponential",fontsize=24)
ax2.set_xlabel("2nd wave exponential",fontsize=24)
ax2.set_ylabel("3rd wave exponential",fontsize=24)
for ax in (ax1,ax2):
    ax.set_ylim((0,0.3))
    ax.set_xlim((0,0.35))
    # diagonal: points on it have the same growth rate in both surges
    ax.plot([0,0.1,0.2,0.3],[0,0.1,0.2,0.3],color="black",label="equal growth rate")
    ax.legend()
plt.show()

In [None]:
# distribution of county-level growth rates: one filled KDE curve per surge
plt.rcParams.update({'font.size': 16})

fig, ax = plt.subplots(figsize=[5.3, 4])

surge_styles = [
    # (GR_c column, colour, legend label)
    ("1st", "blue", "first surge"),
    ("2nd", "orange", "second surge"),
    ("3rd", "green", "third surge"),
]
for column, color, surge_label in surge_styles:
    # sns.distplot is deprecated; with hist=False it only forwarded to kdeplot,
    # so call kdeplot directly (`fill` is the current name of `shade`).
    sns.kdeplot(x=GR_c[column], color=color, linewidth=2, fill=True,
                label=surge_label, ax=ax)

ax.legend(fontsize=20)
ax.set_xlabel("County Growth rates", fontname="Arial")
ax.set_ylabel(" ", fontname="Arial")
ax.set_xlim((-0.05, 0.6))
fig.tight_layout()

In [None]:
# state
# scatter plots comparing the growth rates of consecutive surges, one labelled point per row
# assumes `exp_growth_rates_table` (defined outside this section) holds the state-level
# estimates in columns "new_1"/"new_2"/"new_3" -- TODO confirm
fig, (ax1, ax2) = plt.subplots(2, figsize=[15, 20])

panels = [
    # (axes, x column, y column, x label, y label, scatter kwargs, annotation kwargs)
    (ax1, "new_1", "new_2", "1st wave exponential", "2nd wave exponential",
     {"color": "blue", "label": "1st vs 2nd"}, {"color": "blue"}),
    (ax2, "new_2", "new_3", "2nd wave exponential", "3rd wave exponential",
     {}, {}),
]
for ax, x_col, y_col, x_label, y_label, scatter_kws, text_kws in panels:
    xs = exp_growth_rates_table[x_col]
    ys = exp_growth_rates_table[y_col]
    ax.scatter(xs, ys, **scatter_kws)
    # write each row's index label next to its point
    for name, x, y in zip(exp_growth_rates_table.index, xs.values, ys.values):
        ax.annotate(name, (x, y), fontsize=16, **text_kws)
    # y = x reference line: a point above it has the larger rate in the later surge
    ax.plot([0, 0.3], [0, 0.3], color="black", label="equal growth rate")
    ax.set_xlabel(x_label, fontsize=24)
    ax.set_ylabel(y_label, fontsize=24)
    # plt.legend() only reaches the last axes, so request the legend per panel
    ax.legend()

# only the second panel is clipped to a fixed window
ax2.set_ylim((0, 0.3))
ax2.set_xlim((0, 0.35))

# plt.title() would title only the bottom panel; this is a heading for the whole figure
fig.suptitle("Compare exp growth rates between different surges", fontsize=24)
plt.show()

In [None]:
# distribution of state-level growth rates: one filled KDE curve per surge
fig, ax = plt.subplots(figsize=[8, 6])

surge_styles = [
    # (GR_s column, colour, legend label)
    ("GR_1", "blue", "first surge"),
    ("GR_2", "orange", "second surge"),
    ("GR_3", "green", "third surge"),
]
for column, color, surge_label in surge_styles:
    # sns.distplot is deprecated; with hist=False it only forwarded to kdeplot,
    # so call kdeplot directly (`fill` is the current name of `shade`).
    sns.kdeplot(x=GR_s[column], color=color, linewidth=2, fill=True,
                label=surge_label, ax=ax)

# plt.legend(fontsize=20)
ax.set_xlabel("State Growth rates", fontname="Arial", fontsize=20)
ax.set_ylabel("PDF", fontname="Arial", fontsize=20)
ax.set_xlim((-0.05, 0.6))
ax.tick_params(axis="both", labelsize=20)
fig.tight_layout()

# Spatial Aggregation

In [None]:
# One figure per surge: county and state growth-rate densities, with the national
# growth rate drawn as a vertical line.
surge_panels = [
    # (key in nation_growth_rates, GR_c column, GR_s column, county colour, state colour, x label)
    (1, "1st", "GR_1", "blue", "purple", "First Surge Growth Rate"),
    (2, "2nd", "GR_2", "orange", "orangered", "Second Surge Growth Rate"),
    (3, "3rd", "GR_3", "darkgreen", "goldenrod", "Third Surge Growth Rate"),
]

for surge_no, county_col, state_col, county_color, state_color, x_label in surge_panels:
    fig, ax = plt.subplots(figsize=[6.5, 6.5])
    # sns.distplot is deprecated; with hist=False it only forwarded to kdeplot,
    # so call kdeplot directly (`fill` is the current name of `shade`).
    sns.kdeplot(x=GR_c[county_col], color=county_color, linewidth=2, fill=True,
                label="counties", ax=ax)
    sns.kdeplot(x=GR_s[state_col], color=state_color, linewidth=2, fill=True,
                label="states", ax=ax)
    ax.axvline(nation_growth_rates[surge_no]["growth_rate"], color="black", label="nation")
    ax.set_xlabel(x_label, fontname="Arial", fontsize=20)
    ax.set_ylabel("KDE", fontname="Arial", fontsize=20)
    ax.tick_params(axis="both", labelsize=20)
    ax.legend(fontsize=20)
    fig.tight_layout()


In [None]:
# Three stacked panels (one per surge) sharing the same x window: county and state
# growth-rate densities plus the national growth rate as a vertical line.
fig, axs = plt.subplots(3, 1)
fig.set_size_inches(8, 6)

# The two upper panels are drawn without any axes; the bottom one keeps only its x axis.
axs[0].axis("off")
axs[1].axis("off")
for side in ('right', 'top', 'left'):
    axs[2].spines[side].set_visible(False)
axs[2].yaxis.set_ticks_position('none')
axs[2].xaxis.set_ticks_position('bottom')
axs[2].get_yaxis().set_ticks([])

panels = [
    # (key in nation_growth_rates, GR_c column, GR_s column, line width)
    (1, "1st", "GR_1", 2),
    (2, "2nd", "GR_2", 2),
    (3, "3rd", "GR_3", 1.5),
]
for ax, (surge_no, county_col, state_col, line_width) in zip(axs, panels):
    # GR_s still carries its "GR_*" column names here (as in the density cells above);
    # the "1st"/"2nd"/"3rd" names are only assigned by a later cell, so using them
    # raised a KeyError when the notebook was run top to bottom.
    # sns.distplot is deprecated; with hist=False it only forwarded to kdeplot.
    sns.kdeplot(x=GR_c[county_col], color="steelblue", linewidth=line_width, fill=True,
                label="counties", ax=ax)
    sns.kdeplot(x=GR_s[state_col], color="goldenrod", linewidth=line_width, fill=True,
                label="states", ax=ax)
    ax.axvline(nation_growth_rates[surge_no]["growth_rate"], color="black", label="nation")
    ax.set_xlim((-0.1, 0.45))

# single legend for the whole figure, placed to the right of the bottom panel
axs[2].legend(fontsize=20, loc=(1.1, 0.5))

In [None]:
# ridge plot

# Synthetic example data: 500 standard-normal draws, cycled through the groups
# "A".."J", each draw shifted by the character code of its group letter.
rs = np.random.RandomState(1979)
x = rs.randn(500)
g = np.tile(list("ABCDEFGHIJ"), 50)
df = pd.DataFrame({"x": x, "g": g})
m = df["g"].map(ord)
df["x"] = df["x"] + m

# One thin, wide facet row per group, coloured from a 10-step cubehelix palette
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(df, row="g", hue="g", palette=pal, height=.5, aspect=15)

# Filled density curve for "x" in every facet
kde_style = dict(bw_adjust=.5, clip_on=False, fill=True, alpha=1, linewidth=1.5)
g.map(sns.kdeplot, "x", **kde_style)
# g.map(sns.kdeplot, "x", clip_on=False, color="w", lw=2, bw_adjust=.5)
# g.map(plt.axhline, y=0, lw=2, clip_on=False)


# Helper handed to FacetGrid.map to label each facet in axes coordinates
def label(x, color, label):
    """Write ``label`` in bold at the lower-left of the current axes.

    ``x`` is accepted but not used; ``color`` is applied to the text.
    The position (0, 0.2) is expressed in axes coordinates.
    """
    current_ax = plt.gca()
    text_style = dict(fontweight="bold", color=color, ha="left", va="center")
    current_ax.text(0, .2, label, transform=current_ax.transAxes, **text_style)


# Label every facet with its group name
g.map(label, "x")

# Negative vertical spacing makes neighbouring facets overlap
g.fig.subplots_adjust(hspace=-.25)

# Strip the axes details that don't play well with overlap: titles, y ticks, spines
g.set_titles("").set(yticks=[]).despine(bottom=True, left=True)

In [None]:
# NOTE(review): the returned "tips" example dataset is neither assigned nor used in the
# visible part of the notebook -- looks like leftover scratch; confirm before removing.
sns.load_dataset("tips")

In [None]:
# Long-format table of growth rates: one row per (region, surge), tagged with its
# aggregation level ("county" or "state"), for use with seaborn's FacetGrid.
surge_cols = ["1st", "2nd", "3rd"]

GR_c_melt = (
    GR_c.melt(value_vars=surge_cols, var_name="surge", value_name="GR")
    .assign(level="county")
)

# Rename by label rather than by position: this is a no-op when the cell is re-run and
# does not break if the state file carries columns besides GR_1..GR_3.
# GR_s is rebound so that later cells still see the "1st"/"2nd"/"3rd" names.
GR_s = GR_s.rename(columns={"GR_1": "1st", "GR_2": "2nd", "GR_3": "3rd"})
GR_s_melt = (
    GR_s.melt(value_vars=surge_cols, var_name="surge", value_name="GR")
    .assign(level="state")
)

# DataFrame.append was removed in pandas 2.0; concat with ignore_index=True gives the
# same rows with a fresh 0..n-1 index.
GR_melt = pd.concat([GR_c_melt, GR_s_melt], ignore_index=True)
GR_melt

In [None]:
# One facet row per surge, with the county and state densities overlaid (hue = level).
# FacetGrid creates its own figure, so no separate plt.figure() call is made here
# (it only produced an extra empty figure).
pal = sns.color_palette("pastel")
g = sns.FacetGrid(GR_melt, row="surge", hue="level", aspect=3, height=2.5, palette=pal)

# Filled density of the growth rates in every facet, plus a baseline at y = 0
g.map(sns.kdeplot, "GR",
      bw_adjust=.5, clip_on=False,
      fill=True, alpha=0.5, linewidth=1.5)
g.map(plt.axhline, y=0, lw=2, clip_on=False)

axes = g.axes.flatten()

# Per facet: a caption naming the surge and a vertical line at the national growth rate.
# Facet i (0-based) is paired with nation_growth_rates[i + 1].
surge_captions = [(0.3, "first surge"), (0.28, "second surge"), (0.3, "third surge")]
for surge_no, (ax, (text_x, caption)) in enumerate(zip(axes, surge_captions), start=1):
    ax.text(text_x, 20, caption)
    ax.axvline(nation_growth_rates[surge_no]["growth_rate"], color="black", label="nation")

# Stack the facets with no vertical gap
g.fig.subplots_adjust(hspace=0)

# Remove axes details that don't play well with the stacked layout
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)
g.set(xlim=(0, 0.34))  # the closing parenthesis was missing here (SyntaxError)
g.set(xlabel="Infection Growth Rate")
# g.set(ylabel="PDF")
#g.add_legend()

# Spearman Correlation

In [None]:
# read in growth rates data
# The state file stores the rates as GR_1..GR_3; keep those three columns and give
# them the "1st"/"2nd"/"3rd" names used for the other tables in this section.
i_s = (
    pd.read_csv("exp_growth_rates_states_20210210.csv", index_col=0)[["GR_1", 'GR_2', 'GR_3']]
    .rename(columns={"GR_1": "1st", "GR_2": "2nd", "GR_3": "3rd"})
)
i_c = pd.read_csv("exp_growth_rates_counties_20210407.csv", index_col=0)

# The national record is a pickle even though the file name ends in ".csv".
# NOTE: pickle can execute arbitrary code while loading -- only open files written by this project.
with open("exp_growth_rates_nation_20210422.csv", "rb") as f:
    i_n_raw = pickle.load(f)

# reformat: copy the growth rate stored under keys 1..3 into the 'gr' row of a
# surge-labelled table; the 'std_err' row is created but not filled in here.
i_n = pd.DataFrame(index=["gr", 'std_err'], columns=["1st", "2nd", "3rd"])
for surge_no, surge in enumerate(i_n.columns, start=1):
    i_n.loc['gr', surge] = i_n_raw[surge_no]["growth_rate"]

In [None]:
# Death growth rates at state, county and national level.
d_s = pd.read_csv("exp_growth_rates_deaths_states_20210504.csv", index_col=0)
d_c = pd.read_csv("exp_growth_rates_deaths_counties_20210504.csv", index_col=0)

# The national record is a pickle even though the file name ends in ".csv".
# NOTE: pickle can execute arbitrary code while loading -- only open files written by this project.
with open("exp_growth_rates_deaths_nation_20210504.csv", "rb") as f:
    d_n_raw = pickle.load(f)

# reformat: same layout as the infection table i_n (rows 'gr' and 'std_err').
# The frame used to be created with index ["nation"] while the values were written
# to row 'gr', which left a spurious all-NaN "nation" row behind.
d_n = pd.DataFrame(index=["gr", 'std_err'], columns=["1st", "2nd", "3rd"])
for surge_no, surge in enumerate(d_n.columns, start=1):
    d_n.loc['gr', surge] = d_n_raw[surge_no]["growth_rate"]

In [None]:
# population density (population estimate / land area) per county and per state
popu = pd.read_csv('all_county_census_MSA_full.csv')[['FIPS', 'POPESTIMATE2015', "STNAME", "CTYNAME"]]
land_area = pd.read_excel('LND01.xls')
popu_den = popu.merge(land_area, left_on='FIPS', right_on='STCOU', how="inner")

# County key "<state> <lower-cased county name>".
# Keeping only the first word of CTYNAME (the previous behaviour) collapsed every
# multi-word name ("Los Angeles County" -> "los", all "St. ..." counties -> "st."),
# so those rows collided and could not match keys built from the full county name.
# Only a trailing "county"/"parish" designation is removed now.
# NOTE(review): assumes the case-data county names omit exactly these suffixes -- confirm
# against the keys of the growth-rate tables.
county_name = (
    popu_den['CTYNAME'].str.lower()
    .str.replace(r"\s+(?:county|parish)$", "", regex=True)
)
popu_den['county'] = popu_den['STNAME'] + " " + county_name
popu_den['popu_den'] = popu_den['POPESTIMATE2015'] / popu_den['Land2010']

# State density is computed from state totals, not as a mean of county densities
popu_den_states = popu_den.groupby('STNAME')[['POPESTIMATE2015', 'Land2010']].sum()
popu_den_states['popu_den'] = popu_den_states['POPESTIMATE2015'] / popu_den_states['Land2010']

popu_den = popu_den[['county', 'STNAME', 'popu_den']]
popu_den

In [None]:
# Show the state-level population-density table.
popu_den_states

In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation between population density and every column of the
# county-level infection / death tables (NaN pairs are omitted).
# i_c = i_c.merge(popu_den,left_index=True,right_on='county',how="inner")
# d_c = d_c.merge(popu_den,left_index=True,right_on='county',how="inner")
# NOTE(review): with the two merges above commented out, this cell only runs when
# `i_c` / `d_c` already carry a 'popu_den' column -- confirm where it gets attached.

# DataFrame.iteritems() was removed in pandas 2.0; .items() is the equivalent iterator.
for k, i in i_c.items():
    print(k)
    print("Infections:", spearmanr(i.values, i_c['popu_den'].values, nan_policy='omit'))

for k, d in d_c.items():
    print(k)  # name the column, as the other correlation loops do
    print("Deaths", spearmanr(d.values, d_c['popu_den'].values, nan_policy='omit'))


In [None]:
# Spearman rank correlation between state population density and every column of the
# state-level infection / death tables (NaN pairs are omitted).
# Attach the density columns only once: merging again on a re-run would create
# 'popu_den_x' / 'popu_den_y' and break the 'popu_den' lookups below.
if 'popu_den' not in i_s.columns:
    i_s = i_s.merge(popu_den_states, left_index=True, right_index=True, how='inner')
if 'popu_den' not in d_s.columns:
    d_s = d_s.merge(popu_den_states, left_index=True, right_index=True, how='inner')

# DataFrame.iteritems() was removed in pandas 2.0; .items() is the equivalent iterator.
for k, i in i_s.items():
    print(k)
    print("Infections:", spearmanr(i.values, i_s['popu_den'].values, nan_policy='omit'))

for k, d in d_s.items():
    print(k)
    print("Deaths", spearmanr(d.values, d_s['popu_den'].values, nan_policy='omit'))

In [None]:
# arrival time first surge only
# Spearman rank correlation between first-surge growth rate and arrival time.

def _arrival_days(dates):
    """Return ``dates`` as a Series named 'arrival' holding days since 2020-01-01.

    Numeric input is treated as already converted, so running the cell again does
    not convert twice; anything else is parsed as dates first.
    """
    if not pd.api.types.is_numeric_dtype(dates):
        dates = (pd.to_datetime(dates) - pd.Timestamp('2020-01-01')).dt.days
    return dates.rename('arrival')


def _with_arrival(rates, arrival):
    """Inner-join ``arrival`` onto ``rates`` by index, replacing any earlier 'arrival' column."""
    return (
        rates.drop(columns='arrival', errors='ignore')
        .merge(arrival, left_index=True, right_index=True, how='inner')
    )


# The conversion used to live in commented-out lines that had to be run by hand once
# (and only once); doing it here keeps the cell runnable on a fresh kernel.
# `arrival_dates` / `arrival_dates_s` themselves are left untouched.
arrival_c = _arrival_days(arrival_dates)
arrival_s = _arrival_days(arrival_dates_s)

i_c = _with_arrival(i_c, arrival_c)
d_c = _with_arrival(d_c, arrival_c)
i_s = _with_arrival(i_s, arrival_s)
d_s = _with_arrival(d_s, arrival_s)

print("County, Infections:", spearmanr(i_c['1st'].values, i_c['arrival'].values, nan_policy='omit'))
print("County, Deaths", spearmanr(d_c['1st'].values, d_c['arrival'].values, nan_policy='omit'))
print("State, Infections:", spearmanr(i_s['1st'].values, i_s['arrival'].values, nan_policy='omit'))
print("State, Deaths", spearmanr(d_s['1st'].values, d_s['arrival'].values, nan_policy='omit'))


# Duration of surges

In [None]:
# Reload the state table with its original columns (no renaming or merging applied).
i_s = pd.read_csv("exp_growth_rates_states_20210210.csv", index_col=0)

# Binary pickle stored under a ".csv" name; i_n is rebound to the raw unpickled object.
# NOTE: pickle can execute arbitrary code while loading -- only open files written by this project.
with open("exp_growth_rates_nation_20210422.csv", "rb") as nation_file:
    i_n = pickle.load(nation_file)
