Figure 2: LA neighborhood growth rate

In [None]:
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid.inset_locator import (inset_axes, InsetPosition, mark_inset)
from datetime import timedelta,datetime
import random
random.seed(3)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Data import

In [None]:
# Import data for all LA neighborhoods (223 in total); infection counts are cumulative.
# Every entry of `cum` / `di` is a list:
#   [series, arrival_date, tot_infected, popu, popu_den]
cum = {} # cumulative
di = {} # daily increase
cnt = 0
neighborhoods = []
for filename in os.listdir("LA_neighborhood_infections"):
    # skip hidden files (names starting with a dot)
    if filename.startswith("."):
        continue
    cnt += 1

    temp = pd.read_csv(os.path.join("LA_neighborhood_infections",filename))
    # set date as index
    temp["date"] = pd.to_datetime(temp["date"])
    temp = temp.set_index("date")
    # arrival date: first date whose cumulative count is non-zero
    infection_dates = temp.index[temp["infections"]!=0]
    if len(infection_dates) > 0: # some neighborhoods never had infections up to 04/27
        arrival_date = infection_dates[0]
    else:
        arrival_date = pd.to_datetime('2020-04-28')
    # calculate daily increase from cumulative; the first diff is NaN and is dropped
    infections = temp["infections"]
    infections_di = infections.diff().iloc[1:] # infections_daily increase, returns a pd series
    # population and population density
    # (.iloc because the index holds dates, so integer positions must be explicit)
    popu = temp["popu_tot"].iloc[0]
    area = temp["sqmi"] * 2.59 # convert sqmi to km^2 (one value per row, so popu_den is a Series)
    popu_den = popu / area
    # total infected by the last date
    tot_infected = infections.iloc[-1]
    # neighborhood name is the part of the file name before the first underscore
    name = filename.split('_')[0]
    di[name] = [infections_di,arrival_date,tot_infected,popu,popu_den]
    cum[name] = [infections,arrival_date,tot_infected,popu,popu_den]
    neighborhoods.append(name)
print(cnt)

## new data source

In [None]:
# New data source: LA Times place-level totals, restricted to Los Angeles county.
la_raw = pd.read_csv('https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv')
# Take an explicit copy so the edits below never write into a slice of la_raw.
la = la_raw[la_raw['county']=='Los Angeles'].copy()
la = la.drop(columns=["id",'county','fips','note','population'])
# Every place whose name contains 'Long Beach' / 'Pasadena' is relabelled to that single name.
la.loc[la['name'].str.contains('Long Beach'),'name'] = 'Long Beach'
la.loc[la['name'].str.contains('Pasadena'),'name'] = 'Pasadena'
la = la.rename(columns={'confirmed_cases':'infections'})
la

In [None]:
# Rebuild `di` from the new data source: di[name] is the daily-increase Series of one place.
di = {} # daily increase
neighborhoods = []
for name in pd.unique(la['name']):
    # copy the selection so parsing the date column does not write into a slice of `la`
    temp = la[la['name']==name].copy()
    # parse dates, keep rows up to 2020-04-29, and index by date in chronological order
    temp["date"] = pd.to_datetime(temp["date"])
    temp = temp[temp['date']<=pd.Timestamp("2020-04-29")]
    temp = temp.sort_values('date')
    temp = temp.set_index("date")
    # arrival date: first date whose cumulative count is non-zero
    # NOTE: arrival_date is computed here but is not stored in `di` by this cell.
    infection_dates = temp.index[temp["infections"]!=0]
    if len(infection_dates) > 0: # some neighborhoods never had infections up to 04/27
        arrival_date = infection_dates[0]
    else:
        arrival_date = pd.to_datetime('2020-04-28')
    # calculate daily increase from cumulative; the first diff is NaN and is dropped
    infections = temp["infections"]
    infections_di = infections.diff().iloc[1:] # infections_daily increase, returns a pd series

    di[name] = infections_di
    neighborhoods.append(name)

In [None]:
# Plot the daily-increase series of every neighborhood, one figure each.
for k,v in di.items():
    # A di entry is either the bare Series or a list whose first item is the Series;
    # indexing a bare Series with [0] would pick a single value instead of the curve.
    series = v[0] if isinstance(v, (list, tuple)) else v
    fig = plt.figure()
    plt.title(k)
    plt.ylabel("Number of infected")
    plt.xticks(rotation=90)
    plt.plot(series)
    

In [None]:
# Add up the cumulative series of all neighborhoods (equivalent to LA County?)
# The running total starts at zero on the date index of the "Acton" series.
acton_index = cum["Acton"][0].index
county_sum = pd.Series(np.zeros(len(acton_index)), index=acton_index, name="infections")
for entry in cum.values():
    county_sum = county_sum.add(entry[0])
county_sum = county_sum.to_frame()
# days elapsed since 2020-03-01 for every row
county_sum["date"] = county_sum.index
county_sum["start_date"] = datetime(2020,3,1)
elapsed = county_sum["date"] - county_sum["start_date"]
county_sum["diff_time"] = elapsed.dt.days

In [None]:
# NYT county-level data for LA County (cases and deaths are cumulative)
LAC = pd.read_csv("county_state_data/county_cases_per_100k.csv")
is_la_county = LAC["county"] == "los angeles"
LAC = LAC.loc[is_la_county]
LAC = LAC[["diff_time","cases","deaths"]]
# diff_time is a day offset counted from 2020-01-01; turn it into a calendar date index
day1 = pd.Timestamp("2020-01-01")
LAC["diff_time"] = pd.to_timedelta(LAC["diff_time"],unit='days')
LAC["date"] = LAC["diff_time"] + day1
LAC = LAC.set_index("date")
# keep March 2020 onwards, store the offset back as whole days
LAC = LAC["2020-03-01":]
LAC['diff_time'] = LAC['diff_time'].dt.days
# daily new cases; the first row has no predecessor (NaN) and is dropped
LAC['di_cases'] = LAC['cases'].diff()
LAC = LAC.iloc[1:]
LAC

In [None]:
# Cumulative infections vs. date: the LAC table, the neighborhood total,
# and a random sample of individual neighborhoods (log y-axis).
fig1,ax1 = plt.subplots()
plt.xticks(rotation=90)
ax1.set_yscale("log")
ax1.plot(LAC)
ax1.plot(county_sum["infections"])
# overlay 10 randomly chosen neighborhoods
lst = random.sample(list(cum),10)
for nbhd in lst:
    ax1.plot(cum[nbhd][0])

# Growth rate estimation

In [None]:
# pseudo R squared
# https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-what-are-pseudo-r-squareds/
def pseudoR2(model,mode = "CU"):
    """Return a pseudo R^2 for a fitted GLM (Poisson or negative binomial).

    Parameters
    ----------
    model : object
        Fitted result exposing ``llf`` (log-likelihood of the full model),
        ``llnull`` (log-likelihood of the intercept-only model) and ``nobs``
        (number of data points).
    mode : {"CU", "McFadden", "ML"}
        "CU" = Nagelkerke / Cragg & Uhler's; "McFadden"; "ML" = Maximum
        Likelihood (Cox & Snell).

    Returns
    -------
    float
        The requested pseudo R^2, or 0 when the statistic evaluates to NaN.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the three supported names.
    """
    L_f = model.llf # log-likelihood of full model
    L_i = model.llnull # log-likelihood of intercept
    N = model.nobs # number of data points

    r2McFadden = 1 - L_f/L_i
    G2 = -2 * (L_i - L_f)
    r2ML = 1 - np.exp(-G2/N)
    r2ML_max = 1 - np.exp(L_i * 2/N)
    r2CU = r2ML/r2ML_max

    by_mode = {"CU": r2CU, "McFadden": r2McFadden, "ML": r2ML}
    if mode not in by_mode:
        # previously an unknown mode fell through every branch and hit an unbound `r2`
        raise ValueError("mode must be one of 'CU', 'McFadden' or 'ML', got %r" % (mode,))
    r2 = by_mode[mode]
    if np.isnan(r2):
        r2 = 0
    return r2

In [None]:
# Reference tables of critical values (chi-squared and t), indexed by their first column
chi_dist, t_dist = (
    pd.read_csv(table_path, header=0, index_col=0)
    for table_path in ("chi_dist.csv", "t_dist.csv")
)

Tutorial on Poisson & Negative binomial regression
https://towardsdatascience.com/negative-binomial-regression-f99031bb25b4

In [None]:
# Growth rates of LA neighborhoods.
# For every daily-increase series in `di`:
#   1. clip negative increments to 0 and skip series with too little signal,
#   2. grid-search the cutoff whose Poisson fit has the best pseudo R^2,
#   3. estimate the NB2 dispersion alpha with an auxiliary OLS,
#   4. refit with a negative binomial GLM (falling back to the Poisson fit when the NB
#      fit raises) and keep the diff_time slope as growth rate when pseudo R^2 >= 0.3.
poisson_insignificant = 0
olsr_insignificant = 0
nb_insignificant = 0
poisson_cnt = 0
nb_cnt = 0
non_exp = 0
outlier = 0
fail_poisson = 0
total = 0
neighborhoods_growth_rates = []
neighborhoods_used = []
neighborhoods_pRsquared = []
neighborhoods_std_errs = []
neighborhoods_tot_infected = []
poisson_expr = """infections ~ diff_time"""
ols_expr = """aux_ols ~ y_lambda - 1"""
for k,v in di.items():
    df = v
    df = df.to_frame()
    df["date"] = df.index
    df["start_date"] = datetime(2020,3,1)
    df["diff_time"] = ((df["date"]-df["start_date"])).dt.days
    # negative daily increases are treated as zero
    df.loc[df["infections"]<0,"infections"] = 0
    # take out outliers: fewer than 5 non-zero days or fewer than 30 infections in total
    if len(df[df["infections"]!=0]) < 5 or sum(df["infections"]) < 30:
        outlier += 1
        print(len(df[df["infections"]!=0]),sum(df["infections"]))
        continue
    # cut off upto exp ends using grid search, use the poisson model with best pseudo R^2
    cnt = 0
    n_infected = 0
    pR2 = np.zeros(shape=len(df["infections"]))
    models = {} # space complexity needs to be improved, but for now keep these arrays for inspection
    for i in range(len(df["infections"])): # starting from the date with 5 data points and at least 30 infections
        # .iloc: the frame is indexed by date, so the integer is a row position
        day_count = df["infections"].iloc[i]
        if day_count != 0:
            cnt += 1
            n_infected += day_count
        if cnt >= 5 and n_infected > 30:
            temp = df[:i+1]
            y,x = dmatrices(poisson_expr,temp,return_type='dataframe')
            try:
                poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
                pR2[i] = pseudoR2(poisson)
                models[i] = poisson
            except Exception:
                # a failed fit leaves pR2[i] at 0, so this cutoff cannot be selected
                fail_poisson += 1
                continue
    if np.max(pR2) == 0:
        poisson_insignificant += 1
        continue
    best_poisson_idx = np.argmax(pR2)
    poisson = models[best_poisson_idx]
    poisson_chi2 = poisson.pearson_chi2
    # std_chi = chi_dist.loc[poisson.df_resid,"0.95"]
    if poisson_chi2 > 210: # curves that don't follow exp trend (threshold 210 is inspected by eyes, not sure)
        non_exp += 1
        # print(k,", poisson chi2=",poisson_chi2,", std chi2=",std_chi)
        continue
    # copy the truncated frame so the helper columns below are not written into a slice
    df = df[:best_poisson_idx+1].copy()
    y,x = dmatrices(poisson_expr,df,return_type='dataframe')
    total_infected_upto_cutoff = sum(df["infections"])
    # auxiliary OLS to fit the alpha in NB2: ((y - mu)^2 - y) / mu regressed on mu without intercept
    df["y_lambda"] = poisson.mu
    df["aux_ols"] = ((df["infections"] - df["y_lambda"])**2 - df["infections"]) / df["y_lambda"]
    aux_olsr = smf.ols(ols_expr,df).fit()
    std_tvalue = t_dist.loc[aux_olsr.nobs-1,"0.95"]
    # y_lambda is the only regressor, so its label addresses the single coefficient
    if aux_olsr.tvalues["y_lambda"] < std_tvalue: # t value too small, not significant
        # if standard t value for our dof and sig_level=0.05 is larger than our t,  poisson will just do fine, and negative binomial is not necessary
        olsr_insignificant += 1
    # still proceed to NB just to see how it works
    try:
        nb = sm.GLM(y,x,family=sm.families.NegativeBinomial(alpha=aux_olsr.params["y_lambda"])).fit()
    except Exception:
        nb_insignificant += 1
        # use poisson regression for estimated growth rate
        poisson_r2 = pseudoR2(poisson)
        if poisson_r2 >= 0.3:
            growth_rate = poisson.params["diff_time"]
            neighborhoods_growth_rates.append(growth_rate)
            neighborhoods_used.append(k)
            neighborhoods_pRsquared.append(poisson_r2)
            neighborhoods_std_errs.append(poisson.bse["diff_time"])
            poisson_cnt += 1
            neighborhoods_tot_infected.append(total_infected_upto_cutoff)
        continue
    nb_chi2 = nb.pearson_chi2
    # if nb_chi2 > std_chi:
        # print(k,", nb chi2=",nb_chi2,", std chi2=",std_chi)
    nb_r2 = pseudoR2(nb)
    if nb_r2 >= 0.3:
        growth_rate = nb.params["diff_time"]
        neighborhoods_growth_rates.append(growth_rate)
        neighborhoods_used.append(k)
        neighborhoods_pRsquared.append(nb_r2)
        neighborhoods_std_errs.append(nb.bse["diff_time"])
        nb_cnt += 1
        neighborhoods_tot_infected.append(total_infected_upto_cutoff)
print(poisson_insignificant,olsr_insignificant,nb_insignificant)
print(len(neighborhoods_used))
print(poisson_cnt,nb_cnt)
print(outlier,fail_poisson,non_exp)

In [None]:
# Persist the per-neighborhood growth rates and their standard errors,
# one row per neighborhood that passed the fitting filters.
neighborhoods_gr = pd.DataFrame(
    {
        "gr": neighborhoods_growth_rates,
        "std_err": neighborhoods_std_errs,
    },
    index=neighborhoods_used,
)
neighborhoods_gr.to_csv('LAC_nbhds_growth_rate_estimates_updated.csv')

In [None]:
# Display the per-neighborhood growth-rate table (columns "gr" and "std_err").
neighborhoods_gr

In [None]:
# Attach population density and arrival date to each neighborhood for the Spearman correlations.
# Both columns are (re)created empty first, so the cell can be re-run and the columns
# exist even when no di entry provides the extra fields.
neighborhoods_gr['popu_den'] = np.nan
neighborhoods_gr['arrival'] = pd.NaT
for city in neighborhoods_gr.index:
    entry = di.get(city)
    # Only the list layout [series, arrival_date, tot_infected, popu, popu_den] carries
    # these fields; entries that are a bare Series are skipped explicitly instead of
    # being hidden behind a catch-all except.
    if not isinstance(entry, (list, tuple)) or len(entry) < 5:
        continue
    density = entry[4]
    # popu_den may be stored as a per-row Series; its first value is used
    neighborhoods_gr.loc[city,'popu_den'] = density.iloc[0] if hasattr(density, "iloc") else density
    neighborhoods_gr.loc[city,'arrival'] = entry[1]
# arrival: date to diff time (days since 2020-01-01; stays NaN where no date was set)
neighborhoods_gr['arrival'] = (pd.to_datetime(neighborhoods_gr['arrival']) - pd.Timestamp('2020-01-01')).dt.days
neighborhoods_gr

In [None]:
from scipy.stats import spearmanr
# Rank correlation of the growth rate with population density, then with arrival time;
# rows with missing values are omitted.
for covariate in ('popu_den', 'arrival'):
    print(spearmanr(neighborhoods_gr['gr'], neighborhoods_gr[covariate], nan_policy='omit'))

In [None]:
# Histogram of the pseudo R^2 values of the retained neighborhood fits
hist_ax = plt.gca()
hist_ax.hist(neighborhoods_pRsquared)
hist_ax.set_title("Histogram: pseudo R^2 for NB regression on daily increase data of LA neighborhoods ")

In [None]:
# Fitted growth rate (with standard-error bars) against the number infected up to the cutoff
plt.figure(figsize=(6.4 * 1.5, 4.8 * 1.5))
scatter_ax = plt.gca()
scatter_ax.set_xlim((20, 80))
scatter_ax.errorbar(
    neighborhoods_tot_infected,
    neighborhoods_growth_rates,
    yerr=neighborhoods_std_errs,
    fmt="b.",
)
scatter_ax.set_xlabel("Number of infected")
scatter_ax.set_ylabel("NB fitted growth rate")

In [None]:
# Aggregate growth rate over the neighborhoods (county_di): sum the daily increases,
# grid-search the Poisson cutoff by McFadden pseudo R^2, then refit with negative binomial.
# NOTE(review): di["Acton"][0] and di[n][0] assume every di entry is a list whose first
# item is the daily-increase Series — confirm against the cell that builds `di`.
county_di = np.zeros(shape=(len(di["Acton"][0])))
county_di = pd.Series(county_di,index = di["Acton"][0].index,name="infections")
for n in neighborhoods:
    infections = di[n][0]
    county_di = county_di.add(infections)
county_di = county_di.to_frame()
county_di["date"] = county_di.index
county_di["start_date"] = datetime(2020,3,1)
county_di["diff_time"] = ((county_di["date"]-county_di["start_date"])).dt.days
fig = plt.figure()
plt.plot(county_di["infections"])
plt.title("Daily Increase of Infected (Aggregation over LA neighborhoods)")
# print(county_di)

poisson_expr = """infections ~ diff_time"""
ols_expr = """aux_ols ~ y_lambda - 1"""
cnt = 0
county_di_pR2 = np.zeros(shape=len(county_di["infections"]))
models = {}
for i in range(len(county_di["infections"])):
    # .iloc: the frame is indexed by date, so the integer is a row position
    if county_di["infections"].iloc[i] != 0:
        cnt += 1
    # only fit once at least 5 non-zero days have been seen
    if cnt >= 5:
        temp = county_di[:i+1]
        y,x = dmatrices(poisson_expr,temp,return_type='dataframe')
        try:
            poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
            # print(poisson.llf,poisson.llnull,poisson.nobs)
            county_di_pR2[i] = pseudoR2(poisson,mode="McFadden")
            models[i] = poisson
        except Exception:
            # a failed fit leaves county_di_pR2[i] at 0
            continue
if np.max(county_di_pR2) == 0:
    print("insignificant poisson")
# print(county_di_pR2)
# pick the cutoff from THIS cell's scores (the earlier code used the stale `pR2`
# array left behind by the per-neighborhood loop)
best_poisson_idx = np.argmax(county_di_pR2)
poisson = models[best_poisson_idx]
poisson_chi2 = poisson.pearson_chi2
std_chi = chi_dist.loc[poisson.df_resid,"0.95"]
print("poisson chi2=",poisson_chi2,", std chi2=",std_chi)
# copy the truncated frame so the helper columns below are not written into a slice
county_di = county_di[:best_poisson_idx+1].copy()
y,x = dmatrices(poisson_expr,county_di,return_type='dataframe')
# auxiliary OLS to fit alpha: ((y - mu)^2 - y) / mu regressed on mu without intercept
county_di["y_lambda"] = poisson.mu
county_di["aux_ols"] = ((county_di["infections"] - county_di["y_lambda"])**2 - county_di["infections"]) / county_di["y_lambda"]
aux_olsr = smf.ols(ols_expr,county_di).fit()
# print("alpha = ",aux_olsr.params)
std_tvalue = t_dist.loc[aux_olsr.nobs-1,"0.95"]
print("t_value of alpha = ",aux_olsr.tvalues," , std t value=",std_tvalue)
# y_lambda is the only regressor, so its label addresses the single coefficient
nb = sm.GLM(y,x,family=sm.families.NegativeBinomial(alpha=aux_olsr.params["y_lambda"])).fit()
nb_chi2 = nb.pearson_chi2
print("nb chi2=",nb_chi2,", std chi2=",std_chi)
print(poisson.params["diff_time"],nb.params["diff_time"])
agg_neighborhoods_growth_rate = nb.params["diff_time"]
agg_neighborhoods_std_error = nb.bse['diff_time']

In [None]:
# Aggregated growth rate using the LAC table: grid-search the Poisson cutoff on
# daily new cases by McFadden pseudo R^2, then refit with negative binomial.
# Note: this cell truncates `LAC` at the chosen cutoff and overwrites the
# agg_neighborhoods_* variables.
poisson_expr = """di_cases ~ diff_time"""
ols_expr = """aux_ols ~ y_lambda - 1"""
cnt = 0
LAC_pR2 = np.zeros(shape=len(LAC))
models = {}
for i in range(len(LAC['di_cases'])):
    # .iloc: the frame is indexed by date, so the integer is a row position
    if LAC['di_cases'].iloc[i] != 0:
        cnt += 1
    # only fit once at least 5 non-zero days have been seen
    if cnt >= 5:
        temp = LAC[:i+1]
        y,x = dmatrices(poisson_expr,temp,return_type='dataframe')
        try:
            poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
            # print(poisson.llf,poisson.llnull,poisson.nobs)
            LAC_pR2[i] = pseudoR2(poisson,mode="McFadden")
            models[i] = poisson
        except Exception:
            # a failed fit leaves LAC_pR2[i] at 0
            continue
if np.max(LAC_pR2) == 0:
    print("insignificant poisson")
# print(county_di_pR2)
best_poisson_idx = np.argmax(LAC_pR2)
poisson = models[best_poisson_idx]
poisson_chi2 = poisson.pearson_chi2
std_chi = chi_dist.loc[poisson.df_resid,"0.95"]
print("poisson chi2=",poisson_chi2,", std chi2=",std_chi)
# copy the truncated frame so the helper columns below are not written into a slice
LAC = LAC[:best_poisson_idx+1].copy()
y,x = dmatrices(poisson_expr,LAC,return_type='dataframe')
# auxiliary OLS to fit alpha: ((y - mu)^2 - y) / mu regressed on mu without intercept
LAC["y_lambda"] = poisson.mu
LAC["aux_ols"] = ((LAC["di_cases"] - LAC["y_lambda"])**2 - LAC["di_cases"]) / LAC["y_lambda"]
aux_olsr = smf.ols(ols_expr,LAC).fit()
# print("alpha = ",aux_olsr.params)
std_tvalue = t_dist.loc[aux_olsr.nobs-1,"0.95"]
print("t_value of alpha = ",aux_olsr.tvalues," , std t value=",std_tvalue)
# y_lambda is the only regressor, so its label addresses the single coefficient
nb = sm.GLM(y,x,family=sm.families.NegativeBinomial(alpha=aux_olsr.params["y_lambda"])).fit()
nb_chi2 = nb.pearson_chi2
print("nb chi2=",nb_chi2,", std chi2=",std_chi)
print(poisson.params["diff_time"],nb.params["diff_time"])
agg_neighborhoods_growth_rate = nb.params["diff_time"]
agg_neighborhoods_std_error = nb.bse['diff_time']
agg_neighborhoods_nobs = len(y)

In [None]:
# Write the aggregate growth-rate estimate as a one-row table (row label 0)
nbhds_agg = pd.DataFrame()
for column, value in (
    ('gr', agg_neighborhoods_growth_rate),
    ('std_err', agg_neighborhoods_std_error),
    ('nobs', agg_neighborhoods_nobs),
):
    nbhds_agg.loc[0, column] = value
nbhds_agg.to_csv('LAC_nbhds_agg_growth_rate.csv')

In [None]:
# Number of rows in neighborhoods_gr (one per neighborhood with a retained fit).
len(neighborhoods_gr)

In [None]:
# Figure 2(b): density of the neighborhood growth rates with the aggregate estimate marked
axis_label_font = dict(fontname="Arial", fontsize=25)
density_style = {'linewidth': 2, "shade": True}

fig = plt.figure(figsize=[6.5,6.5])
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=25)
sns.distplot(
    neighborhoods_growth_rates,
    hist=False,
    kde=True,
    bins=40,
    norm_hist=True,
    color="red",
    kde_kws=density_style,
    label="LA neighborhoods",
)
# vertical line at the aggregate growth rate
plt.axvline(x=agg_neighborhoods_growth_rate, color="black", label="LA County")
plt.legend(fontsize=20)
plt.xlabel("Infection Growth Rate", **axis_label_font)
plt.ylabel("PDF", **axis_label_font)
plt.tight_layout()


# Simulations

In [None]:
def simInitiation(n_days,column_name,start_date="2020-03-01"):
    """Build an all-zero daily frame used as the skeleton of a simulated series.

    Parameters
    ----------
    n_days : int
        Number of consecutive daily rows.
    column_name : str
        Name of the value column, initialised to 0.0.
    start_date : str or datetime-like, optional
        First date of the index and reference date for ``diff_time``.
        Defaults to "2020-03-01", the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Indexed by date, with columns ``column_name``, ``date`` (copy of the
        index), ``start_date`` (constant) and ``diff_time`` (whole days elapsed
        since ``start_date``, i.e. 0, 1, ..., n_days - 1).
    """
    start = pd.Timestamp(start_date)
    dates = pd.date_range(start=start,periods = n_days,freq="D")
    sim_init = pd.Series(np.zeros(shape=(n_days)),index = dates,name=column_name).to_frame()
    sim_init["date"] = sim_init.index
    sim_init["start_date"] = start
    sim_init["diff_time"] = ((sim_init["date"]-sim_init["start_date"])).dt.days
    return sim_init

In [None]:
# Simulation 1
# Simulate neighborhoods with a uniform growth rate (0.1) and their arrival times,
# fit a Poisson GLM to each simulated series and to their sum, and draw Figure 2(c).
n_neighborhoods = len(neighborhoods_used)
n_days = 70
t = np.array(range(n_days))
sims1 = {}
poisson_expr = """infections ~ diff_time"""
ols_expr = """aux_ols ~ y_lambda - 1"""
for c in neighborhoods_used:
    # NOTE(review): assumes di[c] is a list whose item 1 is the arrival date — confirm;
    # another cell in this notebook stores a bare Series in di[name].
    arrival_date = di[c][1]
    # whole days between 2020-03-01 and the arrival date
    tot_days_bf_arrival = int((arrival_date - pd.Timestamp("2020-03-01"))/np.timedelta64(1,"D"))
    # rounded exponential with rate 0.1, shifted so that it equals 1 on the arrival day
    y = np.rint(np.exp(0.1*(t-tot_days_bf_arrival)))
    sim_di = simInitiation(n_days,"infections")
    sim_di["infections"] = y
    sims1[c] = sim_di
# total di of these neighborhoods
sim1_agg = simInitiation(n_days,"infections")
for v in sims1.values():
    sim1_agg["infections"] = sim1_agg["infections"].add(v["infections"])

# growth rate for sims1 data: one Poisson fit per simulated neighborhood;
# fits that raise are counted in fail_poisson and skipped
fail_poisson = 0
fail_nb = 0
sim1_growth_rates = []
for k,v in sims1.items():
    df = v
    # poisson
    y,x = dmatrices(poisson_expr,df,return_type='dataframe')
    try:
        poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
    except:
        fail_poisson += 1
        continue
    sim1_growth_rate = poisson.params["diff_time"]
    sim1_growth_rates.append(sim1_growth_rate)

# growth rate of sim1_agg
y,x = dmatrices(poisson_expr,sim1_agg,return_type='dataframe')
# NOTE(review): if this fit raises, only a message is printed and `poisson` keeps the
# value assigned in the loop above, so the numbers below would come from that fit.
try:
    poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
except:
    print("Poisson Error")
# sim1_agg["y_lambda"] = poisson.mu
# sim1_agg["aux_ols"] = sim1_agg.apply(lambda x: ((x['infections'] - x['y_lambda'])**2 - x['infections']) / x['y_lambda'], axis=1)
# aux_olsr = smf.ols(ols_expr,sim1_agg).fit()
#  nb = sm.GLM(y,x,family=sm.families.NegativeBinomial(alpha=aux_olsr.params[0])).fit()
sim1_agg_growth_rate = poisson.params["diff_time"]
sim1_agg_std_err = poisson.bse['diff_time']
sim1_agg_nobs = len(y)
print(sim1_agg_growth_rate)

# plot sim data - number of infections vs date for the aggregate and a sample of neighborhoods
fig, ax1 = plt.subplots(figsize=[6.5,6.5])
plt.yscale("log")
plt.ylabel("Number Infections",fontname="Arial", fontsize=25)
plt.xlabel("Timestep",fontname="Arial", fontsize=25)
# plt.title("Simulated Daily Increase of Infected",fontname="Arial", fontsize=15)
plt.plot(sim1_agg["infections"],color="black")
# randomly select 10 neighborhoods to plot
lst = random.sample(list(sims1),10)
for c in lst:
    plt.plot(sims1[c]["infections"],linestyle="--")
# label every 10th date with its timestep number instead of the calendar date
plt.xticks(ticks = sim1_agg.index[::10],labels=["%2d" %i for i in range(0,n_days,10)],fontsize=25)
plt.yticks(fontsize=25)
plt.tight_layout()
    
# Figure 2(c) inset
# The rectangle passed to plt.axes is a placeholder; the locator set below positions ax2.
ax2 = plt.axes([-1,-1,0.5,0.5])
# Manually set the position and relative size of the inset axes within ax1
ip = InsetPosition(ax1, [0.15,0.63,0.35,0.35])
ax2.set_axes_locator(ip)
# Mark the region corresponding to the inset axes on ax1 and draw lines
# in grey linking the two axes.
# mark_inset(ax1, ax2, loc1=2, loc2=4, fc="none", ec='0.5')


#plt.xlabel("Growth Rate",fontname="Arial", fontsize=12)
#plt.ylabel("Frequency",fontname="Arial", fontsize=12)
#plt.title("Growth rates of simulated daily increase",fontname="Arial", fontsize=15)
# p = plt.hist(sim1_growth_rates,80)
# distplot is called without ax=, so it draws on the current axes — presumably ax2,
# the axes created just above; keep the statement order of this section unchanged.
sns.distplot(sim1_growth_rates, hist=False, kde=True, 
             bins=40, norm_hist = True, color = "red",
             kde_kws={'linewidth': 2,"shade":True },label = "Sim. Nbhds.")
# reference lines: aggregate estimate, mean of the per-neighborhood estimates, true rate 0.1
ax2.axvline(x=sim1_agg_growth_rate,color="black",label="Sim. County")
ax2.axvline(x=np.mean(sim1_growth_rates),linestyle = "-.",color="black",label="Mean of Sim. Nbhds.")
ax2.axvline(x=0.1,color="black",label="True Growth Rate",linestyle="--")
plt.xticks(fontsize=20,ticks=[0.1,0.11],labels=["0.1","0.11"])
plt.yticks(fontsize=22)
fm = plt.legend(fontsize=14,bbox_to_anchor=(0.3, 0., 1.1, 1.04),facecolor='white', framealpha=1)
# remove the legend frame's border line
fm.get_frame().set_linewidth(0.0)
# fm.set_facecolor("white")


In [None]:
# Export the Simulation 1 results: the per-neighborhood estimates, then a one-row
# summary of the aggregate fit. Both names are rebound to the exported tables.
sim1_growth_rates = pd.DataFrame(sim1_growth_rates)
sim1_growth_rates.to_csv('LAC_sim_varying_arrival_growth_rates.csv')

sim1_agg = pd.DataFrame()
for column, value in (
    ('gr_est', sim1_agg_growth_rate),
    ('std_err', sim1_agg_std_err),
    ("nobs", sim1_agg_nobs),
    ('true_gr', 0.1),
):
    sim1_agg.loc[0, column] = value
sim1_agg.to_csv('LAC_sim_varying_arrival_agg.csv')

In [None]:
# Simulation 1.1: every neighborhood shares the same growth rate (0.1) and the same arrival time
n_neighborhoods = len(neighborhoods_used)
n_days = 70
t = np.arange(n_days)
sims11 = {}
poisson_expr = """infections ~ diff_time"""
ols_expr = """aux_ols ~ y_lambda - 1"""
for c in neighborhoods_used:
    # identical rounded exponential for every neighborhood
    y = np.rint(np.exp(0.1*(t-1)))
    sim_di = simInitiation(n_days,"infections")
    sim_di["infections"] = y
    sims11[c] = sim_di

# aggregate: element-wise sum of all simulated series
sim11_agg = simInitiation(n_days,"infections")
for sim in sims11.values():
    sim11_agg["infections"] = sim11_agg["infections"].add(sim["infections"])

# simulated series vs. date: the aggregate in black plus 10 randomly chosen neighborhoods
plt.figure()
plt.xticks(rotation=90)
plt.yscale("log")
plt.ylabel("number of infected")
plt.title("Simulated Daily Increase of Infected")
plt.plot(sim11_agg["infections"],color="black")
lst = random.sample(list(sims11),10)
for c in lst:
    plt.plot(sims11[c]["infections"])

# Poisson growth rate of the aggregate series
y,x = dmatrices(poisson_expr,sim11_agg,return_type='dataframe')
try:
    poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
except:
    print("Poisson Error")
sim1_agg_growth_rate = poisson.params["diff_time"]
print(sim1_agg_growth_rate)

In [None]:
# Simulation 1.2: poisson-estimated growth rate vs length of time series
# Uses `poisson_expr` and the `fail_poisson` counter defined by earlier cells.
sim12_growth_rates = []
sim12_lengths = [] # series lengths whose fit succeeded, kept aligned with sim12_growth_rates
for n_days in range(50,500,10):
    t = np.array(range(n_days))
    sim12 = simInitiation(n_days,"infections")
    # rounded exponential with true rate 0.1
    y = np.rint(np.exp(0.1*(t)))
    sim12["infections"] = y
    y,x = dmatrices(poisson_expr,sim12,return_type='dataframe')
    try:
        poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
    except Exception:
        fail_poisson += 1
        continue
    growth_rate = poisson.params["diff_time"]
    sim12_lengths.append(n_days)
    sim12_growth_rates.append(growth_rate)
plt.figure()
# plot against the lengths that actually produced an estimate, so a skipped fit
# cannot leave the x and y sequences with different sizes
plt.plot(sim12_lengths,sim12_growth_rates)
plt.ylabel("Poisson-estimated growth rate")
plt.xlabel("# of days in time series")

In [None]:
# Simulation 2: common arrival time (day 0), one growth rate per neighborhood
# taken from the fitted neighborhoods_growth_rates.
n_days = 60
t = np.arange(n_days)
sims2 = {}
sim2_agg = simInitiation(n_days,"infections")
for i, _ in enumerate(neighborhoods_used):
    sim_growth_rate = neighborhoods_growth_rates[i]
    y = np.rint(np.exp(sim_growth_rate*t))
    sim2 = simInitiation(n_days,"infections")
    sim2["infections"] = y
    sims2[i] = sim2
    sim2_agg["infections"] += y

# aggregate series plus 10 randomly chosen simulated neighborhoods (log y-axis)
plt.figure()
plt.title("Simulated Daily Increase of Infected")
plt.ylabel("number of infected")
plt.xticks(rotation=90)
plt.yscale("log")
plt.plot(sim2_agg["infections"])
lst = random.sample(list(sims2),10)
for picked in lst:
    plt.plot(sims2[picked]["infections"])

# growth rate for sim data (should be same as growth_rates)
sim2_growth_rates = []
cnt = 0
for sim in sims2.values():
    # poisson
    expr = """infections ~ diff_time"""
    y,x = dmatrices(expr,sim,return_type='dataframe')
    poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
    growth_rate = poisson.params["diff_time"]
    sim2_growth_rates.append(growth_rate)

# re-estimated rates (red) against the rates used to simulate (green)
plt.figure()
plt.plot(sim2_growth_rates,color="red")
plt.plot(neighborhoods_growth_rates,color="green")
plt.xlabel("days")
plt.ylabel("sim_growth_rate")

# growth rate of sim2_agg
y,x = dmatrices(poisson_expr,sim2_agg,return_type='dataframe')
try:
    poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
except:
    print("Poisson Error")
sim2_agg_growth_rate = poisson.params["diff_time"]
sim2_agg_std_err = poisson.bse['diff_time']
sim2_agg_nobs = len(y)
print(sim2_agg_growth_rate)

# Figure control arrival date, vary growth rates: histogram with the aggregate marked
plt.figure()
plt.xlabel("growth rate")
plt.ylabel("frequency")
plt.title("Growth rates of simulated daily increase")
p = plt.hist(sim2_growth_rates,50,label="sim_neighborhoods")
plt.axvline(x=sim2_agg_growth_rate,color="red",label="aggregated")
plt.legend()

# same comparison as a density plot
axis_label_font = dict(fontname="Arial", fontsize=25)
fig = plt.figure(figsize=[6.5,6.5])
plt.xlim((-0.1,0.7))
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=25)
sns.distplot(
    sim2_growth_rates,
    hist=False,
    kde=True,
    bins=40,
    norm_hist=True,
    color="red",
    kde_kws={'linewidth': 2, "shade": True},
    label="LA neighborhoods",
)
plt.axvline(x=sim2_agg_growth_rate,color="black",label = "County")
plt.xlabel("Simulated Growth Rate", **axis_label_font)
plt.ylabel("", **axis_label_font)
plt.tight_layout()

In [None]:
# Export the Simulation 2 results: the per-neighborhood estimates, then a one-row
# summary of the aggregate fit. Both names are rebound to the exported tables.
sim2_growth_rates = pd.DataFrame(sim2_growth_rates)
sim2_growth_rates.to_csv('LAC_sim_varying_growthrates.csv')

sim2_agg = pd.DataFrame()
for column, value in (
    ('gr_est', sim2_agg_growth_rate),
    ('std_err', sim2_agg_std_err),
    ('nobs', sim2_agg_nobs),
):
    sim2_agg.loc[0, column] = value
sim2_agg.to_csv('LAC_sim_varying_growthrates_agg.csv')

In [None]:
# Simulation 3: counties with actual growth rate and actual arrival time
n_days=70
t = np.array(range(n_days))
sims3 = {}
sim3_agg = simInitiation(n_days,"infections")
for i in range(len(neighborhoods_used)):
    c = neighborhoods_used[i]
    # NOTE(review): assumes di[c] is a list whose item 1 is the arrival date — confirm;
    # another cell in this notebook stores a bare Series in di[name].
    arrival_date = di[c][1]
    sim_growth_rate = neighborhoods_growth_rates[i]
    # whole days between 2020-03-01 and the arrival date
    tot_days_bf_arrival = int((arrival_date - pd.Timestamp("2020-03-01"))/np.timedelta64(1,"D"))
    # rounded exponential with the fitted rate, equal to 1 on the arrival day
    y = np.rint(np.exp(sim_growth_rate*(t-tot_days_bf_arrival)))
    sim_di = simInitiation(n_days,"infections")
    sim_di["infections"] = y
    sims3[c] = sim_di
    sim3_agg["infections"] = sim3_agg["infections"].add(y)

# plot sim data - number of infections vs date for the aggregate and a sample of neighborhoods
plt.figure()
plt.xticks(rotation=90)
plt.yscale("log")
plt.title("Simulated Daily Increase of Infected with actual LA neighborhoods growth rates and arrival dates")
plt.plot(sim3_agg["infections"],color="black")
# randomly select 10 neighborhoods to plot
# (the sample was previously drawn but ignored, and every neighborhood was plotted)
lst = random.sample(list(sims3),10)
for c in lst:
    plt.plot(sims3[c]["infections"])

# growth rate for sims3 data: one Poisson fit per simulated neighborhood;
# fits that raise are counted in fail_poisson and skipped
fail_poisson = 0
fail_nb = 0
sim3_growth_rates = []
for k,v in sims3.items():
    df = v
    # poisson
    y,x = dmatrices(poisson_expr,df,return_type='dataframe')
    try:
        poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
    except Exception:
        fail_poisson += 1
        continue
    sim3_growth_rate = poisson.params["diff_time"]
    sim3_growth_rates.append(sim3_growth_rate)

# growth rate of sim3_agg
y,x = dmatrices(poisson_expr,sim3_agg,return_type='dataframe')
try:
    poisson = sm.GLM(y,x,family=sm.families.Poisson()).fit()
except Exception:
    print("Poisson Error")
    # re-raise: continuing would report the last per-neighborhood fit as the aggregate
    raise
sim3_agg_growth_rate = poisson.params["diff_time"]
print(sim3_agg_growth_rate)

# Figure: histogram of the re-estimated rates with the aggregate estimate marked
fig = plt.figure(figsize=[6.4*1.5, 4.8*1.5])
plt.xlabel("growth rate")
plt.ylabel("frequency")
plt.title("Poisson regression on simulated daily increase")
p = plt.hist(sim3_growth_rates,80,label = "sim_neighborhoods")
l1 = plt.axvline(x=sim3_agg_growth_rate,color="red",label = "aggregated")
#l2 = plt.axvline(x=np.mean(sim3_growth_rates),color="green")
plt.legend()

# growth rates comparison: re-estimated (red) vs. rates used to simulate (green)
plt.figure()
plt.plot(sim3_growth_rates,color="red")
plt.plot(neighborhoods_growth_rates,color="green")

In [None]:
# Spearman's Rank correlation between the growth rate and neighborhood covariates
# Local import: the `ss` alias used before is not defined anywhere in the visible notebook.
from scipy.stats import spearmanr

# Numeric placeholders only: "Days since first case" receives a float number of days,
# so it must not start out as a Timestamp column.
LA_matrix = pd.DataFrame({"Growth Rate":neighborhoods_growth_rates,"Population":0,"Population density":0.0,"Days since first case":0.0,"Total infected":0},index=neighborhoods_used)
for n in neighborhoods_used:
    # NOTE(review): assumes di[n] is the list [series, arrival_date, tot_infected, popu, popu_den]
    # — confirm; another cell in this notebook stores a bare Series in di[name].
    entry = di[n]
    density = entry[4]
    LA_matrix.loc[n,"Population"] = entry[3]
    # popu_den may be stored as a per-row Series; a single cell needs a scalar, so use its first value
    LA_matrix.loc[n,"Population density"] = density.iloc[0] if hasattr(density, "iloc") else density
    # days between 2020-03-01 and the arrival date
    LA_matrix.loc[n,"Days since first case"] = (entry[1]-pd.Timestamp("2020-03-01")) / timedelta(days=1)
    LA_matrix.loc[n,"Total infected"] = entry[2]
    if n == "Vernon":
        # hard-coded override of Vernon's population and density
        LA_matrix.loc[n,"Population"] = 112
        LA_matrix.loc[n,"Population density"] = 8.54
print(LA_matrix)

# full correlation / p-value matrices; keep the "Growth Rate" row against the other columns
LA_corr,LA_pval = spearmanr(LA_matrix)
LA_corr = LA_corr[0,1:]
LA_pval = LA_pval[0,1:]
