In [None]:
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta,datetime
import random
random.seed(3)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
import scipy.stats as ss
import json
import pickle
import warnings
warnings.filterwarnings("ignore")

In [None]:
# data source https://www.kaggle.com/headsortails/covid19-tracking-germany
# import data
df_raw = pd.read_csv("covid_Germany.csv")

# counties: one frame of daily new cases per (state, county), keyed "<state> <county lower-cased>"
df = df_raw.groupby(by=["state","county","date"])["cases"].sum()
counties = {}
arrival_by_county = {}
for index, new_df in df.groupby(level=[0,1]):
    name = index[0] + " " + index[1].lower()
    # drop the state/county levels, then turn the remaining date index into a column
    new_df = new_df.reset_index(level=[0,1], drop=True).reset_index()
    new_df["date"] = pd.to_datetime(new_df["date"])
    # days elapsed since 2020-01-01
    new_df["diff_time"] = (new_df["date"] - pd.Timestamp("2020-01-01")).dt.days
    new_df = new_df.rename(columns={"cases":"di_cases"})
    # keep only dates before 2020-06-01
    new_df = new_df[new_df["date"] < pd.Timestamp("2020-06-01")]
    # arrival time = day offset of the first row with a non-zero case count
    infection_dates = new_df.loc[new_df["di_cases"]!=0,"diff_time"]
    if len(infection_dates) > 0:
        # positional access: the filtered Series keeps its original labels, so
        # infection_dates[0] would raise KeyError whenever row 0 has zero cases
        arrival_date = infection_dates.iloc[0]
    else:
        # no infection recorded before the cut-off date; 0 is kept as the placeholder
        arrival_date = 0
    arrival_by_county[name] = arrival_date
    counties[name] = new_df
# build the Series once instead of enlarging an empty, dtype-less Series per region
arrival_counties = pd.Series(arrival_by_county, dtype="int64")

In [None]:
# states: one frame of daily new cases per state, keyed by the state name
df_states = df_raw.groupby(by=["state","date"])["cases"].sum()
states = {}
arrival_by_state = {}
for index, new_df in df_states.groupby(level=0):
    name = index
    # drop the state level, then turn the remaining date index into a column
    new_df = new_df.reset_index(level=0, drop=True).reset_index()
    new_df["date"] = pd.to_datetime(new_df["date"])
    # days elapsed since 2020-01-01
    new_df["diff_time"] = (new_df["date"] - pd.Timestamp("2020-01-01")).dt.days
    new_df = new_df.rename(columns={"cases":"di_cases"})
    # keep only dates before 2020-06-01
    new_df = new_df[new_df["date"] < pd.Timestamp("2020-06-01")]
    # arrival time = day offset of the first row with a non-zero case count
    infection_dates = new_df.loc[new_df["di_cases"]!=0,"diff_time"]
    if len(infection_dates) > 0:
        # positional access: the filtered Series keeps its original labels, so
        # infection_dates[0] would raise KeyError whenever row 0 has zero cases
        arrival_date = infection_dates.iloc[0]
    else:
        # no infection recorded before the cut-off date; 0 is kept as the placeholder
        arrival_date = 0
    arrival_by_state[name] = arrival_date
    states[name] = new_df
# build the Series once instead of enlarging an empty, dtype-less Series per region
arrival_states = pd.Series(arrival_by_state, dtype="int64")

In [None]:
# nation: daily new cases summed over every row that shares a date
nation = (
    df_raw.groupby(by=["date"])["cases"].sum()
    .reset_index(level=0)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    # days elapsed since 2020-01-01
    .assign(diff_time=lambda frame: (frame["date"] - pd.Timestamp("2020-01-01")).dt.days)
    .rename(columns={"cases":"di_cases"})
    # keep only dates before 2020-06-01
    .loc[lambda frame: frame["date"] < pd.Timestamp("2020-06-01")]
)

In [None]:
# visualize: national daily new cases (`nation` only holds dates before 2020-06-01)
plt.figure()
plt.title("nation")
# rotate the date tick labels by 90 degrees
plt.xticks(rotation=90)
plt.plot(nation["date"],nation["di_cases"])

In [None]:
# visualize: one figure per county, daily new cases against date
for county_name, county_df in counties.items():
    plt.figure()
    plt.title(county_name)
    plt.xticks(rotation=90)
    plt.plot(county_df["date"], county_df["di_cases"])

In [None]:
# visualize: one figure per state, daily new cases against date
for state_name, state_df in states.items():
    plt.figure()
    plt.title(state_name)
    plt.xticks(rotation=90)
    plt.plot(state_df["date"], state_df["di_cases"])

In [None]:
def pseudoR2(model,mode = "CU"):
    """Return a pseudo R^2 for a fitted count model.

    Parameters
    ----------
    model : fitted result object
        Must expose ``llf`` (log-likelihood of the fitted model), ``llnull``
        (log-likelihood of the intercept-only model) and ``nobs`` (number of
        observations), e.g. a fitted GLM Poisson or negative-binomial result.
    mode : {"CU", "McFadden", "ML"}
        "CU" = Nagelkerke / Cragg & Uhler, "McFadden", or
        "ML" = maximum likelihood (Cox & Snell).

    Returns
    -------
    float
        The requested pseudo R^2; a NaN result is reported as 0.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported names (previously this surfaced
        as an UnboundLocalError on the unset result variable).
    """
    L_f = model.llf # log-likelihood of full model
    L_i = model.llnull # log-likelihood of intercept
    N = model.nobs # number of data points

    if mode == "McFadden":
        r2 = 1 - L_f/L_i
    elif mode in ("CU", "ML"):
        G2 = -2 * (L_i - L_f)
        r2 = 1 - np.exp(-G2/N)
        if mode == "CU":
            # rescale Cox & Snell by its maximum attainable value
            r2ML_max = 1 - np.exp(L_i * 2/N)
            r2 = r2/r2ML_max
    else:
        raise ValueError(
            "unknown pseudo R^2 mode %r; expected 'CU', 'McFadden' or 'ML'" % (mode,)
        )
    if np.isnan(r2):
        r2 = 0
    return r2


In [None]:
def growthRate(data,data_type,var,mode,pR2_mode,poisson_chi2_cutoff,exp_cutoff = True):
    """Estimate a growth rate per region by regressing a daily count on a day counter.

    For each region the column ``var`` is regressed on ``diff_time`` (reset here
    to 0, 1, 2, ... per row) with a Poisson GLM. With ``exp_cutoff`` the series is
    truncated at the row whose prefix gives the highest pseudo R^2. In any mode
    other than "Poisson" a negative-binomial GLM is then fitted on the same
    window, with alpha taken from an auxiliary OLS on the Poisson fitted means;
    if that fit raises, the Poisson result is used instead.

    Parameters
    ----------
    data : dict of pandas.DataFrame, or a single pandas.DataFrame
    data_type : "dict" or "pd_df"
        "pd_df" wraps the single frame as ``{0: frame}``.
    var : str
        Column to be regressed, e.g. "di_cases", "di_deaths", "infections".
    mode : str
        "Poisson" keeps the Poisson fit; anything else continues to "NB".
    pR2_mode : "CU", "McFadden" or "ML"
        Forwarded to ``pseudoR2``.
    poisson_chi2_cutoff : int
        Currently unused (the Pearson chi2 filter is disabled); kept so existing
        calls keep working. ``stats["high_poisson_chi2"]`` therefore stays 0.
    exp_cutoff : bool
        Whether to search for the best cut-off row (see above).

    Returns
    -------
    list
        ``[results, predictions, stats]``: ``results`` is a DataFrame indexed by
        the accepted region keys, ``predictions`` maps each accepted key to the
        final model's prediction over the full series, and ``stats`` counts the
        regions rejected at each stage. ``results["nobs"]`` holds the number of
        rows used for that region's fit.
    """
    # acceptance thresholds
    min_nonzero_days = 5
    min_total_cases = 30
    min_poisson_pR2 = 0.3

    # initiate statistics to be recorded
    stats = {
        "outlier": 0,
        "fail_poisson": 0,
        "poisson_insig": 0,
        "nb_insig": 0,
        "high_poisson_chi2": 0,
        "poisson_small_p": 0,
        "nb_small_p": 0,
    }

    poisson_expr = var + """ ~ diff_time"""
    ols_expr = """aux_ols ~ y_lambda - 1"""

    if data_type == "pd_df":
        data = {0: data.copy()}

    growth_rates = []
    intercepts = []
    subregions_used = []
    pRsquared = []
    std_errs = []
    cut_offs = []
    tot_infected = []
    nobs_used = []
    predictions = {}

    for k,v in data.items():
        df = v.copy()
        df["diff_time"] = np.arange(len(df), dtype=float)
        # positional view of the counts, so the scan below does not depend on index labels
        counts = df[var].to_numpy()
        total_infect = np.sum(df[var])
        # exclude the outliers (checked before any design matrix is built)
        if (df[var] != 0).sum() < min_nonzero_days or total_infect < min_total_cases:
            stats["outlier"] += 1
            continue
        Y,X = dmatrices(poisson_expr,df,return_type='dataframe')

        poisson = None
        pR2 = 0
        cut_off = 0
        nobs = 0
        if exp_cutoff:
            # grid search over prefixes of the series: keep the Poisson fit with
            # the best pseudo R^2
            day_cnt = 0
            n_infected = 0
            for i, count in enumerate(counts):
                if count != 0:
                    day_cnt += 1
                    n_infected += count
                if day_cnt >= min_nonzero_days and n_infected > min_total_cases:
                    window = df.iloc[:i+1]
                    y,x = dmatrices(poisson_expr,window,return_type='dataframe')
                    try:
                        temp_model = sm.GLM(y,x,family=sm.families.Poisson()).fit()
                    except Exception:
                        stats["fail_poisson"] += 1
                        continue
                    temp_pR2 = pseudoR2(temp_model,mode=pR2_mode)
                    if temp_pR2 > pR2:
                        pR2 = temp_pR2
                        poisson = temp_model
                        cut_off = i
                        nobs = len(y)
        else:
            poisson = sm.GLM(Y,X,family=sm.families.Poisson()).fit()
            pR2 = pseudoR2(poisson,mode=pR2_mode)
            cut_off = len(df)
            nobs = len(Y)
        if pR2 == 0 or poisson is None or cut_off == 0:
            stats["poisson_insig"] += 1
            continue

        use_poisson = (mode == "Poisson")
        if not use_poisson:
            # continue to negative binomial on the same window as the Poisson fit;
            # copy so the helper columns are not written onto a slice of df
            fit_df = df.iloc[:cut_off+1].copy()
            y,x = dmatrices(poisson_expr,fit_df,return_type='dataframe')
            # auxiliary OLS to fit the alpha in NB2
            fit_df["y_lambda"] = poisson.mu
            fit_df["aux_ols"] = ((fit_df[var] - fit_df["y_lambda"])**2 - fit_df[var]) / fit_df["y_lambda"]
            aux_olsr = smf.ols(ols_expr,fit_df).fit()
            try:
                alpha = aux_olsr.params.iloc[0]
                nb = sm.GLM(y,x,family=sm.families.NegativeBinomial(alpha=alpha)).fit()
                nb_pR2 = pseudoR2(nb,mode=pR2_mode)
            except Exception:
                # NB could not be fitted: fall back to the Poisson result
                stats["nb_insig"] += 1
                use_poisson = True
                nobs = len(y)
            else:
                if nb_pR2 >= 0:
                    final_model = nb
                    pRs = nb_pR2
                else:
                    stats["nb_small_p"] += 1
                    continue
        if use_poisson:
            if pR2 >= min_poisson_pR2:
                final_model = poisson
                pRs = pR2
            else:
                stats["poisson_small_p"] += 1
                continue

        growth_rates.append(final_model.params["diff_time"])
        intercepts.append(final_model.params["Intercept"])
        subregions_used.append(k)
        pRsquared.append(pRs)
        std_errs.append(final_model.bse["diff_time"])
        cut_offs.append(cut_off)
        tot_infected.append(total_infect)
        # per-region observation count (a single scalar used to be broadcast to every row)
        nobs_used.append(nobs)
        predictions[k] = final_model.predict(X)

    results = pd.DataFrame({"Growth Rate":growth_rates,"intercepts":intercepts,"pR2":pRsquared,"std_error":std_errs,"cut_offs":cut_offs,"tot_infected":tot_infected,'nobs':nobs_used},index=subregions_used)
    return [results,predictions,stats]

In [None]:
# county_state_nation growth rate distribution - infected
# All three levels are computed here: the save, clamp and figure cells below
# read c_i_results and s_i_results, so leaving those calls commented out makes
# a fresh Restart & Run All fail with NameError.
c_i_results,c_i_pred,c_stats = growthRate(counties,"dict","di_cases","NB","McFadden",500)
s_i_results,s_i_pred,s_stats = growthRate(states,"dict","di_cases","NB","McFadden",500)
n_i_results,n_i_pred,n_stats = growthRate(nation,"pd_df","di_cases","NB","McFadden",500)
# per-level rejection counters, for inspection:
#print("c_stats",c_stats)
#print("s_stats",s_stats)
#print("n_stats",n_stats)

In [None]:
# display the nation-level results frame returned by growthRate
n_i_results

In [None]:
# save the county ("c"), state ("s") and nation ("n") result frames in one pickle
germany = {
    "c": c_i_results,
    "s": s_i_results,
    "n": n_i_results,
}

with open("germany_first_surge_growth_rates.pickle","wb") as out_file:
    pickle.dump(germany, out_file)

In [None]:
# replace negative county growth rates with 0 (runs after the pickle above was written)
c_i_results["Growth Rate"] = c_i_results["Growth Rate"].mask(c_i_results["Growth Rate"] < 0, 0)

In [None]:
# Infected
# Figure 1(d): kernel density of county and state growth rates, with the
# nation-level growth rate drawn as a vertical line
fig = plt.figure(figsize=[6.5,6.5])

# sns.kdeplot replaces the deprecated sns.distplot(hist=False, kde=True, ...);
# `fill` replaces the deprecated `shade`, and `bins` had no effect without a histogram
p_c = sns.kdeplot(x=c_i_results["Growth Rate"], color="darkorange",
                  linewidth=2, fill=True, label="German counties")
p_s = sns.kdeplot(x=s_i_results["Growth Rate"], color="royalblue",
                  linewidth=2, fill=True, label="German states")
# positional access: do not rely on the nation row being labelled 0
p_n = plt.axvline(x=n_i_results["Growth Rate"].iloc[0],color="black",label="Nation")
plt.legend(fontsize=20)
plt.xlabel("Infection Growth Rate",fontname="Arial", fontsize=25)
plt.ylabel("PDF",fontname="Arial", fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.tight_layout()



In [None]:
# Spearman Correlation
# reload the saved county / state / nation result frames
# NOTE(review): pickle.load executes arbitrary code; only load files this notebook wrote itself
with open("germany_first_surge_growth_rates.pickle","rb") as in_file:
    germany = pickle.load(in_file)

c, s, n = germany["c"], germany["s"], germany["n"]

In [None]:
# corr with arrival time
# use the scipy.stats alias from the import cell: the bare name `spearmanr` is
# only imported in a later cell, so it is undefined here on a fresh kernel
print(ss.spearmanr(c['Growth Rate'],arrival_counties[c.index].values,nan_policy='omit'))
print(ss.spearmanr(s['Growth Rate'],arrival_states[s.index].values,nan_policy='omit'))

In [None]:
# population
import unidecode

popu = pd.read_excel('Germany_population.xlsx')
popu['name'] = popu['name'].str.lower()

# .copy() so the column assignments below write to independent frames rather
# than to boolean-filtered slices of `popu` (SettingWithCopy)
popu_states = popu[popu['status']=='State'].copy()
# keep the part of the name before the first non-breaking space
popu_states['name'] = popu_states['name'].str.split("\xa0").str[0]

popu_counties = popu[popu['status']!='State'].copy()
# keep the first token (split on a space, then on a non-breaking space) and
# transliterate it to ASCII
popu_counties['name'] = (
    popu_counties['name']
    .str.split(" ").str[0]
    .str.split("\xa0").str[0]
    .apply(unidecode.unidecode)
)

popu_counties.head()

In [None]:
# land area
import geopandas as gpd


def _area_by_name(shapefile):
    """Read a shapefile and return (GeoDataFrame, area Series indexed by lower-cased 'GEN')."""
    region_map = gpd.read_file(shapefile)
    # NOTE(review): dividing by 10**6 presumably converts m^2 to km^2, which only
    # holds if the shapefile CRS is in metres - confirm
    area = region_map['geometry'].area/ 10**6
    area.index = region_map['GEN'].str.lower()
    area.name='area'
    return region_map, area


counties_map, counties_area = _area_by_name('Germany_covid/de_county.shp')
states_map, states_area = _area_by_name('Germany_covid/de_state.shp')

In [None]:
# now merge population with land area and derive population density
# Dropping any previous 'area' / 'popu_den' columns first keeps this cell
# re-runnable: a second plain merge would create area_x / area_y and the
# density line would then fail with KeyError on 'area'.
popu_counties = (
    popu_counties
    .drop(columns=['area','popu_den'], errors='ignore')
    .merge(counties_area,left_on='name',right_index=True,how='inner')
)
popu_states = (
    popu_states
    .drop(columns=['area','popu_den'], errors='ignore')
    .merge(states_area,left_on='name',right_index=True,how='inner')
)
# NOTE(review): the inner join drops every region whose name has no match in the shapefile
popu_counties['popu_den'] = popu_counties['population'] / popu_counties['area']
popu_states['popu_den'] = popu_states['population'] / popu_states['area']
popu_counties.head()

In [None]:
# merge with c and s
from scipy.stats import spearmanr

# reduce each county key to its last space-separated token before joining on 'name'
county_tokens = c.index.str.split(' ')
c.index = county_tokens.str[-1]
c = c.merge(right=popu_counties, how='inner', left_index=True, right_on='name')
# rank correlation between county growth rate and population density
print(spearmanr(c['Growth Rate'], c['popu_den'], nan_policy='omit'))

In [None]:
# lower-case the state keys so they line up with popu_states['name']
lowered_states = s.index.str.lower()
s.index = lowered_states
s = s.merge(right=popu_states, how='inner', left_index=True, right_on='name')
# rank correlation between state growth rate and population density
print(spearmanr(s['Growth Rate'], s['popu_den'], nan_policy='omit'))

In [None]:
# display the per-state arrival day (days since 2020-01-01 of the first non-zero case count)
arrival_states