# COVID Deaths by ZipCode in NYC

In [1]:
import numpy.random as npr
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings("ignore")

from myst_nb import glue # for the Jupyter book chapter

In [2]:
def _read_zip_indexed(csv_path):
    """Read one CSV file and index the resulting frame by its 'Zipcode' column."""
    return pd.read_csv(csv_path).set_index('Zipcode')


# One frame per source file; all share the 'Zipcode' index so they can be joined on it.
race_df = _read_zip_indexed('covid_data/data-6RFy5.csv')
income_df = _read_zip_indexed('covid_data/data-dAArg.csv')
asthma_df = _read_zip_indexed('covid_data/data-oMup5.csv')
pop_df = _read_zip_indexed('covid_data/data-rtgjl.csv')
age_df = _read_zip_indexed('covid_data/data-vS9Is.csv')
house_size_df = _read_zip_indexed('covid_data/data-yReq2.csv')

In [3]:
# Fold the other tables into race_df one at a time, joining on the shared index.
# A column of the right-hand table whose name already exists on the left comes
# back from the join with a "DROP" suffix; the negative-lookahead regex then
# keeps only the columns whose name does not contain "DROP".
covid_df = race_df
for extra_df in (income_df, asthma_df, pop_df, age_df, house_size_df):
    joined = covid_df.join(extra_df, rsuffix="DROP")
    covid_df = joined.filter(regex="^(?!.*DROP)")

# Move the join index back into an ordinary column.
covid_df = covid_df.reset_index()

In [4]:
# Relabel the columns by position with short lowercase names; the list must
# contain exactly one name per existing column, in the existing column order.
short_names = [
    'zip', 'borough', 'neighborhood',
    'deathper100k', 'whitepct', 'medianincome',
    'asthmapct', 'popdens', 'medianage', 'housesize',
]
covid_df.columns = short_names

In [5]:
# Write the merged table to disk in the working directory.
# NOTE(review): called with default arguments, so the row index is written as an
# extra leading column — confirm that consumers of this file expect that.
output_path = 'covid.csv'
covid_df.to_csv(output_path)

In [46]:
# Show the first five rows as a quick check of the merged table.
covid_df.head(n=5)

Unnamed: 0,zip,borough,neighborhood,deathper100k,whitepct,medianincome,asthmapct,popdens,medianage,housesize
0,10001,Manhattan,Chelsea/NoMad/West Chelsea,94.16,62.91,88526,9.6,37291.65,37,1.76
1,10002,Manhattan,Chinatown/Lower East Side,252.25,32.23,35859,9.2,85336.07,44,2.18
2,10003,Manhattan,East Village/Gramercy/Greenwich Village,68.55,75.91,112131,10.1,94859.59,32,1.7
3,10004,Manhattan,Financial District,67.29,62.81,157645,9.2,5554.12,37,1.82
4,10005,Manhattan,Financial District,0.0,73.42,173333,9.4,121496.87,30,2.02


In [59]:
# Standardize each predictor to a z-score, (value - column mean) / column standard
# deviation, and store it beside the raw column under the same name plus "_z".
# The arithmetic is applied to the whole Series directly, so the mean and standard
# deviation are always column-level statistics and the result does not depend on
# how Series.transform dispatches a user-supplied function.
predictors = ['whitepct', 'medianincome', 'asthmapct', 'popdens', 'medianage', 'housesize']
for col in predictors:
    values = covid_df[col]
    covid_df[col + '_z'] = (values - values.mean()) / values.std()

In [60]:
# Multiple regression of deathper100k on all six standardized ("_z") predictors.
model = smf.ols(
    formula="deathper100k ~ whitepct_z+medianincome_z+asthmapct_z+popdens_z+medianage_z+housesize_z",
    data=covid_df,
)
lr = model.fit()
print(lr.summary())

# Square root of R-squared, i.e. the multiple correlation coefficient R.
multiple_r = np.sqrt(lr.rsquared)
print(multiple_r)

                            OLS Regression Results                            
Dep. Variable:           deathper100k   R-squared:                       0.450
Model:                            OLS   Adj. R-squared:                  0.430
Method:                 Least Squares   F-statistic:                     23.15
Date:                Fri, 29 Jan 2021   Prob (F-statistic):           7.05e-20
Time:                        00:21:49   Log-Likelihood:                -1048.7
No. Observations:                 177   AIC:                             2111.
Df Residuals:                     170   BIC:                             2134.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept        242.7485      6.944     34.

In [52]:
# Single-predictor regression of deathper100k on the raw whitepct column.
model = smf.ols(formula="deathper100k ~ whitepct", data=covid_df)
lr = model.fit()
print(lr.summary())

# Square root of R-squared for this one-predictor model.
multiple_r = np.sqrt(lr.rsquared)
print(multiple_r)

                            OLS Regression Results                            
Dep. Variable:           deathper100k   R-squared:                       0.183
Model:                            OLS   Adj. R-squared:                  0.179
Method:                 Least Squares   F-statistic:                     39.27
Date:                Fri, 29 Jan 2021   Prob (F-statistic):           2.78e-09
Time:                        00:09:34   Log-Likelihood:                -1083.6
No. Observations:                 177   AIC:                             2171.
Df Residuals:                     175   BIC:                             2178.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    336.5031     17.127     19.648      0.0

In [53]:
# Single-predictor regression of deathper100k on the raw medianincome column.
model = smf.ols(formula="deathper100k ~ medianincome", data=covid_df)
lr = model.fit()
print(lr.summary())

# Square root of R-squared for this one-predictor model.
multiple_r = np.sqrt(lr.rsquared)
print(multiple_r)

                            OLS Regression Results                            
Dep. Variable:           deathper100k   R-squared:                       0.304
Model:                            OLS   Adj. R-squared:                  0.300
Method:                 Least Squares   F-statistic:                     76.37
Date:                Fri, 29 Jan 2021   Prob (F-statistic):           1.87e-15
Time:                        00:09:55   Log-Likelihood:                -1069.5
No. Observations:                 177   AIC:                             2143.
Df Residuals:                     175   BIC:                             2149.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      378.7742     17.364     21.813   

In [54]:
# Single-predictor regression of deathper100k on the raw housesize column.
model = smf.ols(formula="deathper100k ~ housesize", data=covid_df)
lr = model.fit()
print(lr.summary())

# Square root of R-squared for this one-predictor model.
multiple_r = np.sqrt(lr.rsquared)
print(multiple_r)

                            OLS Regression Results                            
Dep. Variable:           deathper100k   R-squared:                       0.123
Model:                            OLS   Adj. R-squared:                  0.118
Method:                 Least Squares   F-statistic:                     24.61
Date:                Fri, 29 Jan 2021   Prob (F-statistic):           1.65e-06
Time:                        00:10:15   Log-Likelihood:                -1089.9
No. Observations:                 177   AIC:                             2184.
Df Residuals:                     175   BIC:                             2190.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     21.0393     45.517      0.462      0.6

In [None]:
# Single-predictor regression of deathper100k on the raw housesize column.
# NOTE(review): this cell has no execution count and fits the same formula as the
# housesize regression in the preceding cell — possibly a placeholder for another
# predictor; confirm the intended variable or remove the cell.
model = smf.ols(formula="deathper100k ~ housesize", data=covid_df)
lr = model.fit()
print(lr.summary())

# Square root of R-squared for this one-predictor model.
multiple_r = np.sqrt(lr.rsquared)
print(multiple_r)