In [192]:
# https://github.com/datadesk/california-coronavirus-data/blob/master/latimes-county-totals.csv
# motivated by this tweet: https://twitter.com/justin_hart/status/1255143233035755521
# where the author gets a really high R-squared

# also need
# 1. population by county
# 2. deaths by county

# then
# merge the dataframes on county so that we have
# 1. deaths per million residents by county
# 2. nursing home patient cases per million residents by county

import pandas as pd
import statsmodels.formula.api as sm

In [193]:
# Nursing home cases by county.
# Source: cdph-nursing-homes.csv from the datadesk/california-coronavirus-data repository.

df = pd.read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/cdph-nursing-homes.csv")
df['date'] = pd.to_datetime(df['date'])
# Only the county, facility name, date and patient count are used downstream.
df = df.drop(columns=['fips', 'staff_note', 'patients_note', 'type', 'staff'])
# Treat a missing patient count as zero. The result is assigned back rather than
# calling fillna(inplace=True) on the column selection: that chained in-place form
# is deprecated in pandas 2.x and does not modify `df` under copy-on-write.
df['patients'] = df['patients'].fillna(0)

In [194]:
# Move 'date' out of the columns and into the index.
# Note: this cell is not re-runnable on its own -- once 'date' is the index,
# a second set_index('date') raises KeyError; re-run the load cell first.
df = df.set_index('date')

In [195]:
# Keep only the most recent report for each facility (rows are ordered by date,
# then the last row of every group is retained).
# Facilities are keyed by (county, name): a facility name alone is not guaranteed
# to be unique statewide, and grouping by name only would silently discard
# same-named facilities located in other counties.
df = df.sort_values('date').groupby(['county', 'name']).tail(1)

In [196]:
# Total nursing home patient cases per county.
# 'patients' is selected explicitly because `df` still carries the string
# column 'name'; since pandas 2.0 groupby().sum() no longer drops non-numeric
# columns and would concatenate the facility names into the result.
nursing_cases = df.groupby('county')[['patients']].sum()

In [198]:
# Deaths by county: the daily 'new_deaths' figures are summed per county and
# the total is exposed under the column name 'deaths'.

df = (
    pd.read_csv('https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-county-totals.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    # The cumulative columns are discarded; only the daily new-death count is kept.
    .drop(columns=['fips', 'confirmed_cases', 'deaths', 'new_confirmed_cases'])
    .set_index('date')
    .groupby('county')
    .sum()
    .assign(deaths=lambda totals: totals['new_deaths'])
    .drop(columns=['new_deaths'])
)
deaths = df

In [197]:
# Population by county, indexed by county name.

cp = pd.read_csv('calfornia population by county.csv')
# Remove the " County" text from CTYNAME to form the 'name' key, then discard
# the columns that are not needed for the per-capita rates.
cp = (
    cp.assign(name=cp['CTYNAME'].str.replace(r' County', ''))
      .drop(columns=['CTYNAME', 'GrowthRate'])
)
county_populations = cp.set_index('name')

In [199]:
# Line the three county-indexed frames up side by side. concat aligns on the
# index, so a county absent from one frame gets NaN in that frame's columns.
results = pd.concat(
    [nursing_cases, county_populations, deaths],
    axis='columns',
    sort=False,
)

In [200]:
# Express both counts as a rate per 1,000,000 county residents.
results['nursing_patients_per_000000'] = results['patients'].div(results['Pop']).mul(1000000)
results['deaths_per_000000'] = results['deaths'].div(results['Pop']).mul(1000000)
results.head()

Unnamed: 0,patients,Pop,deaths,nursing_patients_per_000000,deaths_per_000000
Alameda,102.0,1666753,52.0,61.19683,31.198384
Amador,0.0,39383,0.0,0.0,0.0
Butte,0.0,231256,0.0,0.0,0.0
Calaveras,0.0,45602,0.0,0.0,0.0
Colusa,0.0,21627,0.0,0.0,0.0


In [201]:
# Ordinary least squares: county death rate regressed on the county nursing
# home patient rate, using every row of `results`.
result = sm.ols(
    "deaths_per_000000 ~ nursing_patients_per_000000",
    data=results,
).fit()
print(result.params)

Intercept                      12.853667
nursing_patients_per_000000     0.365047
dtype: float64


In [202]:
# Print the full statsmodels summary table for the `result` fit.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:      deaths_per_000000   R-squared:                       0.478
Model:                            OLS   Adj. R-squared:                  0.467
Method:                 Least Squares   F-statistic:                     44.83
Date:                Tue, 28 Apr 2020   Prob (F-statistic):           1.97e-08
Time:                        17:31:46   Log-Likelihood:                -213.69
No. Observations:                  51   AIC:                             431.4
Df Residuals:                      49   BIC:                             435.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [203]:
# Filtered copy of `results`; `results` itself is left untouched in case the
# zero rows are wanted later.
# The filter keeps a county only if NO column equals zero, so it removes
# counties with zero nursing home patients and also counties with zero deaths.
# NaN != 0 evaluates to True, so rows with missing values pass this filter.
results2 = results[(results != 0).all(axis=1)]
results2

Unnamed: 0,patients,Pop,deaths,nursing_patients_per_000000,deaths_per_000000
Alameda,102.0,1666753,52.0,61.19683,31.198384
Contra Costa,81.0,1150215,25.0,70.421617,21.735067
Los Angeles,850.0,10105518,948.0,84.112462,93.810134
Marin,1.0,259666,12.0,3.851101,46.213212
Orange,100.0,3185968,39.0,31.387635,12.241178
Riverside,224.0,2450758,141.0,91.400293,57.53322
Sacramento,27.0,1540975,41.0,17.521374,26.606532
San Bernardino,157.0,2171603,82.0,72.296824,37.760125
San Diego,58.0,3343364,113.0,17.347797,33.798294
San Francisco,68.0,883305,23.0,76.983601,26.038571


In [204]:
# Same regression as before, fitted on the zero-filtered frame `results2`.
result2 = sm.ols(
    "deaths_per_000000 ~ nursing_patients_per_000000",
    data=results2,
).fit()
print(result2.params)

Intercept                      23.546896
nursing_patients_per_000000     0.263662
dtype: float64


In [205]:
# Print the full statsmodels summary table for the `result2` fit.
print(result2.summary())

                            OLS Regression Results                            
Dep. Variable:      deaths_per_000000   R-squared:                       0.375
Model:                            OLS   Adj. R-squared:                  0.339
Method:                 Least Squares   F-statistic:                     10.22
Date:                Tue, 28 Apr 2020   Prob (F-statistic):            0.00528
Time:                        17:33:53   Log-Likelihood:                -81.782
No. Observations:                  19   AIC:                             167.6
Df Residuals:                      17   BIC:                             169.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [206]:
# Reverse regression on `results2`: the nursing home patient rate is now the
# dependent variable and the death rate is the regressor.
result3 = sm.ols(
    "nursing_patients_per_000000 ~ deaths_per_000000",
    data=results2,
).fit()
print(result3.params)

Intercept            1.108342
deaths_per_000000    1.423921
dtype: float64


In [207]:
# Print the full statsmodels summary table for the `result3` fit.
print(result3.summary())

                                 OLS Regression Results                                
Dep. Variable:     nursing_patients_per_000000   R-squared:                       0.375
Model:                                     OLS   Adj. R-squared:                  0.339
Method:                          Least Squares   F-statistic:                     10.22
Date:                         Tue, 28 Apr 2020   Prob (F-statistic):            0.00528
Time:                                 17:35:36   Log-Likelihood:                -97.803
No. Observations:                           19   AIC:                             199.6
Df Residuals:                               17   BIC:                             201.5
Df Model:                                    1                                         
Covariance Type:                     nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------