In [1]:
import pandas as pd
import os

## Count hospitalized patients in each hospital per week

In [2]:
hospitalizations_raw = pd.read_pickle('../../pickles/CDCpickles/hospitalizations_raw.pkl')
non_continental_states = ['AK', 'HI', 'MP', 'GU', 'VI', 'PR', 'AS']
hospitalizations_raw.head()

Unnamed: 0,hospital_pk,collection_week,state,ccn,hospital_name,address,city,zip,hospital_subtype,fips_code,...,previous_day_admission_pediatric_covid_confirmed_7_day_coverage,previous_day_admission_adult_covid_suspected_7_day_coverage,previous_day_admission_pediatric_covid_suspected_7_day_coverage,previous_week_personnel_covid_vaccinated_doses_administered_7_day,total_personnel_covid_vaccinated_doses_none_7_day,total_personnel_covid_vaccinated_doses_one_7_day,total_personnel_covid_vaccinated_doses_all_7_day,previous_week_patients_covid_vaccinated_doses_one_7_day,previous_week_patients_covid_vaccinated_doses_all_7_day,is_corrected
0,aab2bb3ab769da90baf57242c96ec481afb5ec6a233784...,2021/07/23,LA,,Crescent City Surgical Centre,,,70118.0,Short Term,,...,7,7,7,0.0,64.0,0.0,99.0,0.0,0.0,False
1,ee04edd185865c38c839812cb2eb5ae5d3f8922e3b629e...,2021/07/23,LA,,Alexandria Emergency Hospital,5900 Coliseum Blvd,Alexandria,71303.0,Short Term,,...,7,7,7,0.0,37.0,26.0,26.0,0.0,0.0,False
2,f70d7abf93c78280583ac18e896e6737cca8212b017513...,2021/07/23,NV,,Elite Medical Center,150 E Harmon Ave,Las Vegas,89109.0,Short Term,,...,7,7,7,,,,,,,False
3,3b081d5ef1c552538e4af4aa593a857bb922a4f364a412...,2021/07/23,LA,,Surgery Center of Zachary,,,70791.0,Short Term,,...,7,7,7,0.0,0.0,0.0,0.0,0.0,0.0,False
4,010108,2021/07/23,AL,10108.0,PRATTVILLE BAPTIST HOSPITAL,124 S MEMORIAL DR,PRATTVILLE,36067.0,Short Term,1001.0,...,7,7,7,0.0,221.0,0.0,186.0,0.0,0.0,False


In [3]:
# If my understanding is correct, the following 4 admission columns total the new hospitalized patients
# for that hospital (hospital_pk) for that week (collection_week)
admission_columns = ['previous_day_admission_adult_covid_confirmed_7_day_sum', 
                 'previous_day_admission_pediatric_covid_confirmed_7_day_sum', 
                 'previous_day_admission_adult_covid_suspected_7_day_sum', 
                 'previous_day_admission_pediatric_covid_suspected_7_day_sum']

In [4]:
hospitalizations_raw = hospitalizations_raw[~hospitalizations_raw['state'].isin(non_continental_states)]
hospitalizations = hospitalizations_raw[['hospital_pk', 'collection_week'] + admission_columns]

In [5]:
hospitalizations.head()

Unnamed: 0,hospital_pk,collection_week,previous_day_admission_adult_covid_confirmed_7_day_sum,previous_day_admission_pediatric_covid_confirmed_7_day_sum,previous_day_admission_adult_covid_suspected_7_day_sum,previous_day_admission_pediatric_covid_suspected_7_day_sum
0,aab2bb3ab769da90baf57242c96ec481afb5ec6a233784...,2021/07/23,0.0,0.0,0.0,0.0
1,ee04edd185865c38c839812cb2eb5ae5d3f8922e3b629e...,2021/07/23,0.0,0.0,0.0,0.0
2,f70d7abf93c78280583ac18e896e6737cca8212b017513...,2021/07/23,0.0,0.0,0.0,0.0
3,3b081d5ef1c552538e4af4aa593a857bb922a4f364a412...,2021/07/23,0.0,0.0,0.0,0.0
4,010108,2021/07/23,28.0,-999999.0,19.0,-999999.0


In [6]:
# Apparently values in '_sum' columns between 0 and 4 are replaced by -999999.0 (so they can be identified and tossed?)...
# I think the avg (i.e., 2.0) is better.
hospitalizations[admission_columns] = hospitalizations[admission_columns].replace(-999999.0, 2.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [7]:
# Ah, let's replace NaN's in admission columns with 0.0, just so we don't loose the whole row of cases
hospitalizations = hospitalizations.fillna(0)

# Then add up the admission columns to get a patient count total for that row
hospitalizations['patients'] = hospitalizations[admission_columns].sum(axis=1)
hospitalizations = hospitalizations.drop(admission_columns, axis=1)
hospitalizations['collection_week'] = pd.to_datetime(hospitalizations['collection_week'], format='%Y/%m/%d').dt.to_period('W-THU')
hospitalizations = hospitalizations.sort_values(['hospital_pk', 'collection_week'])
hospitalizations

Unnamed: 0,hospital_pk,collection_week,patients
254671,010001,2020-07-31/2020-08-06,26.0
249774,010001,2020-08-07/2020-08-13,46.0
244925,010001,2020-08-14/2020-08-20,36.0
240065,010001,2020-08-21/2020-08-27,40.0
235164,010001,2020-08-28/2020-09-03,39.0
...,...,...,...
215400,fa96db24d19f83993a2925454eafc6fa17dd8e64270534...,2020-09-25/2020-10-01,6.0
210430,fa96db24d19f83993a2925454eafc6fa17dd8e64270534...,2020-10-02/2020-10-08,4.0
205459,fa96db24d19f83993a2925454eafc6fa17dd8e64270534...,2020-10-09/2020-10-15,13.0
200474,fa96db24d19f83993a2925454eafc6fa17dd8e64270534...,2020-10-16/2020-10-22,2.0


In [8]:
hospitalizations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 254286 entries, 254671 to 195487
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype        
---  ------           --------------   -----        
 0   hospital_pk      254286 non-null  object       
 1   collection_week  254286 non-null  period[W-THU]
 2   patients         254286 non-null  float64      
dtypes: float64(1), object(1), period[W-THU](1)
memory usage: 7.8+ MB


In [9]:
# Sanity check on total number of Covid 19 hospital patients
hospitalizations['patients'].sum()

# We see here 4.7M for Aug 2020 through Jul 2021
# In comparison, CDC estimates 6.2 M over entire pandemic, so we seem to be in the ballpark

4612874.0

In [10]:
if os.path.exists('../../pickles/CDCpickles/hospitalizations.pkl'):
    os.remove('../../pickles/CDCpickles/hospitalizations.pkl')
hospitalizations.to_pickle('../../pickles/CDCpickles/hospitalizations.pkl')