# Construct county signal from facility data

https://healthdata.gov/Hospital/COVID-19-Reported-Patient-Impact-and-Hospital-Capa/anag-cw7u

In [1]:
# standard library
import pickle
from datetime import date, datetime, timedelta
from os import path

# third party
from delphi_utils import GeoMapper
from numpy import nan
from pandas import concat, DataFrame

# first party
from epidata import Epidata

## Retrieve geographic mapping and facility information

In [2]:
gmpr = GeoMapper()
us_states = set(gmpr.get_geo_values("state_id"))
excluded_states = {"fm", "gu", "vi", "as", "pw", 
                   "pr", "mp", "mh", "um", "dc"}
us_states = list(sorted(us_states - excluded_states))

fips_info = DataFrame({"fips": sorted(list(gmpr.get_geo_values("fips")))})
fips_info = gmpr.add_geocode(fips_info, "fips", "state", "fips", "state")
fips_info = fips_info[~fips_info.fips.str.endswith('000')]
fips_info = fips_info[fips_info.state_id.isin(us_states)]

In [3]:
# get all facility IDs
def extract_facility_info(state_id):
    facility_info = Epidata.check(Epidata.covid_hosp_facility_lookup(state=state_id))
    facility_info_df = []
    for facility in facility_info["epidata"]:
        facility_info_df.append(DataFrame(facility, index=[0]))
    return concat(facility_info_df)

facility_info_file = "./facility_info.p"
if path.exists(facility_info_file):
    facility_info = pickle.load(open(facility_info_file, "rb"))
else:
    facility_info = []
    for state_id in us_states:
        facility_info.append(extract_facility_info(state_id))
    facility_info = concat(facility_info, ignore_index=True)
    pickle.dump(facility_info, open(facility_info_file, "wb"))

## Retrieve data 

Example call to retrieve county signal on a particular collection date.

In [4]:
state_signal_name = "previous_day_admission_adult_covid_confirmed"
facility_signal_name = "previous_day_admission_adult_covid_confirmed_7_day_sum"
NA_code = -999999
def get_state_data(state_id, collection_date):
    """Calculate state signal, summed over the next week."""
    to_epidate = lambda x: int(str(x).replace("-", ""))
    next_week_date = to_epidate(datetime.strptime(str(collection_date), "%Y%m%d").date() + timedelta(6))
    res = Epidata.covid_hosp(state_id, Epidata.range(collection_date, next_week_date))
    if not len(res["epidata"]) == 7:
        return nan
    return sum([row[state_signal_name] for row in res["epidata"]])

def get_facility_data(state_id, collection_date, facility_info=facility_info):
    """Return facility data in a given state and collection week."""
    
    state_facility_ids = facility_info[facility_info.state.eq(state_id.upper())].hospital_pk.values
    
    # facility IDs should be 6 digits, but are occasionally smaller if leading 
    # zeros are dropped in preprocessing
    facility_ids = set([f.zfill(6) for f in state_facility_ids if len(f) <= 6])
    
    out = {}
    for facility_id in facility_ids:
        res = Epidata.covid_hosp_facility(facility_id, collection_date)
        if res["result"] != 1:
            print(facility_id, "failed")
            continue
        val = res["epidata"][0][facility_signal_name]
        val = nan if val == NA_code else val # censoring if count is less than 4
        out[facility_id] = val
    return out
        
    
state_id = "nc"
collection_date = 20201204
data = {**{state_id: get_state_data(state_id, collection_date)},
        **get_facility_data(state_id, collection_date)}
print(len(data))

34011F failed
112


In [5]:
# reformat data
data = DataFrame.from_dict(data, orient="index").reset_index()
data.columns = ["hospital_pk", "7_day_sum"]
data = data.merge(
    facility_info[["hospital_pk", "fips_code"]],
    how="left", on="hospital_pk",
)
data.fips_code = data.fips_code.apply(lambda x: str(x).zfill(5))
counties_in_state = fips_info[fips_info.state_id.eq(state_id)].fips.values
state_data = data[data.fips_code.isin(counties_in_state) & ~data["7_day_sum"].isnull()]
state_data = state_data.groupby("fips_code").sum().reset_index()

# calculate discrepancy between state total and facility report
missing_counts = (data[data.hospital_pk.eq(state_id)]["7_day_sum"] - state_data["7_day_sum"].sum())[0]

# unreported counties (through censoring or otherwise)
missing_counties = set(counties_in_state) - set(state_data.fips_code)

# disperse total across all missing counties
missing_data = DataFrame({
    "fips_code": list(missing_counties),
    "7_day_sum": missing_counts/len(missing_counties)
})

state_data = concat([state_data[["fips_code", "7_day_sum"]], missing_data], ignore_index=True)
state_data["collection_date"] = collection_date

In [6]:
state_data

Unnamed: 0,fips_code,7_day_sum,collection_date
0,37001,47.000000,20201204
1,37005,0.000000,20201204
2,37009,11.000000,20201204
3,37011,0.000000,20201204
4,37013,12.000000,20201204
...,...,...,...
95,37043,1.137931,20201204
96,37185,1.137931,20201204
97,37115,1.137931,20201204
98,37095,1.137931,20201204
