# Construct county signal from facility data

https://healthdata.gov/Hospital/COVID-19-Reported-Patient-Impact-and-Hospital-Capa/anag-cw7u

There is a 10-day delay between the collection date and the publication date. The collection date is a Friday that marks the start of a 7-day reporting week, so the effective delay between the end of that week and publication is 3 days.

In [1]:
# standard library
import pickle
from datetime import date, datetime, timedelta
from os import path

# third party
from delphi_utils import GeoMapper
from numpy import nan
from pandas import concat, DataFrame

# first party
from epidata import Epidata

## Retrieve geographic mapping and facility information

In [2]:
# Geographic lookups: the list of state ids to process and the county FIPS
# codes that belong to them.
gmpr = GeoMapper()

# state_id codes left out of the analysis (going by the codes, these are the
# territories plus DC).
excluded_states = {
    "fm", "gu", "vi", "as", "pw",
    "pr", "mp", "mh", "um", "dc",
}
us_states = sorted(set(gmpr.get_geo_values("state_id")) - excluded_states)

# One row per FIPS code, annotated with its state by the GeoMapper.
fips_info = DataFrame({"fips": sorted(gmpr.get_geo_values("fips"))})
fips_info = gmpr.add_geocode(fips_info, "fips", "state", "fips", "state")

# Keep only codes that do not end in "000" (presumably state-level placeholder
# codes rather than real counties -- TODO confirm) and that fall in a kept state.
fips_info = fips_info[
    ~fips_info.fips.str.endswith("000") & fips_info.state_id.isin(us_states)
]

In [3]:
# get all facility IDs
def extract_facility_info(state_id):
    """Return the facility lookup records for one state as a DataFrame.

    Calls ``Epidata.covid_hosp_facility_lookup`` for ``state_id``, passes the
    response through ``Epidata.check``, and turns each entry of its
    ``"epidata"`` list into a one-row frame.

    Parameters
    ----------
    state_id : str
        State identifier forwarded to the lookup endpoint.

    Returns
    -------
    DataFrame
        One row per facility. Every row keeps index label 0, so callers that
        need unique labels should re-index (e.g. ``concat(..., ignore_index=True)``).
    """
    response = Epidata.check(Epidata.covid_hosp_facility_lookup(state=state_id))
    one_row_frames = [DataFrame(record, index=[0]) for record in response["epidata"]]
    return concat(one_row_frames)

# Facility metadata for every state in `us_states`, cached on disk so the
# lookup endpoint is only queried the first time the notebook runs.
facility_info_file = "./facility_info.p"
if path.exists(facility_info_file):
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # this notebook wrote itself.
    with open(facility_info_file, "rb") as cache_file:
        facility_info = pickle.load(cache_file)
else:
    facility_info = concat(
        [extract_facility_info(state_id) for state_id in us_states],
        ignore_index=True,
    )
    with open(facility_info_file, "wb") as cache_file:
        pickle.dump(facility_info, cache_file)

## Retrieve data 

Example call to retrieve county signal on a particular collection date.

In [6]:
def to_date(x):
    """Convert a ``YYYYMMDD`` value (int or str) to a ``datetime.date``."""
    return datetime.strptime(str(x), "%Y%m%d").date()


def to_epidate(x):
    """Convert a date (or its ``YYYY-MM-DD`` string form) to a ``YYYYMMDD`` int."""
    return int(str(x).replace("-", ""))


# Field read from each state-level row (a daily value).
state_signal_name = "previous_day_admission_adult_covid_confirmed"
# Field read from each facility-level row (already a 7-day sum).
facility_signal_name = "previous_day_admission_adult_covid_confirmed_7_day_sum"
# Sentinel that facility rows carry in place of a censored count.
NA_code = -999999
# Publication lags in days; not referenced by the code visible in this notebook.
state_signal_lag = 3
facility_signal_lag = 10

def get_state_data(state_id, collection_date, issue_date):
    """Return the state signal summed over the week starting at ``collection_date``.

    Queries ``Epidata.covid_hosp`` for the seven reference dates
    ``collection_date`` .. ``collection_date + 6 days`` and for every issue
    between ``collection_date`` and ``issue_date``. For each reference date the
    value from the most recent issue is kept, and the seven values are summed.

    Parameters
    ----------
    state_id : str
        State identifier passed to the API.
    collection_date : int
        First day of the week, as ``YYYYMMDD``.
    issue_date : int
        Last issue to consider, as ``YYYYMMDD``.

    Returns
    -------
    number or nan
        The weekly sum, or ``nan`` when the API response's ``"result"`` is not 1.

    Raises
    ------
    AssertionError
        If the response does not cover exactly seven distinct reference dates.
    """
    week_start = datetime.strptime(str(collection_date), "%Y%m%d").date()
    week_end = int((week_start + timedelta(6)).strftime("%Y%m%d"))
    res = Epidata.covid_hosp(
        state_id,
        Epidata.range(collection_date, week_end),
        issues=Epidata.range(collection_date, issue_date),
    )
    if res["result"] != 1:
        return nan

    values = {}
    latest_issue = {}
    for row in res["epidata"]:
        # `row_issue` is deliberately not called `issue_date`, so the
        # function parameter is not overwritten inside the loop.
        ref_date, row_issue = row["date"], row["issue"]
        if ref_date not in latest_issue or row_issue > latest_issue[ref_date]:
            values[ref_date] = row[state_signal_name]
            latest_issue[ref_date] = row_issue
    assert len(values) == 7, f"expected 7 reference dates, got {len(values)}"
    return sum(values.values())

def get_facility_data(state_id, collection_date, publication_date, facility_info=facility_info):
    """Fetch the facility-level signal for every facility in one state.

    Parameters
    ----------
    state_id : str
        State identifier; upper-cased before matching ``facility_info.state``.
    collection_date : int
        Collection week to request, as ``YYYYMMDD``.
    publication_date : int
        Last publication date to consider, as ``YYYYMMDD``.
    facility_info : DataFrame
        Facility table with at least ``state`` and ``hospital_pk`` columns.

    Returns
    -------
    dict
        Maps each zero-padded facility id to its value, with the ``NA_code``
        sentinel replaced by ``nan``. Facilities whose request did not return
        ``result == 1`` are reported on stdout and left out.
    """
    in_state = facility_info.state.eq(state_id.upper())
    raw_ids = facility_info[in_state].hospital_pk.values

    # Ids are expected to be 6 characters long; shorter ones (leading zeros
    # lost in preprocessing) are padded back, longer ones are dropped.
    padded_ids = {raw_id.zfill(6) for raw_id in raw_ids if len(raw_id) <= 6}

    values_by_facility = {}
    for facility_id in padded_ids:
        res = Epidata.covid_hosp_facility(
            facility_id,
            collection_date,
            publication_dates=Epidata.range(collection_date, publication_date),
        )
        if res["result"] == 1:
            # Only the first returned row is used.
            reported = res["epidata"][0][facility_signal_name]
            # The sentinel marks a censored value (count less than 4).
            values_by_facility[facility_id] = nan if reported == NA_code else reported
        else:
            print(facility_id, "failed")

    return values_by_facility

In [7]:
def construct_county_signal(state_id, data, fips_info=fips_info):
    """Build a county-level 7-day signal for one state.

    Facility values are summed per county. The gap between the state total and
    the sum of the reported county values is then split evenly across the
    counties that have at least one censored (null) facility value.

    Parameters
    ----------
    state_id : str
        State identifier; ``data`` must contain the state total under this key.
    data : dict
        Maps ``state_id`` to the state total and each facility id
        (``hospital_pk``) to that facility's 7-day sum (``nan`` when censored).
    fips_info : DataFrame
        County table with ``fips`` and ``state_id`` columns.

    Returns
    -------
    DataFrame
        Columns ``fips_code`` and ``7_day_sum``: reported counties first,
        followed by the counties that received an imputed share.
    """
    # reformat data
    data = DataFrame.from_dict(data, orient="index").reset_index()
    data.columns = ["hospital_pk", "7_day_sum"]
    data = data.merge(
        facility_info[["hospital_pk", "fips_code"]],
        how="left", on="hospital_pk",
    )
    data.fips_code = data.fips_code.apply(lambda x: str(x).zfill(5))
    counties_in_state = fips_info[fips_info.state_id.eq(state_id)].fips.values

    # unreported counties (through censoring)
    # NOTE(review): this set is not restricted to `counties_in_state`, so it can
    # contain placeholder strings produced by zero-padding a missing FIPS code.
    # A county with both censored and reporting facilities ends up in the
    # result twice (reported sum + imputed share). Confirm this is intended.
    is_censored = data["7_day_sum"].isnull()
    missing_counties = set(data[is_censored].fips_code)
    state_data = data[data.fips_code.isin(counties_in_state) & ~is_censored]

    # group facilities in same county; aggregate only the numeric column so the
    # string `hospital_pk` column is not summed along the way
    state_data = state_data.groupby("fips_code")["7_day_sum"].sum().reset_index()

    # calculate discrepancy between state total and facility report; the state
    # row is taken by position so it does not have to carry index label 0
    state_total = data.loc[data.hospital_pk.eq(state_id), "7_day_sum"].iloc[0]
    missing_counts = state_total - state_data["7_day_sum"].sum()

    # nothing was censored: there is no county to receive the remainder, and
    # dividing by zero below would be meaningless
    if not missing_counties:
        return state_data

    # disperse total across all missing counties
    missing_data = DataFrame({
        "fips_code": list(missing_counties),
        "7_day_sum": missing_counts / len(missing_counties),
    })

    return concat([state_data, missing_data], ignore_index=True)

# Worked example: one state, one collection week.
state_id = "ak"
collection_date = 20201204
publication_date = 20201214

# The state total is stored under the state id, next to the per-facility values.
data = {
    state_id: get_state_data(state_id, collection_date, publication_date),
    **get_facility_data(state_id, collection_date, publication_date),
}
out_df = construct_county_signal(state_id, data).assign(
    collection_date=to_date(collection_date),
    as_of=to_date(publication_date),
)

In [8]:
out_df

Unnamed: 0,fips_code,7_day_sum,collection_date,as_of
0,2020,45.0,2020-12-04,2020-12-14
1,2090,11.0,2020-12-04,2020-12-14
2,2150,6.0,2020-12-04,2020-12-14
3,2170,17.0,2020-12-04,2020-12-14
4,2080,3.875,2020-12-04,2020-12-14
5,2280,3.875,2020-12-04,2020-12-14
6,2120,3.875,2020-12-04,2020-12-14
7,2210,3.875,2020-12-04,2020-12-14
8,2130,3.875,2020-12-04,2020-12-14
9,2260,3.875,2020-12-04,2020-12-14


In [None]:
# Build and pickle one county-signal frame per (collection week, state).
# County data is only released on Fridays, so collection dates advance one
# week at a time from the first Friday, `start_date`.
n_weeks = 20
start_date = date(2020, 12, 4)
fridays = [to_epidate(start_date + timedelta(weeks=i)) for i in range(n_weeks)]
for collection_date in fridays:
    # publication happens 10 days after the collection date
    publication_date = to_epidate(to_date(collection_date) + timedelta(10))
    print(collection_date, publication_date)
    for state_id in us_states:
        data = {
            state_id: get_state_data(state_id, collection_date, publication_date),
            **get_facility_data(state_id, collection_date, publication_date),
        }
        out_df = construct_county_signal(state_id, data)
        out_df["collection_date"] = to_date(collection_date)
        out_df["as_of"] = to_date(publication_date)
        # close the output file deterministically instead of leaking the handle
        with open(f"./{state_id}_{collection_date}_{publication_date}.p", "wb") as out_file:
            pickle.dump(out_df, out_file)

20201204 20201214
010018 failed
010175 failed
040155 failed
040161 failed
030147 failed
030148 failed
030146 failed
050189 failed
050785 failed
555143 failed
050205 failed
05T782 failed
062009 failed
062013 failed
062011 failed
062014 failed
070039 failed
070040 failed
082000 failed
