# Merge facility information

Merge facility data from HCRIS (Healthcare Cost Reporting Information System) and DH (Definitive Healthcare) datasets.

In [None]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
from os.path import join, isdir

from covidcaremap.geo import spatial_join_facilities
from covidcaremap.data import (processed_data_path, 
                               external_data_path,
                               local_data_path)
from covidcaremap.mapping import HospMap
from covidcaremap.merge import Matcher

Load the geocoded HCRIS and DH datasets.

In [None]:
# Geocoded HCRIS facilities, read from the processed-data directory.
hcris = gpd.read_file(
    processed_data_path('usa_facilities_hcris_geocoded.geojson'),
    encoding='utf-8',
)

# Geocoded DH facilities, read from the processed-data directory.
dh = gpd.read_file(
    processed_data_path('dh_geocoded_v1_0326202.geojson'),
    encoding='utf-8',
)

Rename columns to match between datasets

In [None]:
# HCRIS: expose the state and ZIP columns under the shared names.
hcris = hcris.rename(columns={
    'ST_ABBR': 'STATE_NAME',
    'Zip_Code': 'ZIP_CODE',
})

# DH: keep ST_ABBR and publish the same values under STATE_NAME as well.
dh['STATE_NAME'] = dh['ST_ABBR']

List of states to iterate over

In [None]:
# Distinct STATE_NAME values found in HCRIS, in order of first appearance.
all_states = pd.unique(hcris['STATE_NAME'])

For each dataset, collect the dataframe, its display name, and the name of its unique-ID column for matching

In [None]:
# Each tuple is (dataframe, dataset label, unique-ID column), the layout
# that match_by_state unpacks.
hcris_info = hcris, 'HCRIS', 'Provider Number'
dh_info = dh, 'DH', 'OBJECTID'

In [None]:
def match_by_state(facility_info_1, facility_info_2, map_dir=None, str_match_method='name', states=None):
    """Match facilities between two datasets, one state at a time.

    Parameters
    ----------
    facility_info_1, facility_info_2 : tuple
        ``(dataframe, dataset_name, uid_column)`` triples. Each dataframe
        must contain a ``STATE_NAME`` column, which is used to split the
        data by state.
    map_dir : str, optional
        Directory name, resolved with ``processed_data_path``, that receives
        one ``<state>.html`` map per state. When falsy, no directory is
        created and no maps are written.
    str_match_method : str, optional
        Forwarded unchanged to ``Matcher.match_point_set``.
    states : iterable, optional
        States to process. Defaults to the notebook-level ``all_states``.
        Repeated values are processed only once.

    Returns
    -------
    tuple
        ``(state_matches, ds)`` where ``state_matches`` maps each state to
        its ``Matcher`` and ``ds`` maps the keys ``'<name>_matched'``,
        ``'<name>_unmatched'`` (for both dataset names) and
        ``'matching_dfs'`` to the per-state results concatenated over all
        states.

    Raises
    ------
    ValueError
        If there are no states to process.
    """
    df1, name1, uid1 = facility_info_1
    df2, name2, uid2 = facility_info_2

    if states is None:
        states = all_states

    # Only touch the filesystem when maps were actually requested; the
    # default map_dir=None must not be passed to the path helper.
    map_path = None
    if map_dir:
        map_path = processed_data_path(map_dir)
        os.makedirs(map_path, exist_ok=True)

    state_matches = {}
    for state in states:
        if state in state_matches:
            continue
        print('Matching facilities in {}'.format(state))
        df1_s = df1[df1['STATE_NAME'] == state].reset_index().copy()
        df2_s = df2[df2['STATE_NAME'] == state].reset_index().copy()
        m = Matcher(df1_s, df2_s, uid1, uid2)
        m.match_point_set((100, 500), n=10, str_match_method=str_match_method)
        if map_path is not None:
            all_map = m.map_all((name1, name2), ['match source', 'dist_apart'])
            all_map.add_layer_selector()
            all_map.save(join(map_path, '{}.html'.format(state)))
        state_matches[state] = m

    if not state_matches:
        # pd.concat would fail below with a far less helpful message.
        raise ValueError('No states to match: the list of states is empty.')

    ds = {
        f'{name1}_matched': [],
        f'{name2}_matched': [],
        f'{name1}_unmatched': [],
        f'{name2}_unmatched': [],
        'matching_dfs': []
    }

    # Gather the per-state pieces from every Matcher.
    for matcher in state_matches.values():
        ds[f'{name1}_matched'].append(matcher.d1_matched)
        ds[f'{name2}_matched'].append(matcher.d2_matched)
        ds[f'{name1}_unmatched'].append(matcher.d1_unmatched)
        ds[f'{name2}_unmatched'].append(matcher.d2_unmatched)
        ds['matching_dfs'].append(matcher.matching_key_df())

    # Stack the per-state pieces; geographic results are reprojected to
    # EPSG:4326.
    for key in list(ds):
        combined = pd.concat(ds[key])
        if isinstance(combined, gpd.GeoDataFrame):
            combined = combined.to_crs('epsg:4326')
        ds[key] = combined

    print('------------')
    n_matched = len(ds['matching_dfs'])
    n_unmatched = len(ds[f'{name1}_unmatched'])
    n_total = n_matched + n_unmatched
    # Guard against empty inputs, which would otherwise divide by zero.
    pct_matched = round((n_matched / n_total) * 100, 1) if n_total else 0.0
    print(f'{name1} to {name2} matches: {pct_matched}% ({n_matched} of {n_total})')

    return state_matches, ds

Match the two datasets state by state, writing out a folium map for each state

In [None]:
# Run the state-by-state matching; per-state HTML maps are saved under the
# given directory name.
hcris_to_dh_matches, hcris_to_dh_data = match_by_state(
    facility_info_1=hcris_info,
    facility_info_2=dh_info,
    map_dir='state_validation_maps_03-31-21_hcris-to-dh',
)

# Cast the combined matching key to strings and save it as a CSV.
HtoD_matches = hcris_to_dh_data['matching_dfs'].astype(str)
HtoD_matches.to_csv(
    local_data_path('HCRIS_to_DH_matching_key.csv'),
    index=False,
)

### Combine into one dataset

Some field names changed during geocoding, so we import the original datasets and join them using the newly created matching key. This ensures that the column names in the merged output are consistent with the originals.

In [None]:
# Read both original (pre-geocoding) datasets.
hcris_original = gpd.read_file(
    processed_data_path('usa_hospital_beds_hcris2018.geojson'),
    encoding='utf-8',
)
dh_original = gpd.read_file(
    external_data_path('dh_facility_data.geojson'),
    encoding='utf-8',
)

# Cast the join keys to strings, the same type as the stringified matching key.
hcris_original['Provider Number'] = hcris_original['Provider Number'].astype(str)
dh_original['OBJECTID'] = dh_original['OBJECTID'].astype(str)

Add the `Provider Number` (HCRIS unique ID field) to the DH dataset by joining the key to it on `OBJECTID`

In [None]:
# Left join: every DH row is kept; rows without a match get no Provider Number.
dh_with_pn = pd.merge(
    left=dh_original,
    right=HtoD_matches[['OBJECTID', 'Provider Number']],
    on='OBJECTID',
    how='left',
)

Then join the HCRIS dataset to the updated DH dataset on `Provider Number`

In [None]:
# Left join the HCRIS attributes (geometry column dropped) onto DH; HCRIS
# columns whose names clash with DH columns get the '_HCRIS' suffix.
full_df = pd.merge(
    left=dh_with_pn,
    right=hcris_original.drop(columns='geometry'),
    on='Provider Number',
    how='left',
    suffixes=('', '_HCRIS'),
)

Select the necessary columns by name

In [None]:
# Columns kept in the merged output, in output order.
cols = [
    'OBJECTID',
    'Provider Number',
    'HOSPITAL_N',
    'HOSPITAL_T',
    'HQ_ADDRESS',
    'HQ_ADDRE_1',
    'HQ_CITY',
    'HQ_STATE',
    'HQ_ZIP_COD',
    'COUNTY_NAM',
    'STATE_NAME',
    'STATE_FIPS',
    'CNTY_FIPS',
    'FIPS',
    'NUM_LICENS',
    'NUM_STAFFE',
    'NUM_ICU_BE',
    'BED_UTILIZ',
    'Potential_',
    'FYB',
    'FYE',
    'STATUS',
    'CTRL_TYPE',
    'HOSP10_Name',
    'Street_Addr',
    'PO_Box',
    'City',
    'State',
    'Zip_Code',
    'County',
    'Hospital Adult and Peds Staffed Beds',
    'Hospital Adult and Peds Bed Days Available',
    'Hospital Adult and Peds Inpatient Days',
    'Intensive Care Unit Staffed Beds',
    'Intensive Care Unit Bed Days Available',
    'Intensive Care Unit Inpatient Days',
    'Coronary Care Unit Staffed Beds',
    'Coronary Care Unit Bed Days Available',
    'Coronary Care Unit Inpatient Days',
    'Burn ICU Staffed Beds',
    'Burn ICU Bed Days Available',
    'Burn ICU Inpatient Days',
    'Surgical ICU Staffed Beds',
    'Surgical ICU Bed Days Available',
    'Surgical ICU Inpatient Days',
    'Total Staffed Beds',
    'Total Bed Days Available',
    'Total Inpatient Days',
    'ICU Total Staffed Beds',
    'ICU Total Bed Days Available',
    'ICU Total Inpatient Days',
    'ICU Occupancy Rate',
    'Total Bed Occupancy Rate',
    'geometry',
]

# Restrict the merged frame to exactly those columns.
full_df = full_df[cols]

Write out the merged GeoJSON

In [None]:
# Save the merged facilities as GeoJSON in the processed-data directory.
full_df.to_file(
    processed_data_path('dh_hcris_merged_facility_data.geojson'),
    driver='GeoJSON',
    encoding='utf-8',
)