# Merge facility information

Merge facility data from the HCRIS (Healthcare Cost Reporting Information System) and DH (Definitive Healthcare) datasets.

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the covidcaremap helpers used below) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import pandas as pd
import geopandas as gpd
import numpy as np

from covidcaremap.geo import spatial_join_facilities
from covidcaremap.data import processed_data_path

from covidcaremap.mapping import HospMap
from covidcaremap.merge import Matcher

In [4]:
# Read the two facility datasets from the processed-data directory.
# `hcris` holds the HCRIS hospital-bed records, `dh` the geocoded DH records.
hcris_path = processed_data_path('usa_hospital_beds_hcris2018.geojson')
hcris = gpd.read_file(hcris_path, encoding='utf-8')

dh_path = processed_data_path('dh_geocoded_v1_0326202.geojson')
dh = gpd.read_file(dh_path, encoding='utf-8')

In [5]:
# Give both datasets a common STATE_NAME column so they can be split by state
# in the matching step; the other HCRIS location columns are renamed to the
# same upper-case convention.
hcris_location_renames = {
    'State': 'STATE_NAME',
    'County': 'COUNTY_NAME',
    'City': 'CITY_NAME',
    'Zip_Code': 'ZIP_CODE',
}
hcris = hcris.rename(columns=hcris_location_renames)

# DH stores the state in ST_ABBR; expose it under the shared column name.
dh = dh.assign(STATE_NAME=dh['ST_ABBR'])

In [6]:
# Match HCRIS facilities to DH facilities one state at a time. For every state
# the fitted Matcher is kept in `state_matches` and a validation map is saved
# as an HTML file under state_validation_maps/.
state_matches = {}
for state in hcris['STATE_NAME'].unique():
    print('Matching facilities in {}'.format(state))
    if state in state_matches:
        continue

    hcris_in_state = hcris.loc[hcris['STATE_NAME'] == state].reset_index().copy()
    dh_in_state = dh.loc[dh['STATE_NAME'] == state].reset_index().copy()

    # 'Provider Number' / 'OBJECTID' are the id columns of the HCRIS / DH frames.
    matcher = Matcher(hcris_in_state, dh_in_state, 'Provider Number', 'OBJECTID')
    # NOTE(review): (100, 500) and 10 are passed through unchanged; their meaning
    # is defined by Matcher.match_point_set (presumably distance thresholds) - confirm.
    matcher.match_point_set((100, 500), 10)

    validation_map = matcher.map_all(('HCRIS', 'DH'), ['match source', 'dist_apart'])
    validation_map.add_layer_selector()
    validation_map.save(processed_data_path('state_validation_maps/{}.html'.format(state)))

    state_matches[state] = matcher

Matching facilities in AL
Completed matching and deduping facilities, matched 109 of 136
Matching facilities in AK
Completed matching and deduping facilities, matched 24 of 26
Matching facilities in AZ
Completed matching and deduping facilities, matched 106 of 125
Matching facilities in AR
Completed matching and deduping facilities, matched 97 of 109
Matching facilities in CA
Completed [250] of 445 facilities, prelim matched 237
Completed matching and deduping facilities, matched 403 of 445
Matching facilities in CO
Completed matching and deduping facilities, matched 101 of 110
Matching facilities in CT
Completed matching and deduping facilities, matched 40 of 43
Matching facilities in DE
Completed matching and deduping facilities, matched 14 of 16
Matching facilities in DC
Completed matching and deduping facilities, matched 13 of 13
Matching facilities in FL
Completed [250] of 273 facilities, prelim matched 239
Completed matching and deduping facilities, matched 248 of 273
Matching fa

In [None]:
# Stack the per-state matcher results into four nationwide frames
# (matched / unmatched for each dataset) and reproject them to EPSG:4326.
state_matchers = list(state_matches.values())

hcris_matched = pd.concat(
    [matcher.d1_matched for matcher in state_matchers]
).to_crs('epsg:4326')
dh_matched = pd.concat(
    [matcher.d2_matched for matcher in state_matchers]
).to_crs('epsg:4326')
hcris_unmatched = pd.concat(
    [matcher.d1_unmatched for matcher in state_matchers]
).to_crs('epsg:4326')
dh_unmatched = pd.concat(
    [matcher.d2_unmatched for matcher in state_matchers]
).to_crs('epsg:4326')

In [None]:
# Build one overview map: a point layer per subset (matched layers also carry
# the match source and distance labels), then connections between each
# matched HCRIS/DH pair.
m = HospMap()
point_layers = (
    (hcris_matched, 'hcris - matched', 'blue', {'addl_labels': ['match source', 'dist_apart']}),
    (dh_matched, 'dh - matched', 'red', {'addl_labels': ['match source', 'dist_apart']}),
    (dh_unmatched, 'dh - unmatched', 'orange', {}),
    (hcris_unmatched, 'hcris - unmatched', 'purple', {}),
)
for layer_frame, layer_name, layer_color, layer_options in point_layers:
    m.add_point_subset(layer_frame, layer_name, layer_color, **layer_options)
m.add_connections(hcris_matched, dh_matched)

#### Filter facilities with unusable data.


In [None]:
# Derive the filtered frames from the `dh` / `hcris` frames loaded at the top
# of this notebook. This cell used to filter `dh_gdf` / `hcris_gdf` in terms of
# themselves, but neither name is assigned in any earlier visible cell, so it
# raised NameError on a fresh kernel (Restart & Run All). Building them from
# the loaded frames also makes the cell idempotent.
# NOTE(review): by this point `hcris` has had State/County/City/Zip_Code
# renamed and `dh` has gained a STATE_NAME column, so those column names carry
# through to the merged output further down - confirm that is acceptable.

# Keep only DH facilities that have both a geometry and a NUM_LICENS value.
dh_gdf = dh[dh['geometry'].notna() & dh['NUM_LICENS'].notna()]

# Keep only HCRIS facilities reporting a positive 'Total Staffed Beds'.
hcris_gdf = hcris[hcris['Total Staffed Beds'] > 0.0]

Match facilities using a spatial join combined with a similarity score computed over the name and address columns.

In [None]:
# Column configuration for the spatial-join matching, keyed by dataset.

# Column holding each dataset's facility identifier.
id_columns = {'HCRIS': 'Provider Number', 'DH': 'OBJECTID'}

# Columns compared when scoring a candidate match.
# NOTE(review): the two lists look positionally paired (name with name,
# address with address) - confirm against spatial_join_facilities.
similarity_columns = {
    'HCRIS': ['HOSP10_Name', 'Street_Addr'],
    'DH': ['HOSPITAL_N', 'HQ_ADDRESS'],
}

In [None]:
# Trim each frame down to what the join needs: geometry, the id column and
# the similarity columns.
hcris_join_columns = ['geometry', id_columns['HCRIS'], *similarity_columns['HCRIS']]
hcris_filtered_gdf = hcris_gdf[hcris_join_columns]

dh_join_columns = ['geometry', id_columns['DH'], *similarity_columns['DH']]
dh_filtered_gdf = dh_gdf[dh_join_columns]

In [None]:
# Join DH (left) to HCRIS (right) with the project's spatial_join_facilities
# helper, using the id and similarity columns configured above.
# NOTE(review): the unit of `distance=1000` and the exact effect of
# `merge_unmatched=False` are defined in covidcaremap.geo - confirm there.
joined_dh_hcris = spatial_join_facilities(
    left=dh_filtered_gdf,
    right=hcris_filtered_gdf,
    lid_property=id_columns['DH'],
    rid_property=id_columns['HCRIS'],
    lsimilarity_properties=similarity_columns['DH'],
    rsimilarity_properties=similarity_columns['HCRIS'],
    similarity_weights=[0.6, 0.4],
    distance=1000,
    merge_unmatched=False,
)

#### Save unmatched HCRIS data for inspection

In [None]:
# Provider Numbers present in the join result are the matched HCRIS
# facilities; every other HCRIS facility counts as unmatched.
joined_provider_numbers = joined_dh_hcris['Provider Number'].dropna()
matched_hcris = set(joined_provider_numbers)
total_hcris = set(hcris_gdf['Provider Number'])

unmatched_provider_numbers = total_hcris - matched_hcris
unmatched_hcris = hcris_gdf[hcris_gdf['Provider Number'].isin(unmatched_provider_numbers)]

In [None]:
# Write the unmatched HCRIS facilities to a CSV in the processed-data directory.
unmatched_csv_path = processed_data_path('hcris-unmatched-to-dh.csv')
unmatched_hcris.to_csv(unmatched_csv_path)

#### Save merged facility data

In [None]:
# Attach the full DH record (inner join on OBJECTID) and the full HCRIS record
# (left join on Provider Number) to every joined pair, then keep a single
# geometry column and wrap the result as a GeoDataFrame in EPSG:4326.
# NOTE(review): geometry_x / geometry_y come from pandas' merge suffixes;
# geometry_x is presumably the DH geometry - confirm.
merged_records = (
    joined_dh_hcris
    .merge(dh_gdf, on='OBJECTID')
    .merge(hcris_gdf, how='left', on='Provider Number')
    .drop(columns=['geometry_y'])
    .rename({'geometry_x': 'geometry'}, axis=1)
)
full_df = gpd.GeoDataFrame(merged_records, crs=4326)

In [None]:
# Write the merged facility data to the processed-data directory as GeoJSON.
merged_geojson_path = processed_data_path('dh_hcris_merged_facility_data.geojson')
full_df.to_file(merged_geojson_path, encoding='utf-8', driver='GeoJSON')