# Merge facility information

Merge facility data from the HCRIS (Healthcare Cost Reporting Information System) and DH (Definitive Healthcare) datasets.

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all modules
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from os.path import join

from covidcaremap.geo import spatial_join_facilities
from covidcaremap.data import processed_data_path

from covidcaremap.mapping import HospMap
from covidcaremap.merge import Matcher

In [None]:
# Read the geocoded HCRIS and DH facility files from the processed-data area.
hcris_geojson = processed_data_path('usa_facilities_hcris_geocoded.geojson')
dh_geojson = processed_data_path('dh_geocoded_v1_0326202.geojson')

hcris = gpd.read_file(hcris_geojson, encoding='utf-8')
dh = gpd.read_file(dh_geojson, encoding='utf-8')

In [None]:
# Give both datasets a STATE_NAME column, which the per-state matching
# below filters on. HCRIS columns are renamed in place.
hcris_column_renames = {'ST_ABBR': 'STATE_NAME', 'Zip_Code': 'ZIP_CODE'}
hcris.rename(columns=hcris_column_renames, inplace=True)

# DH keeps ST_ABBR and gains STATE_NAME as a copy of it.
dh['STATE_NAME'] = dh['ST_ABBR']

In [None]:
# Distinct STATE_NAME values found in HCRIS; match_by_state iterates over these.
all_states = pd.unique(hcris['STATE_NAME'])

In [None]:
def match_by_state(d1, d2, uid1, uid2, str_match_method, map_dir=None):
    """Match the facilities of two datasets one state at a time.

    For every state in the module-level ``all_states``, the rows of ``d1``
    and ``d2`` whose ``STATE_NAME`` equals that state are handed to a
    ``Matcher``. When ``map_dir`` is truthy, the matcher's map for that
    state is saved as ``<state>.html`` under ``processed_data_path(map_dir)``.

    Args:
        d1: First dataset; indexed by its ``STATE_NAME`` column here.
        d2: Second dataset; indexed by its ``STATE_NAME`` column here.
        uid1: Unique-id column name of ``d1``, passed to ``Matcher``.
        uid2: Unique-id column name of ``d2``, passed to ``Matcher``.
        str_match_method: Forwarded to ``Matcher.match_point_set``.
        map_dir: Optional directory name for the per-state HTML maps.

    Returns:
        A tuple ``(state_matches, ds)``. ``state_matches`` maps each state
        to its ``Matcher``. ``ds`` maps ``'d1_matched'``, ``'d2_matched'``,
        ``'d1_unmatched'``, ``'d2_unmatched'`` and ``'matching_dfs'`` to the
        concatenation of the per-state results, converted to ``epsg:4326``.
    """
    state_matches = {}
    for state in all_states:
        print('Matching facilities in {}'.format(state))
        if state in state_matches:
            continue

        in_state_1 = d1['STATE_NAME'] == state
        in_state_2 = d2['STATE_NAME'] == state
        matcher = Matcher(
            d1[in_state_1].reset_index().copy(),
            d2[in_state_2].reset_index().copy(),
            uid1,
            uid2,
        )
        matcher.match_point_set((100, 500), 10, str_match_method=str_match_method)

        if map_dir:
            state_map = matcher.map_all((uid1, uid2), ['match source', 'dist_apart'])
            state_map.add_layer_selector()
            state_map.save(join(processed_data_path('{}'.format(map_dir)),
                                '{}.html'.format(state)))

        state_matches[state] = matcher

    # Gather each kind of result across all states, in state order.
    matchers = list(state_matches.values())
    per_state = {
        'd1_matched': [m.d1_matched for m in matchers],
        'd2_matched': [m.d2_matched for m in matchers],
        'd1_unmatched': [m.d1_unmatched for m in matchers],
        'd2_unmatched': [m.d2_unmatched for m in matchers],
        'matching_dfs': [m.matching_key_df() for m in matchers],
    }

    ds = {}
    for key, frames in per_state.items():
        ds[key] = pd.concat(frames).to_crs('epsg:4326')

    return (state_matches, ds)

In [None]:
# Match HCRIS facilities (first dataset) against DH facilities (second
# dataset) state by state, saving per-state validation maps.
hcris_to_dh_state_matches, hcris_to_dh_data = match_by_state(
    hcris, dh, 'Provider Number', 'OBJECTID',
    'name', 'state_validation_maps_03-31-20_hcris-to-dh')

# The original assigned this to `matches` but then read `matched`, which
# raised NameError. `matches` is kept as an alias for any later cell using it.
matched = hcris_to_dh_data['matching_dfs']
matches = matched

# NOTE(review): the '.cvs' extension looks like a typo for '.csv'. It is left
# unchanged because other code may read this exact path; confirm, then rename.
matched.to_csv(processed_data_path('HCRIS_to_DH_matching_key.cvs'), index=False)

print('HCRIS to DH matches: {}'.format(len(matched)))
# This count is the unmatched HCRIS (d1) rows, so it is labelled accordingly.
print('HCRIS unmatched to DH: {}'.format(len(hcris_to_dh_data['d1_unmatched'])))

In [None]:
# Run the same matching in the opposite direction: DH is the first dataset
# and HCRIS the second, with maps saved to a separate directory.
dh_to_hcris_state_matches, dh_to_hcris_data = match_by_state(
    dh,
    hcris,
    'OBJECTID',
    'Provider Number',
    str_match_method='name',
    map_dir='state_validation_maps_03-31-20_dh-to-hcris',
)

#### Filter facilities with unusable data.


In [None]:
# `dh_gdf` and `hcris_gdf` were read here before ever being assigned, which
# raised NameError; they are now derived from the loaded `dh` / `hcris` frames.
# Keep only DH rows that have both a geometry and a NUM_LICENS value.
dh_gdf = dh[dh['geometry'].notna() & dh['NUM_LICENS'].notna()]
# Keep only HCRIS rows reporting a positive 'Total Staffed Beds'.
hcris_gdf = hcris[hcris['Total Staffed Beds'] > 0.0]

Perform the matching on facilities based on a spatial join and similarity score between address and name columns.

In [None]:
# Unique-identifier column of each dataset, keyed by dataset label.
id_columns = {'HCRIS': 'Provider Number', 'DH': 'OBJECTID'}

# Columns compared for similarity during the join, listed in the same
# order for both datasets (first entries pair up, second entries pair up).
similarity_columns = {
    'HCRIS': ['HOSP10_Name', 'Street_Addr'],
    'DH': ['HOSPITAL_N', 'HQ_ADDRESS'],
}

In [None]:
# Reduce each dataset to the columns the join uses: geometry, the id
# column, and the similarity columns.
hcris_join_columns = ['geometry', id_columns['HCRIS'], *similarity_columns['HCRIS']]
dh_join_columns = ['geometry', id_columns['DH'], *similarity_columns['DH']]

hcris_filtered_gdf = hcris_gdf[hcris_join_columns]
dh_filtered_gdf = dh_gdf[dh_join_columns]

In [None]:
# Join DH (left) to HCRIS (right) with the project's spatial join helper.
# NOTE(review): the two weights presumably apply to the two similarity
# columns in order, and `distance` is presumably in metres; confirm
# against spatial_join_facilities.
joined_dh_hcris = spatial_join_facilities(
    left=dh_filtered_gdf,
    right=hcris_filtered_gdf,
    lid_property=id_columns['DH'],
    rid_property=id_columns['HCRIS'],
    lsimilarity_properties=similarity_columns['DH'],
    rsimilarity_properties=similarity_columns['HCRIS'],
    similarity_weights=[0.6, 0.4],
    distance=1000,
    merge_unmatched=False,
)

#### Save off unmatched HCRIS data to be inspected 

In [None]:
# Provider Numbers that appear (non-null) in the join result.
matched_hcris = set(joined_dh_hcris['Provider Number'].dropna())
# Every Provider Number in the filtered HCRIS data.
total_hcris = set(hcris_gdf['Provider Number'])
# HCRIS rows whose Provider Number never made it into the join result.
unmatched_provider_numbers = total_hcris - matched_hcris
unmatched_hcris = hcris_gdf[
    hcris_gdf['Provider Number'].isin(unmatched_provider_numbers)
]

In [None]:
# Write the unmatched HCRIS rows to the processed-data area for inspection.
unmatched_hcris_csv = processed_data_path('hcris-unmatched-to-dh.csv')
unmatched_hcris.to_csv(unmatched_hcris_csv)

#### Save merged facility data

In [None]:
# Attach the DH attributes to every joined row, then left-join the HCRIS
# attributes so joined rows without an HCRIS match are retained.
joined_with_dh = joined_dh_hcris.merge(dh_gdf, on='OBJECTID')
full_df = joined_with_dh.merge(hcris_gdf, how='left', on='Provider Number')

# Drop the suffixed 'geometry_y' column and restore 'geometry_x' to the
# plain 'geometry' name.
full_df = (
    full_df
    .drop(columns=['geometry_y'])
    .rename(columns={'geometry_x': 'geometry'})
)
full_df = gpd.GeoDataFrame(full_df, crs=4326)

In [None]:
# Write the merged facility data as GeoJSON in the processed-data area.
merged_geojson = processed_data_path('dh_hcris_merged_facility_data.geojson')
full_df.to_file(merged_geojson, encoding='utf-8', driver='GeoJSON')