# Merge facility information

Merge facility data from the HCRIS (Healthcare Cost Reporting Information System) and DH (Definitive Healthcare) datasets.

In [43]:
import pandas as pd
import geopandas as gpd

from hscap.geo import spatial_join_facilities

In [44]:
# Load the HCRIS and DH facility layers from GeoJSON into GeoDataFrames.
hcris_gdf = gpd.read_file(
    "../data/usa_hospital_beds_hcris2018_v2.geojson",
    encoding="utf-8",
)
dh_gdf = gpd.read_file(
    "../data/dh_facility_data.geojson",
    encoding="utf-8",
)

#### Filter out facilities with unusable data.


In [47]:
# DH: keep only rows that have both a geometry and a NUM_LICENS value.
dh_has_geometry = dh_gdf["geometry"].notna()
dh_has_num_licens = dh_gdf["NUM_LICENS"].notna()
dh_gdf = dh_gdf[dh_has_geometry & dh_has_num_licens]

# HCRIS: keep only rows with a strictly positive 'Total Staffed Beds' value
# (rows where the value is missing compare False and are dropped as well).
hcris_has_staffed_beds = hcris_gdf["Total Staffed Beds"] > 0.0
hcris_gdf = hcris_gdf[hcris_has_staffed_beds]

Match facilities across the two datasets using a spatial join combined with a similarity score computed over the name and address columns.

In [48]:
# Column holding each dataset's facility identifier.
id_columns = {
    "HCRIS": "Provider Number",
    "DH": "OBJECTID",
}

# Columns compared for the similarity score, listed per dataset in the same
# order: facility name first, then street address.
similarity_columns = {
    "HCRIS": ["HOSP10_Name", "Street_Addr"],
    "DH": ["HOSPITAL_N", "HQ_ADDRESS"],
}

In [49]:
# Reduce each dataset to the columns passed to the matching step:
# the geometry, the identifier column and the similarity columns.
hcris_match_columns = ["geometry", id_columns["HCRIS"], *similarity_columns["HCRIS"]]
dh_match_columns = ["geometry", id_columns["DH"], *similarity_columns["DH"]]

hcris_filtered_gdf = hcris_gdf[hcris_match_columns]
dh_filtered_gdf = dh_gdf[dh_match_columns]

In [50]:
# Match DH facilities (left) against HCRIS facilities (right).
# NOTE(review): the weights presumably pair positionally with the similarity
# columns (name, then address), and the unit of `distance` is not visible
# here -- confirm both against hscap.geo.spatial_join_facilities.
joined_dh_hcris = spatial_join_facilities(
    left=dh_filtered_gdf,
    right=hcris_filtered_gdf,
    lid_property=id_columns["DH"],
    rid_property=id_columns["HCRIS"],
    lsimilarity_properties=similarity_columns["DH"],
    rsimilarity_properties=similarity_columns["HCRIS"],
    similarity_weights=[0.6, 0.4],
    distance=1000,
    merge_unmatched=False,
)

#### Save unmatched HCRIS records for manual inspection

In [51]:
# Provider numbers present in the join result. Null entries are dropped so
# that they never count as a matched provider.
# NOTE(review): null provider numbers presumably mark DH rows that found no
# HCRIS partner -- confirm against spatial_join_facilities.
matched_hcris = set(joined_dh_hcris["Provider Number"].dropna())

# Every provider number that survived the HCRIS filtering above.
total_hcris = set(hcris_gdf["Provider Number"])

# HCRIS rows whose provider number never appears in the join result.
unmatched_hcris = hcris_gdf[hcris_gdf["Provider Number"].isin(total_hcris - matched_hcris)]

In [52]:
# Write the unmatched HCRIS rows to CSV so they can be reviewed by hand.
unmatched_hcris.to_csv(path_or_buf="../data/hcris-unmatched-to-dh.csv")

#### Save merged facility data

In [53]:
# Enrich the join result with the full attribute set of both datasets:
#   1. inner merge with DH on OBJECTID,
#   2. left merge with HCRIS on 'Provider Number', so rows without an HCRIS
#      counterpart are kept.
# pandas suffixes clashing column names with _x (left) / _y (right); only the
# left-hand geometry is kept and it is renamed back to 'geometry'.
full_df = (
    joined_dh_hcris
    .merge(dh_gdf, on="OBJECTID")
    .merge(hcris_gdf, how="left", on="Provider Number")
    .drop(columns=["geometry_y"])
    .rename(columns={"geometry_x": "geometry"})
)

# Re-wrap the merged frame as a GeoDataFrame, declaring its CRS as EPSG:4326.
full_df = gpd.GeoDataFrame(full_df, crs=4326)

In [54]:
# Persist the merged facility layer as GeoJSON.
full_df.to_file(
    "../data/dh_hcris_merged_facility_data.geojson",
    driver="GeoJSON",
    encoding="utf-8",
)