In [None]:
# Download the HIFLD hospitals CSV export from ArcGIS Open Data and save it as hifld-hospitals.csv.
!wget https://opendata.arcgis.com/datasets/6ac5e325468c4cb9b905f1728d6fbf0f_0.csv -O hifld-hospitals.csv

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
hifld_df = pd.read_csv('hifld-hospitals.csv', dtype=str)

In [None]:
hifld_df.dtypes

In [None]:
hifld_df.head()

In [None]:
hifld_gdf = gpd.GeoDataFrame(hifld_df, crs='EPSG:4326', geometry=gpd.points_from_xy(hifld_df.X, hifld_df.Y))

In [None]:
hifld_gdf = hifld_gdf.add_prefix('hifld_')

In [None]:
hifld_gdf['geomtry'] = hifld_gdf.hifld_geometry

In [None]:
hifld_gdf = hifld_gdf.set_geometry('geomtry')

In [None]:
hifld_gdf.plot()

In [None]:
hcris_hospital_beds_gdf = gpd.read_file('../data/usa_hospital_beds_hcris2018_cleaned3.geojson')

In [None]:
hcris_hospital_beds_gdf.head()

In [None]:
len(hcris_hospital_beds_gdf)

In [None]:
len(hifld_gdf)

In [None]:
hifld_gdf = hifld_gdf.to_crs("EPSG:3857")

In [None]:
hcris_hospital_beds_gdf = hcris_hospital_beds_gdf.to_crs("EPSG:3857")

In [None]:
hcris_hospital_beds_gdf["geom_buffered"] = hcris_hospital_beds_gdf.geometry.buffer(150)

In [None]:
hcris_hospital_beds_gdf_copy = hcris_hospital_beds_gdf.copy()

In [None]:
hcris_hospital_beds_gdf_copy.insert(0, 'ID', range(0, len(hcris_hospital_beds_gdf_copy)))

In [None]:
hcris_hospital_beds_gdf_copy.head()

In [None]:
hcris_hospital_beds_gdf_copy["point_geometry"] = hcris_hospital_beds_gdf_copy.geometry

In [None]:
hcris_hospital_beds_gdf_copy["geometry"] = hcris_hospital_beds_gdf_copy["geom_buffered"]

In [None]:
hcris_hospital_beds_gdf_copy = hcris_hospital_beds_gdf_copy.set_geometry('geometry')

In [None]:
hcris_hospital_beds_gdf.plot(figsize=(15, 10))

In [None]:
# Left spatial join: every buffered HCRIS hospital is kept, and each HIFLD point that
# intersects its buffer is attached. Hospitals with several hits are repeated, one row per
# hit; hospitals with none get nulls in the HIFLD columns.
# "intersects" is sjoin's default test, so it is not passed explicitly. The keyword was named
# `op` in older geopandas and `predicate` in newer releases; omitting it works on both.
joined = gpd.sjoin(hcris_hospital_beds_gdf_copy, hifld_gdf, how="left")

In [None]:
joined.head()

In [None]:
len(joined)

In [None]:
joined.loc[0, :]

In [None]:
joined['hifld_ID'].nunique()

In [None]:
joined = joined.set_geometry("geometry")

In [None]:
joined['ID'].nunique()

In [None]:
len(joined)

In [None]:
len(hcris_hospital_beds_gdf_copy)

In [None]:
from difflib import SequenceMatcher
def similar(a, b):
    """Return the difflib similarity ratio of `a` and `b`.

    The result is a float between 0.0 (nothing in common) and 1.0 (identical
    sequences); no junk filter is applied.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [None]:
joined["name_similarity"] = joined.apply(lambda row: similar(str(row["HOSP10_Name"]), str(row["hifld_NAME"])), axis = 1) 

In [None]:
joined["address_similarity"] = joined.apply(lambda row: similar(str(row["Street_Addr"]), str(row["hifld_ADDRESS"])), axis = 1) 

In [None]:
joined.head()

In [None]:
len(joined)

#### Records with a matching HIFLD facility, i.e. no null HIFLD data (6210)

In [None]:
# Rows where the spatial join found a HIFLD point ('index_right' is populated).
joined_no_null = joined.loc[joined['index_right'].notna()]
joined_no_null.shape[0]

#### Records with no matching HIFLD facility, i.e. null HIFLD data (909). Licensed bed counts for these records will be estimated from the state-average ratio of ICU beds to licensed beds.

In [None]:
# Rows where the spatial join found no HIFLD point ('index_right' is missing).
joined_null = joined.loc[joined['index_right'].isna()]
joined_null.shape[0]

#### Among the 6210 records with HIFLD data, 893 rows are duplicates: HCRIS hospitals that appear more than once because the spatial join matched them to several HIFLD facilities

In [None]:
def getDupeRecords(df, field):
    """Return the rows of `df` whose `field` value occurs more than once.

    Every occurrence of a repeated value is returned, including the first one;
    rows keep their original order and index.
    """
    repeated = df[field].duplicated(keep=False)
    return df[repeated]

In [None]:
joined_no_null_dupe = getDupeRecords(joined_no_null, "ID")
len(joined_no_null_dupe)

In [None]:
ids_no_null = joined_no_null['ID']
joined_no_null_no_dupe = joined_no_null[~ids_no_null.isin(ids_no_null[ids_no_null.duplicated()])]
len(joined_no_null_no_dupe)

First, within each group of duplicates, keep the record(s) with the highest address similarity score

In [None]:
# Broadcast each HCRIS ID's best address similarity back onto every row of its group.
# The string 'max' is used because passing the builtin `max` to transform is deprecated in
# recent pandas; the result is the same.
address_similarity_maxes = joined_no_null_dupe.groupby('ID')['address_similarity'].transform('max')
# Keep only the rows that reach their group's best address score (ties are all kept).
joined_no_null_dedupe_address = joined_no_null_dupe[joined_no_null_dupe['address_similarity'] == address_similarity_maxes]
len(joined_no_null_dedupe_address)

Then, among the records kept above, keep the record(s) with the highest name similarity score

In [None]:
# Broadcast each HCRIS ID's best name similarity (among the address winners) onto its rows.
# The string 'max' is used because passing the builtin `max` to transform is deprecated in
# recent pandas.
name_similarity_maxes = joined_no_null_dedupe_address.groupby('ID')['name_similarity'].transform('max')
joined_no_null_dedupe_address_name = (
    joined_no_null_dedupe_address[joined_no_null_dedupe_address['name_similarity'] == name_similarity_maxes]
    # Candidates that tie exactly on both scores would still leave several rows for one
    # HCRIS ID; keep the first so every ID ends up with exactly one HIFLD match.
    .drop_duplicates(subset='ID', keep='first')
)
len(joined_no_null_dedupe_address_name)

In [None]:
joined_no_null_deduped = joined_no_null_no_dupe.append(joined_no_null_dedupe_address_name)

In [None]:
len(joined_no_null_deduped)

In [None]:
# hifld_BEDS comes from a CSV read with dtype=str, so it is text; dividing a number by it
# raises TypeError. Convert it to numbers first (unparseable values become NaN).
hifld_beds_numeric = pd.to_numeric(joined_no_null_deduped["hifld_BEDS"], errors="coerce")
# Treat non-positive bed counts as missing so they cannot produce infinite or negative
# ratios. Presumably negative values are "unknown" placeholders (e.g. -999) — confirm
# against the HIFLD data dictionary.
joined_no_null_deduped["icu_to_licensed"] = (
    joined_no_null_deduped["ICU Total Beds"] / hifld_beds_numeric.where(hifld_beds_numeric > 0)
)

In [None]:
joined_no_null_deduped.head()

In [None]:
joined_no_null_deduped['is_hifld_BEDS_estimated'] = 0

In [None]:
len(joined_no_null_deduped)

In [None]:
joined_no_null_deduped.head()

In [None]:
# Mean ICU-to-licensed-bed ratio per state, computed from the matched hospitals.
icu_to_licensed_state_avg = joined_no_null_deduped.groupby(['State'])['icu_to_licensed'].mean()
# Keep only states with a usable (finite, positive) average. A NaN, infinite, zero or
# negative ratio cannot be divided into an ICU count to give a meaningful bed estimate, and
# a non-finite estimate makes the later integer cast raise. Hospitals in the dropped states
# fall out of the subsequent merge on State and are treated as having no state average.
icu_to_licensed_state_avg = icu_to_licensed_state_avg[
    np.isfinite(icu_to_licensed_state_avg) & (icu_to_licensed_state_avg > 0)
].reset_index()

In [None]:
joined_null_with_ratio = joined_null.merge(icu_to_licensed_state_avg, on="State")

In [None]:
len(joined_null_with_ratio)

In [None]:
not_joined_with_null = joined_null[~joined_null.ID.isin(list(joined_null_with_ratio['ID']))].copy()

In [None]:
not_joined_with_null.head()

In [None]:
joined_null_with_ratio['hifld_BEDS'] = joined_null_with_ratio["ICU Total Beds"]/joined_null_with_ratio["icu_to_licensed"]

In [None]:
joined_null_with_ratio['hifld_BEDS'] = joined_null_with_ratio['hifld_BEDS'].astype(int)

In [None]:
joined_null_with_ratio['is_hifld_BEDS_estimated'] = 1

In [None]:
len(joined_null_with_ratio)

In [None]:
not_joined_with_null['is_hifld_BEDS_estimated'] = 9

In [None]:
# Stack the three groups: matched hospitals (flag 0), hospitals with estimated beds (flag 1)
# and hospitals with neither a match nor an estimate (flag 9).
# pd.concat replaces the chained DataFrame.append calls; append was removed in pandas 2.0.
result = pd.concat([joined_no_null_deduped, joined_null_with_ratio, not_joined_with_null])

In [None]:
# Order the combined rows by the HCRIS row ID.
result = result.sort_values(by=['ID'])

In [None]:
# Reproject the active geometry to EPSG:4326. At this point that is presumably still the
# buffered polygons set as 'geometry' earlier — confirm.
result = result.to_crs("EPSG:4326")

In [None]:
# Keep the reprojected geometry in 'geom_buffered'.
result["geom_buffered"] = result["geometry"]

In [None]:
# Go back to EPSG:3857 before swapping in 'point_geometry'. Presumably this is because that
# column was captured while the frame was in EPSG:3857 — confirm.
result = result.to_crs("EPSG:3857")

In [None]:
# Overwrite the 'geometry' column with the saved hospital points.
result["geometry"] = result["point_geometry"]

In [None]:
# Re-declare 'geometry' as the active geometry column after the overwrite.
result = result.set_geometry("geometry")

In [None]:
# Reproject the point geometry to EPSG:4326.
result = result.to_crs("EPSG:4326")

In [None]:
# Refresh 'point_geometry' with the reprojected points.
result["point_geometry"] = result["geometry"]

In [None]:
result.plot(figsize=(15, 10))

In [None]:
result.drop(['point_geometry', 'hifld_X', 'hifld_Y', 'hifld_geometry'], axis=1, inplace=True)

In [None]:
result.columns

In [None]:
pd.set_option('display.max_rows', 100)

In [None]:
object_cols = list(result.select_dtypes(include=['object']).columns)
for col in object_cols:
    result[col] = result[col].astype(str).str.replace('/',' ')

In [None]:
result.dtypes

In [None]:
# Write the merged HCRIS + HIFLD result as UTF-8 GeoJSON to the current working directory.
result.to_file('usa_hospital_beds_hcris2018_merge_hifld.geojson', encoding='utf-8', driver='GeoJSON')


