In [2]:
from pathlib import Path
from pprint import pprint
import re
from tqdm import tqdm
import pandas as pd
import traceback
import time
import json
import pickle
import numpy as np

# NOTE(review): YEAR is not referenced in any of the cells visible here; presumably the
# reporting year used elsewhere in the notebook. Confirm before changing.
YEAR = 2021

In [3]:
# Point at the data directory that sits one level above the working directory,
# then print the entries it contains.
dataFolder = Path('..', 'data')
print([entry for entry in dataFolder.iterdir()])

[PosixPath('../data/.DS_Store'), PosixPath('../data/current-healthcare-facility-listing.csv')]


In [4]:
# Read the healthcare facility listing CSV into a DataFrame and render it.
# NOTE(review): this is the same directory as `dataFolder`; the path is repeated here.
facilitiesData = pd.read_csv(Path('..', 'data', 'current-healthcare-facility-listing.csv'))
display(facilitiesData)

Unnamed: 0,OSHPD_ID,FACILITY_NAME,LICENSE_NUM,FACILITY_LEVEL_DESC,DBA_ADDRESS1,DBA_CITY,DBA_ZIP_CODE,COUNTY_CODE,COUNTY_NAME,ER_SERVICE_LEVEL_DESC,TOTAL_NUMBER_BEDS,FACILITY_STATUS_DESC,FACILITY_STATUS_DATE,LICENSE_TYPE_DESC,LICENSE_CATEGORY_DESC,LATITUDE,LONGITUDE
0,106010735,ALAMEDA HOSPITAL,140000002,Parent Facility,2070 Clinton Ave,Alameda,94501.0,1,Alameda,Emergency - Basic,101,Open,1946-01-01,Hospital,General Acute Care Hospital,37.762660,-122.253991
1,106010739,ALTA BATES SUMMIT MEDICAL CENTER-ALTA BATES CA...,140000004,Parent Facility,2450 Ashby Ave,Berkeley,94705.0,1,Alameda,Emergency - Basic,339,Open,1946-01-02,Hospital,General Acute Care Hospital,37.856450,-122.257430
2,106010776,UCSF BENIOFF CHILDREN'S HOSPITAL OAKLAND,140000015,Parent Facility,747 52ND ST,OAKLAND,94609.0,1,Alameda,Emergency - Basic,206,Open,1946-01-01,Hospital,General Acute Care Hospital,37.837220,-122.267470
3,106010811,FAIRMONT HOSPITAL,140000046,Consolidated Facility,15400 Foothill Blvd,San Leandro,94578.0,1,Alameda,,109,Open,1953-07-01,Hospital,General Acute Care Hospital,37.706480,-122.118190
4,106010844,ALTA BATES SUMMIT MEDICAL CENTER-HERRICK CAMPUS,140000004,Consolidated Facility,2001 DWIGHT WAY,BERKELEY,94704.0,1,Alameda,,68,Open,1946-01-01,Hospital,General Acute Care Hospital,37.863730,-122.269840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9170,406565115,PROVIDER LINK HOSPICE AND PALLIATIVE CARE INC,550007596,Parent Facility,400 Rosewood Ave,Camarillo,93010.0,56,Ventura,Not Applicable,,Open,2021-12-20,Home Health Agency/Hospice,Hospice,34.220050,-119.061110
9171,406572249,WOODLAND HEALTHCARE HOME HEALTH,100000184,Parent Facility,1207 Fairchild CT,Woodland,95695.0,57,Yolo,Not Applicable,,Open,2010-05-01,Home Health Agency/Hospice,Home Health Agency,38.666182,-121.792356
9172,406574018,YOLO HOSPICE,100000556,Parent Facility,1909 Galileo Ct,Davis,95618.0,57,Yolo,Not Applicable,,Open,2005-07-03,Home Health Agency/Hospice,Hospice,38.542130,-121.728570
9173,406584013,"FREEDOM HOME HEALTH AND HOSPICE CARE SERVICES,...",550002840,Parent Facility,519 D St,Marysville,95901.0,58,Yuba,Not Applicable,,Open,2019-03-29,Home Health Agency/Hospice,Hospice,39.141090,-121.589040


In [5]:
# Print the entries of the current working directory.
print([entry for entry in Path('.').iterdir()])

[PosixPath('GenerateMetadata.ipynb'), PosixPath('processing'), PosixPath('Untitled.ipynb'), PosixPath('metadata005.pickle'), PosixPath('utils.py'), PosixPath('facilityIds.pickle'), PosixPath('.ipynb_checkpoints')]


In [10]:
def savePickle(data, filepath):
    """Serialize `data` with pickle and write it to `filepath`.

    The file is opened in binary write mode, so any existing content is replaced.
    """
    with open(filepath, 'wb') as outFile:
        pickle.dump(data, outFile)

def readPickle(filepath):
    """Load and return the object stored in the pickle file at `filepath`.

    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(filepath, 'rb') as inFile:
        return pickle.load(inFile)
    
# Build a one-column frame of OSHPD ids from the metadata records that carry an
# 'oshpdId' key, drop missing values, and cast the ids to 64-bit integers.
facilityIds = (
    pd.DataFrame(
        [record["oshpdId"] for record in readPickle('metadata005.pickle').values() if 'oshpdId' in record],
        columns=['OSHPD_ID'],
    )
    .dropna()
    .astype({'OSHPD_ID': np.int64})
)
facilityIds, facilityIds.dtypes

(      OSHPD_ID
 0    106301098
 1    106410817
 2    106010735
 3    106190017
 4    106010937
 ..         ...
 315  106444013
 316  106301379
 317  106190883
 318  106571086
 319  106380939
 
 [320 rows x 1 columns],
 OSHPD_ID    int64
 dtype: object)

In [11]:
# Inner-join the facility listing with the ids collected from the metadata, so only
# rows whose OSHPD_ID appears in both frames remain.
inDataset = facilitiesData.merge(facilityIds, on='OSHPD_ID', how='inner')
# Disabled county filter, kept for reference:
# inSanFrancisco = inDataset[inDataset['COUNTY_NAME'].str.match('San Francisco') \
#                            | inDataset['COUNTY_NAME'].str.match('San Mateo') \
#                            | inDataset['COUNTY_NAME'].str.match('Santa Clara') \
#                            | inDataset['COUNTY_NAME'].str.match('Marin')]
# inSanFrancisco, len(inSanFrancisco), inSanFrancisco.dtypes
len(inDataset)

320

In [12]:
# Show the county name of every row in the merged dataset.
inDataset.loc[:, 'COUNTY_NAME']

0      Alameda
1      Alameda
2      Alameda
3      Alameda
4      Alameda
        ...   
315    Ventura
316    Ventura
317    Ventura
318       Yolo
319       Yolo
Name: COUNTY_NAME, Length: 320, dtype: object

In [13]:
# Show the column labels of the facility listing.
facilitiesData.columns

Index(['OSHPD_ID', 'FACILITY_NAME', 'LICENSE_NUM', 'FACILITY_LEVEL_DESC',
       'DBA_ADDRESS1', 'DBA_CITY', 'DBA_ZIP_CODE', 'COUNTY_CODE',
       'COUNTY_NAME', 'ER_SERVICE_LEVEL_DESC', 'TOTAL_NUMBER_BEDS',
       'FACILITY_STATUS_DESC', 'FACILITY_STATUS_DATE', 'LICENSE_TYPE_DESC',
       'LICENSE_CATEGORY_DESC', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [14]:
# Print every id from facilityIds that does not occur in the facility listing.
allIds = set(facilitiesData['OSHPD_ID'])
for oshpdId in facilityIds['OSHPD_ID']:
    if oshpdId in allIds:
        continue
    print(oshpdId)

In [15]:
# Print how many distinct OSHPD ids were collected (the column is int64, so no NaN to consider).
print(facilityIds['OSHPD_ID'].nunique())

318


### Manual change notes
ID renaming applied to the Excel file names:
- California Pacific Medical Center - Pacific Campus 106380929 => 106380933
- Coalinga State Hospital (typo)
- Kaiser Foundation Northern Region -> 106014326 assigned to Oakland division
- Kaiser Foundation Southern Region -> 106190429 assigned to LA division
- Modoc Medical Center -> 106254005 (typo)
- Napa State Hospital -> 106281266 (typo)
- Palomar Health -> 106374382
