In [1]:
# Run this cell!
import requests
import re
import pandas as pd
import geopandas as gpd
import folium
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim

In [2]:
def get_healthcareService_refs(provider_list):
    '''
    Function that extracts the HealthcareService resource reference from a list of
    provider entries (taken from the PractitionerRole resource)
    -----
    Input:
    
    provider_list (list) - List of json objects each referring to a single provider
    -----
    Output:
    
    healthcareService_refs (list) - List of healthcareService resource references
                                    for the corresponding providers; the string
                                    'None' is used as a placeholder whenever a
                                    provider has no usable reference, so the list
                                    always lines up with provider_list
    -----
    Raises:
    
    TypeError - If provider_list is not iterable (e.g. None)
    '''
    healthcareService_refs = []
    for provider in provider_list:
        try:
            provider_service = provider.get('resource').get('healthcareService')[0]
            # A missing 'reference' is normalized to the same placeholder so that
            # string concatenation on the result never meets a real None
            healthcareService_ref = provider_service.get('reference') or 'None'
        except (AttributeError, IndexError, KeyError, TypeError):
            # Missing 'resource', missing/empty 'healthcareService' list, or a
            # malformed service entry
            healthcareService_ref = 'None'
        healthcareService_refs.append(healthcareService_ref)
    return healthcareService_refs

def get_specialties(healthcareServiceUrls):
    '''
    Function that takes a HealthcareService API call and extracts the provider taxonomy code
    and the corresponding plain English specialty
    -----
    Input:
    
    healthcareServiceUrls (list) - List of strings corresponding to HealthcareService resource API call
    -----
    Output:
    
    specialties (list) - 2D list of provider taxonomy codes and corresponding plain English
                         specialty; [None, None] is stored for any response that does not
                         carry a specialty coding
                         
            [[taxonomy_code, specialty],
             [taxonomy_code, specialty],
                   ...          ...
             [taxonomy_code, specialty]]
    '''
    specialties = []
    for url in healthcareServiceUrls:
        json_object = requests.get(url).json()
        try:
            # Only the first coding of the first specialty is used
            specialty_dict = json_object.get('specialty')[0].get('coding')[0]
            specialty = [specialty_dict.get('code'), specialty_dict.get('display')]
        except (AttributeError, IndexError, KeyError, TypeError):
            # 'specialty' or 'coding' is missing, empty, or not shaped as expected
            specialty = [None, None]
        specialties.append(specialty)
    return specialties

def get_names_addresses_and_numbers(location_urls):
    '''
    Function that extracts the provider name, street addresses, and
    phone number/contact info from Location API calls
    -----
    Input:
    
    location_urls (list) - List of strings corresponding to Location resource API calls
    -----
    Outputs:
    
    names (list) - List of provider names as strings
    
    addresses (list) - List of street addresses as strings (None when the
                       Location has no address text)
    
    numbers (list) - List of provider phone numbers as strings (None when the
                     Location has no telecom entry)
    '''
    names = []
    addresses = []
    numbers = []
    for url in location_urls:
        json_object = requests.get(url).json()
        # A Location may come back without 'address' or with a missing/empty
        # 'telecom' list; fall back to empty containers so the three output
        # lists stay the same length instead of raising mid-run
        address_object = json_object.get('address') or {}
        contact_objects = json_object.get('telecom') or [{}]
        names.append(json_object.get('name'))
        addresses.append(address_object.get('text'))
        # Only the first contact entry is used
        numbers.append(contact_objects[0].get('value'))
    return names, addresses, numbers

def clean_address(address):
    '''
    Helper function that cleans street address to allow for Nominatim 
    search engine API call
    -----
    Input:
    
    address (string) - Street address (includes whitespaces and suite, 
                       unit, floor, or room)
    -----
    Output:
    
    cleaned_address (string) - Street address with the "Fl/Ste/Unit/Rm <id> "
                               part removed and every whitespace character
                               replaced by '%20'; None if address is not a
                               string
    '''
    # Raw string: '\s' and '\w' are regex escapes, not Python string escapes.
    # The leading \b keeps the designator from matching the tail of a longer word.
    unit_pattern = r'\b(?:Fl|Ste|Unit|Rm)\s\w+\s'
    try:
        clean_w_spaces = re.sub(unit_pattern, '', address)
        return re.sub(r'\s', '%20', clean_w_spaces)
    except TypeError:
        # re.sub rejects non-string input such as None
        return None

def nominatim_lookup(address):
    '''
    Helper function that makes a call to Nominatim search engine API
    -----
    Input:
    
    address (string) - Street address
    -----
    Output:
    
    coordinate (list) - List of strings corresponding to coordinates ([latitude, longitude]);
                        [None, None] when the address is missing or no result
                        could be read from the response
    '''
    # Free-form queries go in the 'q' query parameter; the older
    # '/search/<query>' path form is deprecated by Nominatim
    nominatim_search = 'https://nominatim.openstreetmap.org/search'
    cleaned_address = clean_address(address)
    if cleaned_address is None:
        # Nothing to look up (address was missing / not a string)
        return [None, None]
    nominatim_call = f'{nominatim_search}?q={cleaned_address}&format=json'
    try:
        # Only the first (best-ranked) hit is used
        search_object = requests.get(nominatim_call).json()[0]
        return [search_object.get('lat'), search_object.get('lon')]
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Empty result list, a non-list payload, or a body that is not valid JSON
        print(f'Unable to find coordinates for {address}')
        return [None, None]

def get_coordinates(addresses):
    '''
    Looks up every street address through nominatim_lookup and collects
    the resulting coordinate pairs, in the same order as the input
    -----
    Input:
    
    addresses (list) - List of street addresses as strings
    -----
    Output:
    
    coordinates (list) - 2D list of coordinates (latitude, longitude),
                         one inner list per input address
    
            [[latitude, longitude],
             [latitude, longitude],
                ...        ...
             [latitude, longitude]]
    '''
    return [nominatim_lookup(street_address) for street_address in addresses]

def extract_network_name(providers_networks):
    '''
    Translates the raw Kaiser network identifiers attached to each provider
    into plain English names using the module-level kaiser_networks mapping
    -----
    Input:
    
    providers_networks (list) - One item per provider; each item is a list of
                                json objects describing the Kaiser networks
                                that provider belongs to
    -----
    Output:
    
    translated_networks (list) - One list of plain English network names per
                                 provider; ['Not accepting patients'] when a
                                 provider's network objects cannot be read
    '''
    translated_networks = []
    for network_object in providers_networks:
        try:
            raw_ids = []
            for network in network_object:
                # Walk valueReference -> identifier -> value one step at a time;
                # a missing level surfaces as AttributeError on the next .get
                reference = network.get('valueReference')
                identifier = reference.get('identifier')
                raw_ids.append(identifier.get('value'))
            plain_names = [kaiser_networks[raw_id] for raw_id in raw_ids]
        except AttributeError:
            translated_networks.append(['Not accepting patients'])
        else:
            translated_networks.append(plain_names)
    return translated_networks

def network_finder(network_token, df):
    '''
    Flags, for every provider in df, whether the text of its 'Networks'
    entry mentions the requested network
    -----
    Inputs:
    
    network_token (str) - String identifying the network to look for;
                          'Medicare' is searched as 'Senior Advantage' and
                          'POS' as 'Point-of-Service', anything else is
                          searched as given
    
    df (DataFrame) - Pandas DataFrame with one provider per row to search in;
                     must contain a 'Networks' column
    -----
    Output:
    
    in_network_list (list) - List of 1/0 flags specifying whether each provider
                             in df is in the network specified by network_token
    '''
    search_aliases = {'Medicare': 'Senior Advantage',
                      'POS': 'Point-of-Service'}
    token = search_aliases.get(network_token, network_token)
    # Each row is compared as text, so a list of network names is matched
    # through its string representation
    network_texts = df['Networks'].astype(str).values
    return [1 if token in network_text else 0 for network_text in network_texts]

def city_lookup(df_kaiser):
    '''
    Function that makes a Nominatim API call to look up the city
    from a latitude, longitude pair for every row of a dataframe.
    The Nominatim geolocator is created inside the function.
    -----
    Input:
    
    df_kaiser (DataFrame) - Pandas DataFrame of Kaiser providers
                            including columns 'Latitude' and 'Longitude'
    -----
    Output:
    
    cities (list) - List of cities as strings; None for rows whose
                    coordinates are unusable, that return no match, or
                    whose match has no 'city' field
    '''
    cities = []
    geolocator = Nominatim(user_agent="geoapiExercises")
    coordinate_pairs = zip(df_kaiser['Latitude'].values, df_kaiser['Longitude'].values)
    for lat, lon in coordinate_pairs:
        try:
            location = geolocator.reverse(f'{lat},{lon}')
        except ValueError:
            # Coordinates that cannot be parsed (e.g. 'None,None' for rows
            # that failed geocoding)
            location = None
        if location is None:
            # reverse() can also return None when nothing is found
            cities.append(None)
            continue
        address = location.raw.get('address') or {}
        cities.append(address.get('city'))
    return cities

def add_carriers(df):
    '''
    Assigns one insurance carrier to each listing in df by searching the
    text of its 'Networks' entry for a known carrier label. Labels are
    tried in this order and the first hit wins:
        Aetna
        Anthem
        Blue Shield
        Cigna
        Kaiser
        Oscar Health (more research needed)
        UnitedHealthcare
    -----
    Input:
    
    df (DataFrame) - Pandas DataFrame with provider listings; must
                     contain a 'Networks' column
    -----
    Output:
    
    carriers (list) - Python list with one carrier label per provider
                      listing, or None when no label is found
    '''
    carrier_labels = ('Aetna',
                      'Anthem',
                      'Blue Shield',
                      'Cigna',
                      'Kaiser',
                      'Oscar Health',
                      'UnitedHealthcare')
    carriers = []
    for provider in df['Networks'].values:
        listing_text = str(provider)
        # First label (in priority order) that appears in the listing text
        matched_label = next((label for label in carrier_labels if label in listing_text),
                             None)
        carriers.append(matched_label)
    return carriers

def add_accepting_patients_status(df):
    '''
    Add accepting patients status for each provider listing in df
    -----
    Input:
    
    df (DataFrame) - Pandas DataFrame with provider listings; must
                     contain a 'Carrier' column
    -----
    Output:
    
    active_status (list) - Python list with binary indicators
                           of active status for each provider
                           listing in df (0 when the carrier is
                           missing, 1 otherwise)
    '''
    # pd.isna treats None and NaN alike; a missing carrier is stored as NaN
    # rather than None once the column has been through pandas' own
    # missing-value handling (e.g. after a CSV round trip)
    return [0 if pd.isna(carrier) else 1 for carrier in df['Carrier'].values]

def add_id(df, start=1):
    '''
    Builds an id string for every provider listing in df by putting a
    two-letter carrier code in front of a running number ('XX' is used
    when the carrier is not one of the known labels)
    -----
    Inputs:
    
    df (DataFrame) - Pandas DataFrame with provider listings; must
                     contain a 'Carrier' column
    
    start (int) - Number given to the first listing; later listings
                  count up from it (default = 1). Pass a value above the
                  largest id already in use to avoid duplicate ids
    -----
    Output:
    
    id_numbers (list) - List of ids as strings
    '''
    carrier_codes = (('Aetna', 'AT'),
                     ('Anthem', 'AN'),
                     ('Blue Shield', 'BS'),
                     ('Cigna', 'CG'),
                     ('Kaiser', 'KP'),
                     ('Oscar Health', 'OS'),
                     ('UnitedHealthcare', 'UN'))
    id_numbers = []
    for position, provider_listing in enumerate(df['Carrier'].values):
        # Equality scan instead of a dict lookup so any cell value is accepted
        carrier_code = next((code for label, code in carrier_codes
                             if provider_listing == label),
                            'XX')
        id_numbers.append(carrier_code + str(start + position))
    return id_numbers

In [3]:
# Initialize important variables
base_url = 'https://kpx-service-bus.kp.org/service/hp/mhpo/healthplanproviderv1rc/'
practitionerRole = 'PractitionerRole?'
search_param = 'location.address-city=Berkeley'
full_query = base_url + practitionerRole + search_param
# Raw network identifier -> plain English network name
kaiser_networks = {'Exclusive_Provider_Organization_(EPO)_CN': 'Kaiser EPO Network',
                   'HMO_CN': 'Kaiser HMO Network',
                   'Medi-Cal_Managed_Care_CN': 'Kaiser Medi-Cal Network',
                   'Point-of-Service_Plan_(POS)_CN': 'Kaiser Point-of-Service Network',
                   'Senior_Advantage_CN': 'Kaiser Senior Advantage Network'}

# Make API calls
request = requests.get(full_query)
json_object = request.json()
total = json_object.get('total')
entries_per_page = len(json_object.get('entry'))
# Ceiling division by the page size the server actually returned. The old
# `total // 50 + 1` hard-coded the page size and asked for one page too many
# whenever total was an exact multiple of it.
total_pages = -(-total // entries_per_page) if entries_per_page else 1
print(f'Total Entries: {total}')
print(f'Entries per Page: {entries_per_page}')
page = 1
print(f'Working on Page {page}!')

# Page processing is shared between the first page and every following page
def _bundle_to_frame(bundle):
    '''
    Helper that turns one PractitionerRole search-result page into a DataFrame
    with one row per provider entry
    -----
    Input:
    
    bundle (dict) - Parsed json of one search-result page
    -----
    Output:
    
    page_frame (DataFrame) - Provider rows for this page, or None when the
                             page has no entries
    '''
    provider_list = bundle.get('entry')
    if not provider_list:
        return None
    healthcareService_refs = get_healthcareService_refs(provider_list)
    healthcareServiceUrls = [base_url + ref for ref in healthcareService_refs]
    location_refs = [provider.get('resource').get('location')[0].get('reference')
                     for provider in provider_list]
    location_urls = [base_url + ref for ref in location_refs]
    # The first extension is skipped; presumably it is not a network entry - TODO confirm
    network_objects = [provider.get('resource').get('extension')[1:]
                       for provider in provider_list]

    specialties = get_specialties(healthcareServiceUrls)
    names, addresses, numbers = get_names_addresses_and_numbers(location_urls)
    coordinates = get_coordinates(addresses)

    return pd.DataFrame(data={'Name': names,
                              'Address': addresses,
                              'Phone Number': numbers,
                              'Latitude': [coordinate[0] for coordinate in coordinates],
                              'Longitude': [coordinate[1] for coordinate in coordinates],
                              'Provider Taxonomy Code': [specialty[0] for specialty in specialties],
                              'Specialty': [specialty[1] for specialty in specialties],
                              'Networks': extract_network_name(network_objects)})


def _next_page_url(bundle):
    '''
    Helper that finds the url of the following search-result page
    -----
    Input:
    
    bundle (dict) - Parsed json of one search-result page
    -----
    Output:
    
    next_url (str) - Url of the next page, or None when there is none
    '''
    links = bundle.get('link') or []
    tagged_next = [link.get('url') for link in links if link.get('relation') == 'next']
    if tagged_next:
        return tagged_next[0]
    if any('relation' in link for link in links):
        # Links are labelled and none of them is 'next': this is the last page
        return None
    # Unlabelled links: keep the original positional assumption (second link)
    return links[1].get('url') if len(links) > 1 else None


# First page
page_frames = []
first_frame = _bundle_to_frame(json_object)
if first_frame is not None:
    page_frames.append(first_frame)
next_url = _next_page_url(json_object)
page += 1

# Repeat above to account for pagination
while page <= total_pages and next_url:
    next_json_object = requests.get(next_url).json()
    print(f'Working on Page {page}!')
    page_frame = _bundle_to_frame(next_json_object)
    if page_frame is not None:
        page_frames.append(page_frame)
    # Always move on to the following page, even when this one had no entries;
    # otherwise the same url would be requested again on the next iteration
    next_url = _next_page_url(next_json_object)
    page += 1

# Create dataframe (single concat instead of growing the frame inside the loop)
if page_frames:
    kaiser_providers = pd.concat(page_frames)
else:
    kaiser_providers = pd.DataFrame(columns=['Name',
                                             'Address',
                                             'Phone Number',
                                             'Latitude',
                                             'Longitude',
                                             'Provider Taxonomy Code',
                                             'Specialty',
                                             'Networks'])
    
# Fix index (rows from different pages all carry their own 0..n-1 index)
kaiser_providers = kaiser_providers.reset_index(drop=True)

# Add networks (one 0/1 indicator column per Kaiser network)
network_columns = [('Kaiser EPO Network', 'EPO'),
                   ('Kaiser HMO Network', 'HMO'),
                   ('Kaiser Medi-Cal Network', 'Medi-Cal'),
                   ('Kaiser Point-of-Service Network', 'POS'),
                   ('Kaiser Senior Advantage Network', 'Medicare')]
for column_name, network_token in network_columns:
    kaiser_providers[column_name] = network_finder(network_token, kaiser_providers)

# Add city, state and zip code
cities = city_lookup(kaiser_providers)
# Two word characters, one whitespace character, then digits at the very end
# of the address, e.g. "... CA 94710"
state_zip_pattern = re.compile(r'(\w{2})\s(\d+)$')
states = []
zip_codes = []
for address in kaiser_providers['Address'].values:
    # An address can be missing or end in something other than "<state> <zip>";
    # store None for such rows instead of aborting the whole cell
    state_zip_match = state_zip_pattern.search(address) if isinstance(address, str) else None
    states.append(state_zip_match.group(1) if state_zip_match else None)
    zip_codes.append(state_zip_match.group(2) if state_zip_match else None)
kaiser_providers['City'] = cities
kaiser_providers['State'] = states
kaiser_providers['Zip Code'] = zip_codes

# Add carrier
kaiser_providers['Carrier'] = add_carriers(kaiser_providers)

# Add patient status
kaiser_providers['Accepting Patients'] = add_accepting_patients_status(kaiser_providers)

# Add ids
id_numbers = add_id(kaiser_providers)
kaiser_providers.insert(loc=0,
                        column='ID',
                        value=id_numbers)

kaiser_providers.head()

Total Entries: 371
Entries per Page: 50
Working on Page 1!
Working on Page 2!
Working on Page 3!
Working on Page 4!
Unable to find coordinates for 4701 Stoddard Rd Modesto CA 95356
Working on Page 5!
Working on Page 6!
Working on Page 7!
Working on Page 8!


Unnamed: 0,ID,Name,Address,Phone Number,Latitude,Longitude,Provider Taxonomy Code,Specialty,Networks,Kaiser EPO Network,Kaiser HMO Network,Kaiser Medi-Cal Network,Kaiser Point-of-Service Network,Kaiser Senior Advantage Network,City,State,Zip Code,Carrier,Accepting Patients
0,KP1,"Nancy Rakela, Omd, Lac",2228 6TH St Berkeley CA 94710,(510) 540-6267,37.8642515,-122.2967957522431,171100000X,Acupuncturist,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,0,1,1,Berkeley,CA,94710,Kaiser,1
1,KP2,"Nancy Rakela, Omd, Lac",2228 6TH St Berkeley CA 94710,(510) 540-6267,37.8642515,-122.2967957522431,171100000X,Acupuncturist,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,0,1,1,Berkeley,CA,94710,Kaiser,1
2,KP3,Bay Psychiatric Associates - Berkeley,2020 Milvia St Fl 3 Berkeley CA 94704,(510) 809-1599,37.87122546511628,-122.27080786046513,2084P0804X,Child & Adolescent Psychiatry Physician,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,1,1,1,Berkeley,CA,94704,Kaiser,1
3,KP4,Bay Psychiatric Associates - Berkeley,2020 Milvia St Fl 3 Berkeley CA 94704,(510) 809-1599,37.87122546511628,-122.27080786046513,2084P0800X,Psychiatry Physician,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,0,1,1,Berkeley,CA,94704,Kaiser,1
4,KP5,Bay Psychiatric Associates - Berkeley,2020 Milvia St Fl 3 Berkeley CA 94704,(510) 809-1599,37.87122546511628,-122.27080786046513,2084P0800X,Psychiatry Physician,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,0,1,1,Berkeley,CA,94704,Kaiser,1


In [4]:
# Keep the first row for each (name, taxonomy code, specialty, carrier) combination
dedupe_keys = ['Name', 'Provider Taxonomy Code', 'Specialty', 'Carrier']
providers = (
    kaiser_providers
    .drop_duplicates(subset=dedupe_keys)
    .reset_index(drop=True)
)

# Persist the de-duplicated listings, then preview them
providers.to_csv('providers.csv')
providers.head()

Unnamed: 0,ID,Name,Address,Phone Number,Latitude,Longitude,Provider Taxonomy Code,Specialty,Networks,Kaiser EPO Network,Kaiser HMO Network,Kaiser Medi-Cal Network,Kaiser Point-of-Service Network,Kaiser Senior Advantage Network,City,State,Zip Code,Carrier,Accepting Patients
0,KP1,"Nancy Rakela, Omd, Lac",2228 6TH St Berkeley CA 94710,(510) 540-6267,37.8642515,-122.2967957522431,171100000X,Acupuncturist,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,0,1,1,Berkeley,CA,94710,Kaiser,1
1,KP3,Bay Psychiatric Associates - Berkeley,2020 Milvia St Fl 3 Berkeley CA 94704,(510) 809-1599,37.87122546511628,-122.27080786046513,2084P0804X,Child & Adolescent Psychiatry Physician,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,1,1,1,Berkeley,CA,94704,Kaiser,1
2,KP4,Bay Psychiatric Associates - Berkeley,2020 Milvia St Fl 3 Berkeley CA 94704,(510) 809-1599,37.87122546511628,-122.27080786046513,2084P0800X,Psychiatry Physician,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,0,1,1,Berkeley,CA,94704,Kaiser,1
3,KP10,Bay Psychiatric Associates - Herrick Campus,2001 Dwight Way Ste 4190 Berkeley CA 94704,(510) 204-4635,37.8630899,-122.270353,2084P0800X,Psychiatry Physician,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,1,1,1,Berkeley,CA,94704,Kaiser,1
4,XX18,Bay Psychiatric Associates - Herrick Campus,2001 Dwight Way Ste 4190 Berkeley CA 94704,(510) 204-4635,37.8630899,-122.270353,2084P0800X,Psychiatry Physician,[Not accepting patients],0,0,0,0,0,Berkeley,CA,94704,,0


In [5]:
# Build point geometries from the geocoded coordinates. The coordinates are
# longitude/latitude in degrees, so the frame is tagged as EPSG:4326 (WGS84);
# without a CRS, later reprojection or spatial operations have nothing to work from.
geo_providers = (
    gpd.GeoDataFrame(
        providers,
        geometry=gpd.points_from_xy(providers.Longitude, providers.Latitude),
        crs='EPSG:4326',
    )
    # Rows that could not be geocoded have no coordinates and cannot be mapped
    .dropna(subset=['Latitude', 'Longitude'])
    .reset_index(drop=True)
)
geo_providers.head()

Unnamed: 0,ID,Name,Address,Phone Number,Latitude,Longitude,Provider Taxonomy Code,Specialty,Networks,Kaiser EPO Network,Kaiser HMO Network,Kaiser Medi-Cal Network,Kaiser Point-of-Service Network,Kaiser Senior Advantage Network,City,State,Zip Code,Carrier,Accepting Patients,geometry
0,KP1,"Nancy Rakela, Omd, Lac",2228 6TH St Berkeley CA 94710,(510) 540-6267,37.8642515,-122.2967957522431,171100000X,Acupuncturist,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,0,1,1,Berkeley,CA,94710,Kaiser,1,POINT (-122.29680 37.86425)
1,KP3,Bay Psychiatric Associates - Berkeley,2020 Milvia St Fl 3 Berkeley CA 94704,(510) 809-1599,37.87122546511628,-122.27080786046513,2084P0804X,Child & Adolescent Psychiatry Physician,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,1,1,1,Berkeley,CA,94704,Kaiser,1,POINT (-122.27081 37.87123)
2,KP4,Bay Psychiatric Associates - Berkeley,2020 Milvia St Fl 3 Berkeley CA 94704,(510) 809-1599,37.87122546511628,-122.27080786046513,2084P0800X,Psychiatry Physician,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,0,1,1,Berkeley,CA,94704,Kaiser,1,POINT (-122.27081 37.87123)
3,KP10,Bay Psychiatric Associates - Herrick Campus,2001 Dwight Way Ste 4190 Berkeley CA 94704,(510) 204-4635,37.8630899,-122.270353,2084P0800X,Psychiatry Physician,"[Kaiser EPO Network, Kaiser HMO Network, Kaise...",1,1,1,1,1,Berkeley,CA,94704,Kaiser,1,POINT (-122.27035 37.86309)
4,XX18,Bay Psychiatric Associates - Herrick Campus,2001 Dwight Way Ste 4190 Berkeley CA 94704,(510) 204-4635,37.8630899,-122.270353,2084P0800X,Psychiatry Physician,[Not accepting patients],0,0,0,0,0,Berkeley,CA,94704,,0,POINT (-122.27035 37.86309)


In [6]:
# Map centre as (latitude, longitude); presumably the UC Berkeley campus
uc_location = (37.8719, -122.2585)
base_map = folium.Map(location=uc_location, zoom_start=14)

# Draw the provider points as markers on top of the base map
kaiser_berkeley = geo_providers.explore(m=base_map, marker_type='marker')
kaiser_berkeley

In [7]:
# Save the map built in the previous cell to an HTML file in the working directory
kaiser_berkeley.save('kaiser_test.html')