In [1]:
# Run this cell!
import pandas as pd
import sys
import os.path
import requests
import concurrent
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from threading import Thread
from multiprocessing.pool import ThreadPool
import numpy as np
import json
import re
import geopandas as gpd
import folium
import dask.dataframe as dd

# Anthem Test Notebook

In [None]:
# PLEASE DON'T RUN THIS CELL (USE PYTHON SCRIPT)
# Determine carrier from commmand line
carrier = 'uhc'
if carrier == 'uhc':
    url = 'https://public.fhir.flex.optum.com/R4/Location?_count=100&address-state=CA'
    add_on = ''
elif carrier == 'anthem':
    url = 'https://cmsmanapi.anthem.com/fhir/cms_mandate/mcd/Location?_count=100&address-state=CA'
    add_on = '&address-state=CA'

# Check if file already exists
file_existence = os.path.isfile(f'ca_{carrier}_providers.csv')
if file_existence:
    sys.exit('File already exists! Delete file and run script again')

# Pull data from carrier API
session = requests.Session()
retry = Retry(connect=5, backoff_factor=2)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
request = session.get(url)
json_object = request.json()

# API Calls
providers = pd.json_normalize(json_object.get('entry'))
print('Initial Request Successful! Hang tight this will take quite some time...')
page_number = 1

while True:
    try:
        if page_number % 100 == 0:
            print(f'Completed: {page_number}')
        next_url = json_object.get('link')[1].get('url') + add_on
        next_request = session.get(next_url)
        json_object = next_request.json()
        providers = pd.concat([providers, pd.json_normalize(json_object.get('entry'))])
        providers.to_csv(f'ca_{carrier}_providers.csv', mode='a', index=False)
        page_number += 1
    except Exception as e:
        print(e)
        print(f'Total Number of Pages: {page_number}')
        print(f'DataFrame Shape: {providers.shape}')
        print('Done!')
        break
        
providers
#providers.to_csv(f'ca_{carrier}_providers.csv', index=False)

In [29]:
range(1, 11)[-1]

10

In [4]:
# Let's try some spooky shit frfr
carrier = 'anthem'
if carrier == 'uhc':
    url = 'https://public.fhir.flex.optum.com/R4/Location?_count=100&address-state=CA'
    add_on = ''
elif carrier == 'anthem':
    url = 'https://cmsmanapi.anthem.com/fhir/cms_mandate/mcd/Location?address-state=CA'
    add_on = '&address-state=CA'

# Check if file already exists
file_existence = os.path.isfile(f'ca_{carrier}_test_providers.csv')
if file_existence:
    sys.exit('File already exists! Delete file and run script again')

# Pull data from carrier API

#session = requests.Session()
#retry = Retry(connect=5, backoff_factor=2)
#adapter = HTTPAdapter(max_retries=retry)
#session.mount('http://', adapter)
#session.mount('https://', adapter)
request = requests.get(url)
json_object = request.json()
providers = pd.json_normalize(json_object.get('entry'))
base = json_object.get('link')[1].get('url')[:-30]
total_pages = 50 #int(re.findall('totalPages=(\d+)', json_object['link'][1].get('url'))[0])
total_queries = range(1, total_pages + 1)

json_list = []

# API Calls
def do_one_request(current_page):
    provider_copy = providers.copy()
    full_url = f'{url}&pageNumber={current_page}&totalPages={total_pages}'
    print(f'{requests.get(full_url)}')
    response = requests.get(full_url, headers={'Retry-After':0.1}).json()
    result = response.get('entry')
    return result
    
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    completed = executor.map(do_one_request, total_queries)
json_list.extend(completed)

print(f'{len(json_list)}')
for item in json_list:
    providers = pd.concat([providers, pd.json_normalize(item)])
    
providers.head()

<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [200]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [200]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [503]>
<Response [200]>
<Response [503]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


InvalidHeader: Value for header {Retry-After: 0.1} must be of type str or bytes, not <class 'float'>

## Data Wrangling
Everyone's favorite part of Data Science XD

In [7]:
test = requests.get('https://cmsmanapi.anthem.com/fhir/cms_mandate/mcd/Location?address-state=CA').json()
test.keys()

dict_keys(['resourceType', 'type', 'id', 'meta', 'timestamp', 'link', 'entry'])

In [12]:
test['link'][1].get('url')

'https://cmsmanapi.anthem.com/fhir/cms_mandate/mcd/Location?address-state=CA&pageNumber=2&totalPages=27156'

In [53]:
test['entry']

10

In [None]:
HealthcareService?service-category=prov&location.address-state=CA

In [2]:
# Assortment of user-defined data wrangling functions
def beautify_number(number):
    '''
    Helper function that takes a string of length 10 (standard
    length of US phone numbers) and converts it to standard
    US phone number format: (###) ###-####
    -----
    Input:
    
    number (str) - String of length 10 to convert
    -----
    Output:
    
    beautified_number (str) - Converted string
    '''
    area_code = number[:3]
    next_three = number[3:6]
    last_four = number[6:]
    beautified_number = f'({area_code}) {next_three}-{last_four}'
    return beautified_number

def extract_phone_fax(dictionary):
    '''
    Data wrangling/conversion helper function that takes a list of
    Python dictionaries (named dictionary in the str_to_dict 
    function) and creates a two-column DataFrame df with a 
    column for Phone number and Fax number (None if not provided)
    -----
    Input:
    
    dictionary (List) - Python List of dictionaries, each item in
                        dictionary will be of length 1 (if just 
                        phone is available) or length 2 (if both
                        phone and fax is available)
    -----
    Output:
    
    df (DataFrame) - Pandas DataFrame object with two columns:
                        **Phone** | **Fax**
                             x    |    x
                             x    |   None
                                 ...
    '''
    # Code is repeated for the sake of readability/interpretability!
    phone = []
    fax = []
    for i in np.arange(0, len(dictionary)):
        item = dictionary[i]
        # Only phone number available
        if len(item) == 1:
            phone_dict = item[0]
            phone_number_raw = phone_dict.get('value')
            phone_number = beautify_number(phone_number_raw) # Call helper
            phone.append(phone_number)
            fax.append(None)
        # If both phone number and fax is available
        else:
            phone_dict = item[0]
            fax_dict = item[1]
            phone_number_raw = phone_dict.get('value')
            fax_number_raw = fax_dict.get('value')
            phone_number = beautify_number(phone_number_raw) # Call helper
            fax_number = beautify_number(fax_number_raw) # Call helper
            phone.append(phone_number)
            fax.append(fax_number)
    df = pd.DataFrame(data={'Phone Number':phone, 'Fax Number': fax})
    return df

def extract_contact_info(arr):
    '''
    Data wrangling/conversion function to take str objects parsed
    from Anthem API calls and creates a two-column DataFrame df with a 
    column for Phone number and Fax number (None if not provided)
    -----
    Input:
    
    arr (NumPy Array) - NumPy Array object of same length as the rows of
                        anthem DataFrame with each item in arr corresponding
                        to a string representation of the telecom dictionary
                        pulled from Anthem's database
    -----
    Intermediaries:
    
    dictionary (List) - Python List object containing Python dictionaries

    -----
    Output:
    df (DataFrame) - Pandas DataFrame object with two columns:
                        **Phone** | **Fax**
                             x    |    x
                             x    |   None
                                 ...
    '''
    dictionary = []
    for i in np.arange(0, len(arr)):
        string = arr[i]
        
        # Convert single quotes to double quotes for json.load() &
        # Remove [] to "delist-ify" strings
        dict_str = re.sub("'", '"', string[1:-1])

        # Extract all strings that form a dictionary from original string
        extracted = re.findall('\{[\w\s,:"]+\}', dict_str) # extracted is a list!

        # Create a nested list of Python dictionaries
        dictionary.append([json.loads(string) for string in extracted])

    # Create two-column DataFrame
    df = extract_phone_fax(dictionary) # Call helper function
    
    return df

In [5]:
anthem_raw = dd.read_csv('ca_anthem_providers.csv', dtype={'_Loc_ID': int,
                                                           '_id': int,
                                                           'address.postalCode': int,
                                                           'id': int,
                                                           'position.latitude': float,
                                                           'position.longitude': float}) \
               .query('telecom != "telecom"')
anthem = anthem_raw.drop(labels=['_id', 'resourceType',
                                '_Loc_ID', 'extension',
                                'identifier', 'status',
                                'telecom', 'meta.lastUpdated', 
                                'meta.profile', 'address.use', 
                                'address.type', 'address.line',
                                'address.district', 'address.country'],
                        axis=1)
#contact_info = extract_contact_info(anthem_raw['telecom'].values)
#anthem['Phone Number'] = contact_info['Phone Number']
#anthem['Fax Number'] = contact_info['Fax Number']
#anthem.head()
anthem.head()

ValueError: invalid literal for int() with base 10: '_id'

In [None]:
anthem = anthem.rename(columns={'id':'ID', 'name':'Name',
                                'address.text':'Address', 'address.city':'City',
                                'address.state':'State', 'address.postalCode':'Zip Code',
                                'position.longitude':'Longitude', 'position.latitude':'Latitude'})
anthem = anthem[['Name', 'Address', 'Phone Number', 'Fax Number',
                 'City', 'Zip Code', 'State', 'ID',
                 'Latitude', 'Longitude']]
anthem.head()

In [None]:
geo_anthem = gpd.GeoDataFrame(anthem, geometry=gpd.points_from_xy(anthem.Longitude, 
                                                                  anthem.Latitude))
geo_anthem.head()

In [None]:
anthem_map = geo_anthem.explore(m=folium.Map(location=[37.8719, -122.2585],
                                             tiles='OpenStreetMap', 
                                             zoom_start=10))
anthem_map

In [None]:
anthem_map.save('anthem_test_map.html')

In [13]:
request_test = requests.get('https://cmsmanapi.anthem.com/fhir/cms_mandate/mcd/Location')
request_test.json()

{'resourceType': 'Bundle',
 'type': 'searchset',
 'id': '515111b4-1c44-4c16-a183-91c5230155a4',
 'meta': {'lastUpdated': '2022-11-17T00:15:07.545-0500',
  'profile': ['http://hl7.org/fhir/us/davinci-pdex-plan-net/StructureDefinition/plannet-Location']},
 'timestamp': '2022-11-17T00:15:07.545-0500',
 'link': [{'relation': 'self',
   'url': 'https://cmsmanapi.anthem.com/fhir/cms_mandate/mcd/Location?&pageNumber=1'},
  {'relation': 'next',
   'url': 'https://cmsmanapi.anthem.com/fhir/cms_mandate/mcd/Location?&pageNumber=2&totalPages=177231'},
  {'relation': 'Previous', 'url': ''}],
 'entry': [{'_id': '10001',
   'resourceType': 'Location',
   '_Loc_ID': '1',
   'id': '10001',
   'meta': {'lastUpdated': '2022-11-15T14:30:15+00:00',
    'profile': ['http://hl7.org/fhir/us/davinci-pdex-plan-net/StructureDefinition/plannet-Location']},
   'extension': [{'url': 'http://hl7.org/fhir/us/davinci-pdex-plan-net/StructureDefinition/accessibility',
     'valueCodeableConcept': {'coding': [{'system': 