<a href="https://colab.research.google.com/github/cobydodson/python2/blob/main/Python_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Physician Recommender for Carnegie Mellon University Health Insurance Users

In [None]:
'''------Setting up PCP Directory-------'''
#NPPES NPI Registry API
import requests
import pandas as pd
import json

# ZIP codes whose providers are collected from the registry
zipcodes = [
    15213,
    15261,
    15232,
    15224,
    15217,
    15219,
]

# Query template for the registry API. The three {} slots are filled, in
# order, with the taxonomy description, the postal code and the paging offset.
base_url = (
    'https://npiregistry.cms.hhs.gov/api/'
    '?number=&enumeration_type=&taxonomy_description={}'
    '&name_purpose=&first_name=&use_first_name_alias=&last_name='
    '&organization_name=&address_purpose=LOCATION'
    '&city=Pittsburgh&state=PA&postal_code={}&country_code=US'
    '&limit=200&skip={}&pretty=on&version=2.1'
)

# Accumulates one dict per provider returned by the API
all_providers = list()

def expand_address(address):
    """Normalise a street address and spell out its street-type abbreviation.

    The address is upper-cased, stripped of periods and surrounding
    whitespace, and then the first abbreviation from the table below (in
    table order) that occurs as a whole word is replaced by its full word.
    Everything after that abbreviation (suite numbers, floors, ...) is
    dropped, so "3471 Fifth Ave Ste 505" becomes "3471 FIFTH AVENUE".

    Args:
        address: The raw street address. Non-string values (for example the
            None/NaN left behind by a missing address) are returned unchanged.

    Returns:
        The normalised, upper-case address, or the input itself when it is
        not a string.
    """
    abbreviations = {
        'ST': 'STREET',
        'AVE': 'AVENUE',
        'BLVD': 'BOULEVARD',
        'RD': 'ROAD',
        'DR': 'DRIVE',
        'LN': 'LANE',
        'CT': 'COURT',
        'PL': 'PLACE',
        'SQ': 'SQUARE',
        'PKWY': 'PARKWAY'
    }

    # A missing address cannot be normalised; hand it back instead of crashing.
    if not isinstance(address, str):
        return address

    address = address.replace('.', '').strip().upper()

    # Appending one space lets a single search cover both "abbreviation in the
    # middle" and "abbreviation at the very end" as whole-word matches.
    padded = address + ' '

    for abbr, full_word in abbreviations.items():
        # Search for the whole word itself. Searching for " ST" alone would
        # also hit the start of words such as "STANTON" and cut the street
        # name out of the address.
        abbr_pos = padded.find(f" {abbr} ")
        if abbr_pos != -1:
            # Keep everything before the abbreviation and append the full word.
            return f"{address[:abbr_pos]} {full_word}"

    # No known abbreviation present: return the cleaned-up address as is.
    return address

#List of taxonomies
target_taxonomies = ["Family Medicine", "Primary Care", "Internal Medicine"]


def _provider_record(provider, taxonomy):
    """Flatten one entry of the API's 'results' list into a single row dict.

    Args:
        provider: One element of the 'results' list of an API response.
        taxonomy: The taxonomy description the provider was queried under.

    Returns:
        A dict with the provider's basic details, one address and the
        taxonomy. The address keys are absent when the provider lists no
        address at all.
    """
    basic = provider.get('basic', {})
    record = {
        "NPI Number": provider.get('number', ''),
        "First Name": basic.get('first_name', 'N/A'),
        "Last Name": basic.get('last_name', 'N/A'),
        "Credential": basic.get('credential', 'N/A'),
    }

    addresses = provider.get('addresses', [])
    if addresses:
        # Prefer the practice location. Taking whichever address happens to be
        # listed last can store a mailing address instead, which the later
        # city/state filter would then judge the provider by.
        chosen = next(
            (entry for entry in addresses
             if entry.get('address_purpose') == 'LOCATION'),
            addresses[-1],
        )
        record.update({
            "Address": chosen.get('address_1', ''),
            "City": chosen.get('city', ''),
            "State": chosen.get('state', ''),
            "Telephone Number": chosen.get('telephone_number', ''),
            # Postal codes may carry the +4 extension; keep the 5-digit part
            "Zipcode": (chosen.get('postal_code') or '')[:5],
        })

    record["Taxonomy Description"] = taxonomy
    return record


# This loops through each taxonomy
for taxonomy in target_taxonomies:
    # This loops through each zip code
    for zipcode in zipcodes:
        # The query asks for 200 results per call, so page with `skip`
        # until a short (or empty) page comes back.
        skip = 0
        limit = 200

        while True:
            # Format the URL with the current taxonomy, zipcode and offset
            url = base_url.format(taxonomy, zipcode, skip)
            try:
                # A timeout keeps one stalled request from hanging the run
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # Best effort: give up on this taxonomy/zipcode pair, but say
                # so, otherwise the directory is silently incomplete.
                print(f"Could not fetch {taxonomy} providers for {zipcode}: {e}")
                break

            results = data.get('results') or []
            all_providers.extend(
                _provider_record(provider, taxonomy) for provider in results
            )

            # A page shorter than the limit (including an empty one) is the last
            if len(results) < limit:
                break

            #Skip the already iterated providers
            skip += limit

# Build the provider table. Naming the columns explicitly guarantees they all
# exist even when no record (or no record with an address) was collected.
provider_table = pd.DataFrame(
    all_providers,
    columns=[
        "NPI Number", "First Name", "Last Name", "Credential",
        "Address", "City", "State", "Telephone Number", "Zipcode",
        "Taxonomy Description",
    ],
)

# Normalise the addresses BEFORE removing duplicates, so rows that differ only
# in how the street type was abbreviated collapse into one.
provider_table['Address'] = provider_table['Address'].apply(expand_address)

#Drop the duplicates in the data
provider_table = provider_table.drop_duplicates()

#Filter again only for providers in Pittsburgh, PA
in_pittsburgh = (provider_table['State'] == 'PA') & (provider_table['City'] == 'PITTSBURGH')
provider_table = provider_table[in_pittsburgh]

# Renumber the rows 0..n-1 after the removals above
provider_table = provider_table.reset_index(drop=True)

# Print the final DataFrame
print(provider_table)

'''-----Matching PCP directory with SHIP PCP directory -------'''
PDindex1 = pd.read_csv('PD2024.csv')

# Creating lists for DO and MD doctors, filtering out hospital centers and other miscellaneous places
do = []
md = []
rando = []

# Empty cells are skipped: a NaN is not a string and cannot be searched.
# NOTE(review): this is a plain substring test on the whole entry, so a name
# that merely contains "DO" or "MD" is classified by it too -- confirm against
# the CSV whether the degree field alone should be checked.
for provider_entry in PDindex1['Provider'].dropna().astype(str):
    if "DO" in provider_entry:
        do.append(provider_entry)
    elif "MD" in provider_entry:
        md.append(provider_entry)
    else:
        rando.append(provider_entry)

# Split each entry on commas. Entries with more than two fields are read as
# last name, first name, ...; anything shorter is set aside.
full_directory = []
rando1 = []
rando2 = []

for entries, degree, leftovers in ((do, 'DO', rando1), (md, 'MD', rando2)):
    for entry in entries:
        fields = entry.split(',')
        if len(fields) > 2:
            full_directory.append([fields[1].lstrip().upper(),
                                   fields[0].lstrip().upper(),
                                   degree])
        else:
            leftovers.append(entry)

# Creating dataframe with PPO Blue providers information and sorting it alphabetically by first name
ppoblue_providers = pd.DataFrame(full_directory, columns =['First Name', 'Last Name', 'Medical Degree'])

ppoblue_providers.sort_values(by = 'First Name', inplace =True)
ppoblue_providers = ppoblue_providers.reset_index(drop=True)

# Determining which providers from the NPI Registry table are PPO Blue providers
matched = provider_table.copy()

# Plain lists are much cheaper to scan than repeated DataFrame .loc lookups.
directory_first = ppoblue_providers['First Name'].tolist()
directory_last = ppoblue_providers['Last Name'].tolist()


def _is_in_network(first_name, last_name):
    """Return True when some directory entry contains both names.

    The comparison is a substring test (registry name inside directory name),
    so a directory first name with extra text after it still matches.
    """
    if not (isinstance(first_name, str) and isinstance(last_name, str)):
        return False
    # Index explicitly rather than pairing the lists with zip(): the name
    # `zip` is rebound to the user's ZIP code further down this file.
    return any(
        first_name in directory_first[k] and last_name in directory_last[k]
        for k in range(len(directory_first))
    )


# The flag belongs on the provider's own row. Writing it at the directory
# entry's position would mark unrelated providers.
matched['In Network?'] = [
    'Yes' if _is_in_network(row['First Name'], row['Last Name']) else 'No'
    for _, row in provider_table.iterrows()
]

# Dropping any NaN or 'N/A' values from the matched providers DataFrame
matched.dropna(inplace=True)
matched = matched[matched['First Name'] != 'N/A']
matched = matched.reset_index(drop=True)


     NPI Number First Name   Last Name Credential                Address  \
0    1194578922       KARI       AMMON       CRNP       600 GRANT STREET   
1    1528212271     KELLEY    ANDERSON         DO        3471 5TH AVENUE   
2    1477513364  CHRISTINE     ANDREWS         MD   1060 MOREWOOD AVENUE   
3    1538662325     SHAUNA  ASSADZANDI       M.D.     3600 FORBES AVENUE   
4    1497337950     ANGELA     BALARIN         MD      300 HALKET STREET   
..          ...        ...         ...        ...                    ...   
905  1144427576   ADRIANNA    WEGRECKI         MD     1515 LOCUST STREET   
906  1760610059     LAUREN     WILLARD       D.O.     1400 LOCUST STREET   
907  1578794244       JOHN       WOHAR       D.O.  607 GETTYSBURG STREET   
908  1508181710      SUMAN       YADAM         DO     490 E NORTH AVENUE   
909  1982288775      YUHAO        ZENG         MD       600 GRANT STREET   

           City State Telephone Number Zipcode Taxonomy Description  
0    PITTSBURGH  

In [None]:
'''---Setting up medical specialty dictionary----'''
# installed python libraries - pandas for pre-processing, requests and bs4 to scrape additional details for the dictionary from reliable healthcare websites
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape medical specialty table from wikipedia and convert to dataframe

url = 'https://en.wikipedia.org/wiki/Medical_specialty#:~:text=A%20medical%20specialty%20is%20a%20branch%20of%20medical%20practice%20that'
tables = pd.read_html(url)
# NOTE(review): the table is picked by position on the page; confirm index 5
# is still the specialty table if the article layout changes.
spl_doc = pd.DataFrame(tables[5])

#replace Nan values with the word skip
spl_doc.fillna('Skip',inplace=True)

# Browser-like User-Agent sent with every scrape request below
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}


def _scrape_paragraph(page_url, paragraph_index):
    """Return the text of one <p> element of a web page.

    Args:
        page_url: Address of the page to download.
        paragraph_index: Zero-based position of the wanted <p> element among
            all <p> elements of the page.

    Returns:
        The paragraph's text as a string.
    """
    page = requests.get(page_url, headers=headers, timeout=30)
    paragraphs = BeautifulSoup(page.content, 'html.parser').find_all('p')
    return str(paragraphs[paragraph_index].text)


# Fill missing Focus descriptions from healthcare websites.
# Each entry: (row label in spl_doc, page to scrape, which paragraph to take).
focus_sources = [
    (20, 'https://www.webmd.com/a-to-z-guides/what-is-internist', 1),
    (17, 'https://www.webmd.com/a-to-z-guides/what-is-a-hospitalist-doctor', 1),
    (25, 'https://www.healthline.com/find-care/articles/obgyns/what-is-an-obgyn', 0),
    (2, 'https://www.hopkinsmedicine.org/health/conditions-and-diseases/obesity/doctors-who-specialize-in-obesity', 1),
    (13, 'https://my.clevelandclinic.org/health/articles/general-surgeon', 0),
]

for row_label, page_url, paragraph_index in focus_sources:
    spl_doc.at[row_label, 'Focus'] = _scrape_paragraph(page_url, paragraph_index)

# Adding keywords manually to existing descriptions in the Focus column of dictionary

spl_doc.at[3,'Focus'] = str(spl_doc.at[3,'Focus'])+'. Cardiovascular system includes heart and blood vessels.'
spl_doc.at[23,'Focus'] = str(spl_doc.at[23,'Focus'])+". CNS includes brain and spine."
# Extend row 24's OWN description. Building it from row 23 would replace it
# with a copy of row 23's text.
spl_doc.at[24,'Focus'] = str(spl_doc.at[24,'Focus'])+". CNS includes brain and spine."


In [None]:
'''------user interface-------'''
#seek user input on ailment
print("Hello, I am the physician recommender!")
# Surrounding whitespace is removed so that "heart " still matches "heart"
ailment_input = input("Which kind of issues are you having? (\n#1 Try to be general - E.g.: skin, allergy, kidney, pain\n #2 Add only 1 ailment) :").strip()
print()

# Look for the ailment in the Focus and Sub-specialties columns of the
# dictionary and print the Specialty of every matching row
print("You should see a doctor specializing in:\n")

ailment_key = ailment_input.lower()
found = False
for index, row in spl_doc.iterrows():
    focus_text = str(row['Focus']).lower()
    sub_specialty_text = str(row['Sub-specialties']).lower()
    # An empty answer is a substring of everything, so it must not count
    if ailment_key and (ailment_key in focus_text or ailment_key in sub_specialty_text):
        print(row['Specialty'])
        found = True

# Tell the user when nothing matched instead of leaving the list blank
if not found:
    print("Sorry, I could not find a specialty matching that description.")

print('')
print('Ok, now that you know what sort of doctor you need to see, I can help find a general doctor near you, who can refer you to any further specialists.')
print('')

#seek street address from user
user_street = input("Please enter your street address (E.g. 5000 Forbes Avenue): ").lower()

def get_zipcode():
    """Ask for a ZIP code until a valid one is entered.

    A valid ZIP code is exactly five ASCII digits; whitespace around the
    answer is ignored. After every invalid answer an explanation is printed
    and the question is asked again.

    Returns:
        The 5-digit ZIP code as a string.
    """
    while True:
        user_zip = input("Now enter your 5-digit ZIP code: ").strip()
        # isdigit() alone also accepts digits from other scripts, which are
        # not valid in a US ZIP code, hence the additional isascii() check.
        if len(user_zip) == 5 and user_zip.isascii() and user_zip.isdigit():
            return user_zip
        print("Invalid ZIP code. Please enter exactly 5 digits.")

# Ask the user for their ZIP code; it is read under this name later in the
# file when the user's address is geocoded.
# NOTE(review): this rebinds the builtin name `zip` to a string, so the
# builtin zip() is unavailable in code that runs after this line.
zip = get_zipcode()

Hello, I am the physician recommender!
Which kind of issues are you having? (
#1 Try to be general - E.g.: skin, allergy, kidney, pain
 #2 Add only 1 ailment) :heart

You should see a doctor specializing in:

Cardiology
Cardiovascular surgery
Palliative care
Thoracic surgery

Ok, now that you know what sort of doctor you need to see, I can help find a general doctor near you, who can refer you to any further specialists.

Please enter your street address (E.g. 5000 Forbes Avenue): 5000 Forbes Avenue
Now enter your 5-digit ZIP code: 15213


In [None]:
'''----Code for Location Mapping----'''

# Installed geopy library from Anaconda environments
# Importing Nominatim from geopy.geocoders to connect to OpenStreetMap and use its location data
# Importing geodesic from geopy.distance to calculate distance between 2 locations with latitude & longitude coordinates
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

# Connecting to OpenStreetMap with random user_agent id
geolocator = Nominatim(user_agent="my-request21")

#Getting user input address into correct format to pull latitude and longitude coordinates
user_address = geolocator.geocode(user_street + ', Pittsburgh, PA, ' + zip + ' USA')

# The geocoder returns nothing for an address it cannot resolve; fail with a
# clear message instead of an attribute error on the next line.
if user_address is None:
    raise ValueError("Could not locate the address you entered. "
                     "Please check the street address and ZIP code.")

lat1 = str(user_address.latitude)
long1 = str(user_address.longitude)
user_address_coords = (user_address.latitude, user_address.longitude)

import os

# NOTE(security): an API key is committed in this file. Set TOMTOM_API_KEY in
# the environment to override it, and rotate/remove the literal below.
tomtom_key = os.environ.get('TOMTOM_API_KEY', 'AW7niDDGA3mTkAd99IuoLuOwBOBGrjlS')
route_base = 'https://api.tomtom.com/routing/1/calculateRoute/'

usermatched_full_name = []
usermatched_addresses = []
usermatched_driving = []
usermatched_walking = []
usermatched_contact = []
usermatched_taxonomy = []
failures = []

# Select the in-network rows themselves. Counting them and then reading rows
# 0..count-1 would instead visit the first rows of the table, whatever their
# network status.
in_network = matched[matched['In Network?'] == 'Yes']

for _, provider in in_network.iterrows():
    provider_query = (str(provider['Address']) +
                      ' Pittsburgh, Pennsylvania,' + str(provider['Zipcode']) + ' USA')

    # Everything that can fail for a single provider happens inside the try;
    # nothing is stored until all of it has succeeded, so the result lists
    # always stay the same length.
    try:
        #converting Provider address to usable address and getting latitude and longitude coordinates
        locationp = geolocator.geocode(provider_query)
        latp = str(locationp.latitude)
        longp = str(locationp.longitude)
        addressp = (locationp.latitude, locationp.longitude)

        # Only providers within 1 mile of the user's address are considered
        if round(geodesic(user_address_coords, addressp).miles, 2) > 1.00:
            continue

        # Tom Tom Routing API: one request for driving, one for walking
        route = route_base + lat1 + '%2C' + long1 + '%3A' + latp + '%2C' + longp
        response = requests.get(route + '/json?key=' + tomtom_key, timeout=30)
        response1 = requests.get(route + '/json?travelMode=pedestrian&key=' + tomtom_key,
                                 timeout=30)

        # Without both travel times the provider cannot be listed
        if response.status_code != 200 or response1.status_code != 200:
            failures.append(provider_query)
            continue

        data = json.loads(response.content.decode('utf-8'))
        data1 = json.loads(response1.content.decode('utf-8'))
        # The API reports seconds; the table shows minutes
        driving_minutes = round(data['routes'][0]['summary']['travelTimeInSeconds'] / 60, 2)
        walking_minutes = round(data1['routes'][0]['summary']['travelTimeInSeconds'] / 60, 2)
        full_name = provider['First Name'] + ' ' + provider['Last Name']

    except Exception:
        # Keep going with the other providers, but remember what failed
        failures.append(provider_query)

    else:
        usermatched_full_name.append(full_name)
        usermatched_addresses.append(provider['Address'])
        usermatched_driving.append(driving_minutes)
        usermatched_walking.append(walking_minutes)
        usermatched_contact.append(provider['Telephone Number'])
        usermatched_taxonomy.append(provider['Taxonomy Description'])

# Converting lists of information into one DataFrame for easier viewing for user
user_matches = pd.DataFrame({'Full Name':usermatched_full_name,
                             'Address':usermatched_addresses,
                             'Driving Time': usermatched_driving,
                             'Walking Time (minutes)': usermatched_walking,
                             'Telephone Number': usermatched_contact,
                             'Specialty': usermatched_taxonomy})

# Sorting user matched providers by walking time so the closest providers come first
user_matches.sort_values(by = 'Walking Time (minutes)', inplace =True)

print(user_matches)



                  First Name                  Address  Driving Time  \
1          CHRISTINE ANDREWS     1060 MOREWOOD AVENUE          4.80   
16             PANKAJ THAKUR        3471 FIFTH AVENUE          3.92   
0            KELLEY ANDERSON          3471 5TH AVENUE          3.90   
14            WILLIAM RIVERS          3601 5TH AVENUE          3.90   
10                SIMIN NASR          3471 5TH AVENUE          3.90   
13          ELIZABETH PORTER          3471 5TH AVENUE          3.90   
12             RUSSELL PIPER          3708 5TH AVENUE          3.90   
7            JOELY HEIDINGER  100 N BELLEFIELD AVENUE          4.40   
3             MICHAEL BERGAL        120 LYTTON AVENUE          5.15   
17           CHRISTINA YURKO        120 LYTTON AVENUE          5.13   
20  MARY ANNE APOSTOL CASTRO       5215 CENTRE AVENUE          6.53   
19            HAMID AKBARIAN       5215 CENTRE AVENUE          6.53   
18       ANA AGUILAR CORDOVA       5215 CENTRE AVENUE          6.53   
22    