In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import time
import pandas as pd
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import re

from selenium.webdriver.common.action_chains import ActionChains

In [5]:
# Perform the initial Search
driver = webdriver.Chrome()
link = 'https://npiregistry.cms.hhs.gov/search'
driver.get(link)

# create variable to wait for page to load
wait = WebDriverWait(driver, 10)

In [7]:
# select NPI Type: Organization
#<label _ngcontent-ng-c1663790668="" for="enumerationType">NPI Type</label>

npi_dropdown = wait.until(EC.presence_of_element_located((By.ID, "enumerationType")))
Select(npi_dropdown).select_by_visible_text("Organization")

In [8]:
# insert Taxonomy Description: hospice
# <input _ngcontent-ng-c1663790668="" type="text" id="taxonomyDescription" formcontrolname="taxonomyDescription" 
# name="taxonomyDescription" class="form-control ng-untouched ng-pristine ng-valid">

hospice_text_input = wait.until(EC.presence_of_element_located((By.ID, "taxonomyDescription")))
#hospice_text_input.clear()
hospice_text_input.send_keys("hospice")


In [9]:
# select State: COLORADO
# <label _ngcontent-ng-c1663790668="" for="state">State</label>

state_dropdown = wait.until(EC.presence_of_element_located((By.ID, "state")))
Select(state_dropdown).select_by_visible_text("COLORADO")

In [10]:
# click Search button
# <button _ngcontent-ng-c1663790668="" type="submit" name="search" class="btn btn-primary active">Search</button>

search_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@name='search']")))
search_button.click()

In [11]:
# Wait for results table to appear after clicking Search
try:
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table tbody tr")))
except TimeoutException:
    print("Timed out waiting for search results to load.")
    driver.quit()
    exit()

data = []

while True:
    time.sleep(1.5)

    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) >= 6:
            npi = cells[0].text.strip()
            name = cells[1].text.strip()
            npi_type = cells[2].get_attribute('innerText').strip()
            address = cells[3].text.strip().replace("\n", ", ")
            taxonomy = cells[5].text.strip()
            data.append([npi, name, npi_type, address, taxonomy])

    # Find MORE button
    try:
        more_button = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//button[contains(@class, "btn-outline-primary") and contains(text(), "More")]')))
        driver.execute_script("arguments[0].click();", more_button) 
    except TimeoutException:
        print("No more pages found or timed out.")
        break
    except NoSuchElementException:
        print("Reached last page.")
        break

# Save data into df
df = pd.DataFrame(data, columns=["NPI", "Name", "NPI Type", "Primary Practice Address", "Primary Taxonomy"])
driver.quit()

No more pages found or timed out.


In [None]:
df

Unnamed: 0,NPI,Name,NPI Type,Primary Practice Address,Primary Taxonomy
0,1760093470,247 HOME HEALTH CARE LTD,,"8055 E TUFTS AVE STE 250, DENVER, CO 80237-2857","Hospice Care, Community Based"
1,1003483330,"A PEACEFUL JOURNEY HOSPICE, LLC",,"2851 S PARKER RD STE 1130, AURORA, CO 80014-2732","Hospice Care, Community Based"
2,1861097982,"ABODE HEALTHCARE COLORADO, INC",,"1050 EAGLERIDGE BLVD, PUEBLO, CO 81008-2130","Hospice Care, Community Based"
3,1326459025,"ABODE HEALTHCARE COLORADO, INC",,"5465 MARK DABLING BLVD, COLORADO SPRINGS, CO 8...","Hospice Care, Community Based"
4,1447611512,"ABODE HEALTHCARE COLORADO, INC.",,"744 HORIZON CT STE 110, GRAND JUNCTION, CO 815...","Hospice Care, Community Based"
...,...,...,...,...,...
296,1194839001,WYOMING HOME HEALTH INC,,"1103 E BOXELDER RD STE JB, GILLETTE, WY 82718-...",Home Health
297,1013648583,YNA HOSPICE INC,,"3190 S VAUGHN WAY STE 550 OFF 520, AURORA, CO ...","Hospice Care, Community Based"
298,1285645382,YULIYA GOSTISHCHEVA,,"1240 S PARKER RD, DENVER, CO 80231-7558","Hospice Care, Community Based"
299,1942931415,ZA HOSPICE INC,,"102 S TEJON ST STE 1100 OFF 1111, COLORADO SPR...","Hospice Care, Community Based"


In [None]:
# 9 DIGIT ZIP (ZIP9)
# save 9 digit zip in a separate field from Primary Practice Address
# XXXXX-XXXX
# Note: 25/301 addresses only have zip code so, will need to get only zip code

df['ZIP9'] = df['Primary Practice Address'].str.extract(r'(\d{5}(?:-\d{4})?)$')
df

Unnamed: 0,NPI,Name,NPI Type,Primary Practice Address,Primary Taxonomy,ZIP9
0,1760093470,247 HOME HEALTH CARE LTD,,"8055 E TUFTS AVE STE 250, DENVER, CO 80237-2857","Hospice Care, Community Based",80237-2857
1,1003483330,"A PEACEFUL JOURNEY HOSPICE, LLC",,"2851 S PARKER RD STE 1130, AURORA, CO 80014-2732","Hospice Care, Community Based",80014-2732
2,1861097982,"ABODE HEALTHCARE COLORADO, INC",,"1050 EAGLERIDGE BLVD, PUEBLO, CO 81008-2130","Hospice Care, Community Based",81008-2130
3,1326459025,"ABODE HEALTHCARE COLORADO, INC",,"5465 MARK DABLING BLVD, COLORADO SPRINGS, CO 8...","Hospice Care, Community Based",80918-3842
4,1447611512,"ABODE HEALTHCARE COLORADO, INC.",,"744 HORIZON CT STE 110, GRAND JUNCTION, CO 815...","Hospice Care, Community Based",81506-3915
...,...,...,...,...,...,...
296,1194839001,WYOMING HOME HEALTH INC,,"1103 E BOXELDER RD STE JB, GILLETTE, WY 82718-...",Home Health,82718-5557
297,1013648583,YNA HOSPICE INC,,"3190 S VAUGHN WAY STE 550 OFF 520, AURORA, CO ...","Hospice Care, Community Based",80014
298,1285645382,YULIYA GOSTISHCHEVA,,"1240 S PARKER RD, DENVER, CO 80231-7558","Hospice Care, Community Based",80231-7558
299,1942931415,ZA HOSPICE INC,,"102 S TEJON ST STE 1100 OFF 1111, COLORADO SPR...","Hospice Care, Community Based",80903


In [12]:
from selenium.webdriver.common.action_chains import ActionChains

data = []

while True:
    time.sleep(2)  # allow page to fully render

    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")

    for i in range(len(rows)):
        rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")  # regrab after reload
        npi_link = rows[i].find_element(By.TAG_NAME, "button")
        driver.execute_script("arguments[0].scrollIntoView(true);", npi_link)
        time.sleep(0.5)
        npi_link.click()

        # wait for Status to appear
        wait.until(EC.presence_of_element_located((By.XPATH, '//td[text()="Status"]/following-sibling::td')))

        # extract fields
        try:
            status = driver.find_element(By.XPATH, '//td[text()="Status"]/following-sibling::td').text.strip()
        except:
            status = ""

        try:
            taxonomy_rows = driver.find_elements(By.XPATH, '//table[@aria-label="Taxonomy"]//tbody/tr')
            taxonomy_data = []
            for row in taxonomy_rows:
                cells = row.find_elements(By.TAG_NAME, 'td')
                if len(cells) >= 2:
                    primary = cells[0].text.strip()
                    selected = cells[1].text.strip()
                    taxonomy_data.append((primary, selected))
        except:
            taxonomy_data = []

        try:
            issuer_elements = driver.find_elements(By.XPATH, '//table[@aria-label="Other Identifiers"]//tbody/tr/td[1]')
            issuers = [el.text.strip() for el in issuer_elements if el.text.strip()]
        except:
            issuers = []

        # save current NPI
        current_npi = driver.find_element(By.XPATH, '//td[text()="NPI"]/following-sibling::td').text.strip()

        data.append({
            "NPI": current_npi,
            "Status": status,
            "Taxonomy Entries": taxonomy_data,
            "Issuers": issuers
        })

        driver.back()
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table tbody tr")))

    # click "More" to paginate
    try:
        more_button = driver.find_element(By.XPATH, '//button[contains(text(), "More")]')
        driver.execute_script("arguments[0].scrollIntoView(true);", more_button)
        time.sleep(0.5)
        more_button.click()
    except NoSuchElementException:
        break


<class 'urllib3.exceptions.MaxRetryError'>: HTTPConnectionPool(host='localhost', port=59137): Max retries exceeded with url: /session/e3b8cf9b10e63728dde7044b9d269074/elements (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000028AD53C0310>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [None]:
data

In [None]:
# STATUS
# Save Status field from NPI Details site
# only use facilities where Status = "Active"


In [None]:
# Taxonomy 1-10
# Save Taxonomy from NPI Details site
# Taxonomy > Primary Taxonomy 
# Taxonomy > Selected Taxonomy
# Primary Taxonomy (Yes/No)

In [None]:
# Issuer (Medicaid, Other)
# Other Identifiers > Issuer