In [4]:
import requests
import pandas as pd
import numpy as np
import time
import re

from fuzzywuzzy import fuzz



In [5]:
# API base URL and parameters
api_url = "https://npiregistry.cms.hhs.gov/api/"
params = {
    "version": "2.1",
    "enumeration_type": "NPI-2",
    "state": "CO",
    "taxonomy_description": "hospice",
    "limit": 200,  # max allowed per call
    "skip": 0
}

all_results = []

while True:
    response = requests.get(api_url, params=params)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        break
    
    data = response.json()
    results = data.get("results", [])
    
    if not results:
        break
    
    for entry in results:
        npi = entry.get("number", "")
        basic = entry.get("basic", {})
        name = basic.get("organization_name", "")
        status = basic.get("status", "")
        
        addresses = entry.get("addresses", [])
        location_address = next((a for a in addresses if a.get("address_purpose") == "LOCATION"), {})
        
        # Build formatted address string
        def format_address(addr):
            return f"{addr.get('address_1', '')} {addr.get('address_2', '')}, {addr.get('city', '')}, {addr.get('state', '')} {addr.get('postal_code', '')}".strip()

        location_full = format_address(location_address)
        location_state = location_address.get("state", "")

        # Taxonomy and identifiers
        taxonomies = entry.get("taxonomies", [])
        taxonomy_entries = [(t.get("desc", ""), t.get("primary")) for t in taxonomies]

        identifiers = entry.get("identifiers", [])
        issuers = [iden.get("issuer", "") for iden in identifiers if iden.get("issuer")]

        all_results.append({
            "NPI": npi,
            "Name": name,
            "Status": status,
            "Location State": location_state,
            "Primary Practice Address": location_full,
            "Taxonomy Entries": taxonomy_entries,
            "Issuers": issuers
        })

    params["skip"] += 200
    if len(results) < 200:
        break

# Convert to DataFrame
df = pd.DataFrame(all_results)


In [6]:
df

Unnamed: 0,NPI,Name,Status,Location State,Primary Practice Address,Taxonomy Entries,Issuers
0,1760093470,247 HOME HEALTH CARE LTD,A,CO,"8055 E TUFTS AVE STE 250 , DENVER, CO 802372857","[(Hospice Care, Community Based, True)]",[]
1,1740072065,A BETTER COLORADO HOSPICE LLC,A,CO,"126 W D ST STE 200 , PUEBLO, CO 810034430","[(Hospice Care, Community Based, True)]",[]
2,1003483330,"A PEACEFUL JOURNEY HOSPICE, LLC",A,CO,"2851 S PARKER RD STE 1130 , AURORA, CO 800142732","[(Hospice Care, Community Based, True)]",[]
3,1861097982,"ABODE HEALTHCARE COLORADO, INC",A,CO,"1050 EAGLERIDGE BLVD , PUEBLO, CO 810082130","[(Hospice Care, Community Based, True)]",[]
4,1326459025,"ABODE HEALTHCARE COLORADO, INC",A,CO,"5465 MARK DABLING BLVD , COLORADO SPRINGS, CO ...","[(Hospice Care, Community Based, True)]",[]
...,...,...,...,...,...,...,...
297,1194839001,WYOMING HOME HEALTH INC,A,WY,"1103 E BOXELDER RD STE JB , GILLETTE, WY 82718...","[(Case Management, False), (Day Training, Deve...",[]
298,1013648583,YNA HOSPICE INC,A,CO,"3190 S VAUGHN WAY STE 550 OFF 520 , AURORA, CO...","[(Hospice Care, Community Based, True)]",[]
299,1285645382,YULIYA GOSTISHCHEVA,A,CO,"1240 S PARKER RD #106, DENVER, CO 802317558","[(Hospice Care, Community Based, True)]",[]
300,1942931415,ZA HOSPICE INC,A,CO,"102 S TEJON ST STE 1100 OFF 1111 , COLORADO SP...","[(Hospice Care, Community Based, True)]",[]


In [4]:
# ZIP9 (may have zip9 or zip5) #***make sure that it's taking Primary Practice Address and NOT mailing address
# check that CO is in either the Primary Practice Address or Secondary Practice Address
df["ZIP9"] = df["Primary Practice Address"].str.extract(r'(\d{5}(?:\d{4})?)$')
df["ZIP9"]

0      802372857
1      810034430
2      800142732
3      810082130
4      809183842
         ...    
297    827185557
298        80014
299    802317558
300        80903
301    800142735
Name: ZIP9, Length: 302, dtype: object

In [8]:
# 5 digit zip (ZIP5)
df['ZIP5'] = df['Primary Practice Address'].str.extract(r'(\d{5})(?:-\d{4}|\d{4})?$')

df['ZIP5']

0      80237
1      81003
2      80014
3      81008
4      80918
       ...  
297    82718
298    80014
299    80231
300    80903
301    80014
Name: ZIP5, Length: 302, dtype: object

In [6]:
#Making sure it's getting the correct zip for addresses that have building #
df[df.Name.str.contains('AMARIS')]

Unnamed: 0,NPI,Name,Status,Location State,Primary Practice Address,Taxonomy Entries,Issuers,ZIP9,ZIP5
22,1699447896,AMARIS HOSPICE CARE,A,CO,"11059 E BETHANY DR STE 105B , AURORA, CO 80014...","[(Hospice, Inpatient, False), (Hospice Care, C...",[],800142617,80014


In [10]:
# Hospice facilities status - all Active
df["Status"].value_counts()

Status
A    302
Name: count, dtype: int64

In [11]:
# Are there any duplicate NPIs? - NO
df[df["NPI"].duplicated(keep=False)]

Unnamed: 0,NPI,Name,Status,Location State,Primary Practice Address,Taxonomy Entries,Issuers,ZIP9,ZIP5


In [12]:
# Are there any duplicate Names? YES 117 are duplicate names
#df[df["Name"].duplicated(keep=False)]

In [13]:
def contains_hospice(entry_list):
    for entry in entry_list:
        for field in entry:
            if isinstance(field, str) and "hospice" in field.lower():
                return True
    return False

# Apply the function
has_hospice = df["Taxonomy Entries"].apply(contains_hospice)

# Summary
print("Total NPIs:", len(df))
print("NPIs with at least one 'Hospice' taxonomy:", has_hospice.sum())

if has_hospice.all():
    print("All NPIs have at least one taxonomy with 'Hospice'.")
else:
    print("Some NPIs are missing a 'Hospice' taxonomy.")
    display(df[~has_hospice][["NPI", "Name", "Taxonomy Entries"]])


Total NPIs: 302
NPIs with at least one 'Hospice' taxonomy: 302
All NPIs have at least one taxonomy with 'Hospice'.


In [14]:
# Are there any duplicate Names AND Address? YES - 39, REMOVE
#df[df.duplicated(subset=["Name", "Primary Practice Address"], keep=False)]

In [15]:
# removing duplicates by name and Primary Care Address
df = df.drop_duplicates(subset=["Name", "Primary Practice Address"], keep="first")

In [16]:
df #280 total

Unnamed: 0,NPI,Name,Status,Location State,Primary Practice Address,Taxonomy Entries,Issuers,ZIP9,ZIP5
0,1760093470,247 HOME HEALTH CARE LTD,A,CO,"8055 E TUFTS AVE STE 250 , DENVER, CO 802372857","[(Hospice Care, Community Based, True)]",[],802372857,80237
1,1740072065,A BETTER COLORADO HOSPICE LLC,A,CO,"126 W D ST STE 200 , PUEBLO, CO 810034430","[(Hospice Care, Community Based, True)]",[],810034430,81003
2,1003483330,"A PEACEFUL JOURNEY HOSPICE, LLC",A,CO,"2851 S PARKER RD STE 1130 , AURORA, CO 800142732","[(Hospice Care, Community Based, True)]",[],800142732,80014
3,1861097982,"ABODE HEALTHCARE COLORADO, INC",A,CO,"1050 EAGLERIDGE BLVD , PUEBLO, CO 810082130","[(Hospice Care, Community Based, True)]",[],810082130,81008
4,1326459025,"ABODE HEALTHCARE COLORADO, INC",A,CO,"5465 MARK DABLING BLVD , COLORADO SPRINGS, CO ...","[(Hospice Care, Community Based, True)]",[],809183842,80918
...,...,...,...,...,...,...,...,...,...
296,1346699725,WYOMING HOME HEALTH INC,A,WY,"1103 E BOXELDER RD STE JB , GILLETTE, WY 82718...","[(Case Management, False), (Home Health, False...",[],827185557,82718
298,1013648583,YNA HOSPICE INC,A,CO,"3190 S VAUGHN WAY STE 550 OFF 520 , AURORA, CO...","[(Hospice Care, Community Based, True)]",[],80014,80014
299,1285645382,YULIYA GOSTISHCHEVA,A,CO,"1240 S PARKER RD #106, DENVER, CO 802317558","[(Hospice Care, Community Based, True)]",[],802317558,80231
300,1942931415,ZA HOSPICE INC,A,CO,"102 S TEJON ST STE 1100 OFF 1111 , COLORADO SP...","[(Hospice Care, Community Based, True)]",[],80903,80903


In [17]:
df["Issuers"].value_counts()

Issuers
[]                                                                 267
[NPI, LICENSE]                                                       1
[State ID]                                                           1
[State of Colorado Department of Public Health and Environment]      1
[License Number]                                                     1
[Nebraska State Licensure]                                           1
[State Hospice Licensure]                                            1
[CO State License Number]                                            1
[Colorado license]                                                   1
[License]                                                            1
[Provider Number]                                                    1
[MEDICARE]                                                           1
[BCBS, HMO of CO, Medicare B]                                        1
[Colorado State License]                                             

In [18]:
# how many practices are not in CO? --10
df["Location State"].value_counts()

Location State
CO    270
WY      3
MT      2
NE      1
DE      1
NJ      1
TX      1
NM      1
Name: count, dtype: int64

In [19]:
# remove any non-CO primary care addresses
df = df[df["Location State"] == "CO"]

In [20]:
df.info() # 270 NPIs after removing dupe NPI+Name and Non-CO addresses

<class 'pandas.core.frame.DataFrame'>
Index: 270 entries, 0 to 301
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   NPI                       270 non-null    object
 1   Name                      270 non-null    object
 2   Status                    270 non-null    object
 3   Location State            270 non-null    object
 4   Primary Practice Address  270 non-null    object
 5   Taxonomy Entries          270 non-null    object
 6   Issuers                   270 non-null    object
 7   ZIP9                      270 non-null    object
 8   ZIP5                      270 non-null    object
dtypes: object(9)
memory usage: 21.1+ KB


In [21]:
# fuzzy matching 

# Load the CMS facility data
#df_cms = pd.read_csv("C:/Users/angel/OneDrive/Desktop/DU/COMP 4447 Data Science Tools I/Final Project/CMS_hospice_organizations_CO.csv")

df_cms = pd.read_csv("CMS_hospice_organizations_CO.csv")

In [22]:
# change df_cms to str to match df
df_cms['ZIP5'] = df_cms['ZIP Code'].astype(str).str.zfill(5)
df_cms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   CMS Certification Number (CCN)  84 non-null     int64  
 1   Facility Name                   84 non-null     object 
 2   Address Line 1                  84 non-null     object 
 3   Address Line 2                  0 non-null      float64
 4   City/Town                       84 non-null     object 
 5   State                           84 non-null     object 
 6   ZIP Code                        84 non-null     int64  
 7   County/Parish                   70 non-null     object 
 8   Telephone Number                84 non-null     object 
 9   CMS Region                      84 non-null     int64  
 10  Measure Code                    84 non-null     object 
 11  Measure Name                    84 non-null     object 
 12  Score                           84 non

In [23]:
# Normalize facility names by removing suffixes and symbols
def normalize_name(name):
    name = str(name).lower()
    name = re.sub(r',.*$', '', name)  # Remove everything after comma
    name = re.sub(r'\b(inc|llc|corp|corporation|co|ltd|pllc|incorporated)\b', '', name)
    name = re.sub(r'\W+', '', name)   # Remove non-alphanumeric
    return name.strip()

# Apply to both dataframes
df['clean_name'] = df['Name'].apply(normalize_name)
df_cms['clean_name'] = df_cms['Facility Name'].apply(normalize_name)

print(df['clean_name'].head())
print(df_cms['clean_name'].head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_name'] = df['Name'].apply(normalize_name)


0          247homehealthcare
1     abettercoloradohospice
2    apeacefuljourneyhospice
3    abodehealthcarecolorado
4    abodehealthcarecolorado
Name: clean_name, dtype: object
0    pikespeakhospiceandpalliativecare
1            trucommunitycarelafayette
2      bannerhospiceofnortherncolorado
3                     denverhospicethe
4          lamarareahospiceassociation
Name: clean_name, dtype: object


In [24]:
def fuzzy_match(row, df2, name_threshold=90, zip_threshold=100):
    best_match_index = -1
    highest_score = 0

    for index, row2 in df2.iterrows():
      name_score = fuzz.token_set_ratio(row['clean_name'], row2['clean_name'])
      zip_score = fuzz.token_set_ratio(row['ZIP5'], row2['ZIP5'])
      
      # Combine scores (you can adjust the weights as needed)
      combined_score = (name_score + zip_score) / 2

      if name_score >= name_threshold and zip_score >= zip_threshold and combined_score > highest_score:
          highest_score = combined_score
          best_match_index = index

    if best_match_index != -1:
        return df2.loc[best_match_index, 'clean_name']
    else:
        return None

# Apply the fuzzy matching function to each row in df1
df['Matched_ID2'] = df.apply(lambda row: fuzzy_match(row, df_cms), axis=1)

print(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Matched_ID2'] = df.apply(lambda row: fuzzy_match(row, df_cms), axis=1)


            NPI                                   Name Status Location State  \
0    1760093470               247 HOME HEALTH CARE LTD      A             CO   
1    1740072065          A BETTER COLORADO HOSPICE LLC      A             CO   
2    1003483330        A PEACEFUL JOURNEY HOSPICE, LLC      A             CO   
3    1861097982         ABODE HEALTHCARE COLORADO, INC      A             CO   
4    1326459025         ABODE HEALTHCARE COLORADO, INC      A             CO   
..          ...                                    ...    ...            ...   
294  1518171834               WOMENS HEALTH CENTER INC      A             CO   
298  1013648583                        YNA HOSPICE INC      A             CO   
299  1285645382                    YULIYA GOSTISHCHEVA      A             CO   
300  1942931415                         ZA HOSPICE INC      A             CO   
301  1477372571  ZION SELAH HOME HEALTHCARE AGENCY LLC      A             CO   

                              Primary P

In [25]:
df_fields = df.loc[:,['Name','ZIP5','clean_name','Matched_ID2']]
df_fields.to_csv("matched_facilities_cms.csv", index=False)

In [26]:
# Load the file
with open("CO County Alzheimers Rates.txt", "r") as file:
    lines = file.readlines()

data = []

for line in lines:
    # Match the line by finding:
    # 1. County name(s): words and spaces
    # 2. Total population: numbers, commas
    # 3. AD cases: can include "<" and spaces
    # 4. Prevalence: ends with %
    match = re.match(r"^([A-Za-z\s]+)\s+([\d,]+)\s+([<\d,\s]+)\s+([\d.]+%)", line)
    if match:
        county = match.group(1).strip()
        total_pop = match.group(2).strip()
        ad_cases = match.group(3).strip()
        prevalence = match.group(4).strip()
        data.append([county, total_pop, ad_cases, prevalence])
    else:
        print("Line skipped (no match):", line.strip())

# Create DataFrame
df_alz = pd.DataFrame(data, columns=[
    "County",
    "Total Pop. Age 65+",
    "AD Cases Age 65+",
    "AD Prevalence (Age 65+)"
])

# Show the first few rows
df_alz

Unnamed: 0,County,Total Pop. Age 65+,AD Cases Age 65+,AD Prevalence (Age 65+)
0,Adams,57400,6100,10.7%
1,Alamosa,2400,300,11.5%
2,Arapahoe,92000,9800,10.6%
3,Archuleta,3900,300,8.8%
4,Baca,900,100,12.5%
...,...,...,...,...
59,Summit,4500,300,7.4%
60,Teller,6200,500,7.7%
61,Washington,1000,100,10.8%
62,Weld,42500,4300,10.1%


In [27]:
#df_alz.to_csv("CO_county_Alzheimers_Rates.csv", index=False)

In [28]:
df.to_pickle('df_zip.pkl')
df_alz.to_pickle('df_alzheimers.pkl')

In [None]:
df.to_pickle('df.pkl')