In [6]:
import pandas as pd


prov = pd.read_parquet(r"C:\Users\ChristopherCato\OneDrive - clarity-dx.com\code\bph\mrf-etl\data\output\processed_uhc_providers.parquet")



In [7]:
prov.columns

Index(['provider_group_id', 'prov_npi', 'tin_type', 'tin_value',
       'reporting_entity_name', 'reporting_entity_type', 'last_updated_on',
       'version', 'enumeration_type', 'org_name', 'status',
       'primary_taxonomy_code', 'primary_taxonomy_desc', 'address_purpose',
       'address_type', 'address_1', 'address_2', 'city', 'state',
       'postal_code', 'country_code', 'telephone_number', 'fax_number',
       'error'],
      dtype='object')

In [13]:
print(f"number of unique tin_value: {prov['tin_value'].nunique()}")

number of unique tin_value: 7089


In [14]:
# Filter for NPI-2 and NPI-1 enumerations
npi2 = prov[prov['enumeration_type'] == 'NPI-2']
npi1 = prov[prov['enumeration_type'] == 'NPI-1']

# Group by TIN and get count of unique NPI-2s, NPI-1s, and their org_names
tin_summary = (
    prov.groupby('tin_value')
        .agg(
            npi2_count=('prov_npi', lambda x: x[prov.loc[x.index, 'enumeration_type'] == 'NPI-2'].nunique()),
            npi1_count=('prov_npi', lambda x: x[prov.loc[x.index, 'enumeration_type'] == 'NPI-1'].nunique()),
            org_names=('org_name', lambda x: list(pd.unique(x.dropna())))
        )
        .reset_index()
        .sort_values('npi2_count', ascending=False)
)

display(tin_summary.head(20))


Unnamed: 0,tin_value,npi2_count,npi1_count,org_names
70,42832065,22,0,"[BIO-MEDICAL APPLICATIONS OF GEORGIA, INC.]"
6978,953372911,19,0,[TOTAL RENAL CARE INC]
3585,582074947,13,0,"[PRUITTHEALTH HOSPICE, INC., PRUITTHEALTH HOSP..."
3449,581928192,9,0,"[PRUITTHEALTH HOME HEALTH - SOUTH ATLANTA, INC..."
3937,582388975,8,0,"[AMERICAN HEALTH IMAGING OF GEORGIA LLC, AMERI..."
4449,621323090,7,0,"[DVA RENAL HEALTHCARE INC, DVA RENAL HEALTHCAR..."
3474,581954432,7,2,"[NORTHSIDE HOSPITAL, INC.]"
4032,582461588,6,0,"[PRUITTHEALTH HOME HEALTH, INC.]"
6977,952977916,3,0,[DVA HEALTHCARE RENAL CARE INC]
3700,582179986,3,2,"[PIEDMONT ATHENS REGIONAL MEDICAL CENTER, INC...."


In [18]:
with pd.option_context('display.max_columns', None):
    print(prov[prov['tin_value'] == '200235522'])

   provider_group_id    prov_npi tin_type  tin_value  \
1                  1  1104897024      ein  200235522   

              reporting_entity_name reporting_entity_type last_updated_on  \
1  UnitedHealthcare of Georgia Inc.               Insurer      2025-08-01   

  version enumeration_type org_name status primary_taxonomy_code  \
1   1.0.0            NPI-1     None      A            207RG0100X   

                 primary_taxonomy_desc address_purpose address_type  \
1  Internal Medicine, Gastroenterology        LOCATION          DOM   

            address_1 address_2        city state postal_code country_code  \
1  1151 CLEVELAND AVE   SUITE D  EAST POINT    GA   303443600           US   

  telephone_number    fax_number error  
1     404-761-7949  404-761-7386  None  


In [22]:
unique_taxonomy = pd.Series(prov['primary_taxonomy_desc'].unique(), name='primary_taxonomy_desc')
unique_taxonomy.to_csv('unique_taxonomy.csv', index=False)

In [23]:
len(prov)

14111

In [24]:
# keep needed taxonomies

import re

def keep_for_surg_rad(desc: str) -> bool:
    if not desc:
        return False
    d = str(desc).upper()

    # 1) Explicit facility keepers
    facility_keep = (
        "GENERAL ACUTE CARE HOSPITAL" in d or
        "GENERAL ACUTE CARE HOSPITAL, CRITICAL ACCESS" in d or
        "AMBULATORY SURGICAL" in d or  # catches "Clinic/Center, Ambulatory Surgical"
        "AMBULATORY SURGICAL CENTER" in d
    )
    if facility_keep:
        return True

    # 2) Simple keyword pass (tighten 'rad' to 'radiolog'/'radiation' to avoid false positives)
    simple_keywords = (
        "SURG",
        "HOSPITAL",
        "AMBULATORY"          # surgery, surgical, surgeon
        "RADIOLOG",       # radiology, radiologist
        "RADIATION",      # radiation oncology (keep if you want it)
        "IMAGING",        # imaging center, diagnostic imaging
        "IMAGE",          # catches "imagin" / typos; okay if you want it very broad
        "MULTI-SPECIALTY",
        "MULTISPECIALTY"
    )
    return any(k in d for k in simple_keywords)

In [27]:
# Example usage on a DataFrame `df` with 'primary_taxonomy_dest
taxonomy_prov = prov['primary_taxonomy_desc'].fillna("").map(keep_for_surg_rad)
kept_df   = prov[taxonomy_prov].copy()
dropped_df= prov[taxonomy_prov].copy()

In [28]:
len(kept_df)

885

In [30]:
print(f"unique taxonomies: {kept_df['primary_taxonomy_desc'].nunique()}")
print(f"unique org_names: {kept_df['org_name'].nunique()}")

print(f"unique taxonomies: {kept_df['primary_taxonomy_desc'].unique()}")
print(f"unique org_names: {kept_df['org_name'].unique()}")

unique taxonomies: 50
unique org_names: 234
unique taxonomies: ['General Acute Care Hospital'
 'Clinic/Center, Magnetic Resonance Imaging (MRI)'
 'Clinic/Center, Oncology, Radiation' 'Physician Assistant, Surgical'
 'Clinic/Center, Ambulatory Surgical'
 'General Acute Care Hospital, Critical Access' 'Plastic Surgery'
 'Hospitalist' 'Clinic/Center, Multi-Specialty'
 'Dentist, Oral and Maxillofacial Surgery' 'Psychiatric Hospital'
 'Long Term Care Hospital' 'Surgery' 'Podiatrist, Foot & Ankle Surgery'
 'Dermatology, MOHS-Micrographic Surgery' 'Surgery, Pediatric Surgery'
 'Orthopaedic Surgery' 'Orthopaedic Surgery, Sports Medicine'
 'Colon & Rectal Surgery' 'Surgery, Plastic and Reconstructive Surgery'
 'Orthopaedic Surgery, Hand Surgery' 'Surgery, Surgical Oncology'
 'Obstetrics & Gynecology, Urogynecology and Reconstructive Pelvic Surgery'
 'Thoracic Surgery (Cardiothoracic Vascular Surgery)'
 'Ophthalmology, Ophthalmic Plastic and Reconstructive Surgery'
 'Neurological Surgery'
 'Orth

In [31]:
print(f"unique states: {kept_df['state'].unique()}")

unique states: ['GA']
