In [66]:
# Directory that hosts every Aurora standard-charges XML file.
base_url = 'https://www.aurorahealthcare.org/assets/documents/billing-insurance/pricing-transparency'

# (file stem, hospital id) pairs. The digits before the first underscore in
# each stem are later extracted as the EIN by the ingestion loop.
_facility_files = [
    ('272953799_aurora-medical-center-grafton', '520207'),
    ('390806347_aurora-lakeland-medical-center', '520102'),
    ('390806347_aurora-medical-center-burlington', '520059'),
    ('390806347_aurora-medical-center-kenosha', '520189'),
    ('390806347_aurora-medical-center-summit', '520206'),
    ('391947472_aurora-baycare-medical-center', '520193'),
    ('390930748_aurora-sheboygan-memorial-medical-center', '520035'),
    ('391027676_aurora-medical-center-oshkosh', '520198'),
    ('391150165_aurora-medical-center-washington-county', '520038'),
    ('391211629_aurora-medical-center-manitowoc-county', '520034'),
    ('391528430_aurora-medical-center-bay-area', '520113'),
]

# One record per hospital: where its standard-charges file lives and which
# hospital id the parsed rows belong to.
data = [
    {'stdchg_file_url': f'{base_url}/{stem}_standardcharges.xml', 'id': hospital_id}
    for stem, hospital_id in _facility_files
]

# Public page that links to the machine-readable files.
transparency_page = 'https://www.aurorahealthcare.org/patients-visitors/billing-payment/health-care-costs'

In [67]:
import pandas as pd
import polars as pl

In [118]:
# Source XML column name -> normalized column name.
renaming = dict(
    Type='line_type',
    Chargecode_DRG_CPT='code',
    Description='description',
    Rev='rev_code',
    CPT='hcpcs_cpt',
    NDC='ndc',
)

# Identifier columns kept fixed when the payer columns are melted to long
# format; these are exactly the renamed columns, in mapping order.
id_vars = list(renaming.values())

def setting():
    """Build the ``setting`` column expression from the ``line_type`` prefix.

    Returns:
        A polars expression aliased ``setting``:
        ``'inpatient'`` when ``line_type`` starts with ``'IP '``,
        ``'outpatient'`` when it starts with ``'OP '``, otherwise ``1``.
    """
    line_type = pl.col('line_type')
    return (
        pl
        .when(line_type.str.contains('^IP '))
        # pl.lit is required: a bare string in then()/otherwise() is parsed
        # as a column name by current polars, not as a literal value.
        .then(pl.lit('inpatient'))
        .when(line_type.str.contains('^OP '))
        .then(pl.lit('outpatient'))
        # NOTE(review): the fallback is the integer 1 while the other branches
        # are strings; presumably a sentinel expected downstream -- confirm.
        .otherwise(1)
    ).alias('setting')

def rev_code():
    """Normalize ``rev_code`` to a zero-padded, four-character string.

    pandas initially parses the revenue code as a float, so the value is
    cast to an integer first, then to a string, and left-padded with zeros
    to width 4. Nulls are replaced with an empty string.

    Returns:
        A polars expression that overwrites the ``rev_code`` column.
    """
    as_text = pl.col('rev_code').cast(int).cast(str)
    padded = as_text.str.zfill(4)
    return padded.fill_null('')

def ms_drg():
    """Build the ``ms_drg`` column expression.

    The line type is used to identify MS-DRG rows: when ``line_type``
    contains ``'DRG'`` the row's ``code`` is taken as the MS-DRG, otherwise
    the value is an empty string.

    Returns:
        A polars expression aliased ``ms_drg``.
    """
    is_drg_line = pl.col('line_type').str.contains('DRG')
    return (
        pl.when(is_drg_line)
        .then(pl.col('code'))
        # pl.lit is required: a bare "" would be parsed as a column named ""
        # by current polars instead of an empty-string literal.
        .otherwise(pl.lit(""))
    ).alias('ms_drg')

def standard_charge():
    """Convert the ``standard_charge`` strings to floats.

    Commas are stripped before the cast. The cast to float raises an error
    if any remaining value is not numeric, which surfaces unexpected data
    instead of silently dropping it.

    Returns:
        A polars expression that overwrites the ``standard_charge`` column.
    """
    raw_amount = pl.col('standard_charge')
    without_commas = raw_amount.str.replace_all(',', '')
    return without_commas.cast(float)

def payer_category():
    """Classify each melted ``payer`` column name into a payer category.

    Rules are evaluated in order; the first match wins:

    * contains ``'_1_1_23_Fee'`` -> ``'gross'`` (the fee as of 2023-01-01,
      i.e. ``last_updated``)
    * contains ``'Max'``         -> ``'max'``
    * contains ``'Min'``         -> ``'min'``
    * contains ``'Self_Pay'``    -> ``'cash'``
    * anything else              -> ``'payer'``

    Returns:
        A polars expression aliased ``payer_category``.
    """
    payer = pl.col('payer')
    # Every label is wrapped in pl.lit: current polars parses a bare string in
    # then()/otherwise() as a column name. That would be a silent bug for the
    # fallback, because a column called 'payer' exists and its contents would
    # be returned instead of the literal label 'payer'.
    return (
        pl
        .when(payer.str.contains('_1_1_23_Fee')).then(pl.lit('gross'))
        .when(payer.str.contains('Max')).then(pl.lit('max'))
        .when(payer.str.contains('Min')).then(pl.lit('min'))
        .when(payer.str.contains('Self_Pay')).then(pl.lit('cash'))
        .otherwise(pl.lit('payer'))
        ).alias('payer_category')

def hcpcs_cpt():
    """Build the ``hcpcs_cpt`` column expression.

    HCPCS codes are sometimes carried in ``code`` rather than ``hcpcs_cpt``
    for outpatient procedure lines: when ``line_type`` equals ``'OP PROC*'``
    exactly, ``code`` is used; otherwise the existing ``hcpcs_cpt`` is kept.

    Returns:
        A polars expression aliased ``hcpcs_cpt``.
    """
    is_outpatient_proc = pl.col('line_type') == 'OP PROC*'
    resolved = (
        pl.when(is_outpatient_proc)
        .then(pl.col('code'))
        .otherwise(pl.col('hcpcs_cpt'))
    )
    return resolved.alias('hcpcs_cpt')

In [119]:
# dfs: one long-format frame per hospital; qs: one SQL update per hospital.
dfs = []
qs = []

from tqdm import tqdm

for entry in tqdm(data):

    stdchg_file_url = entry['stdchg_file_url']
    # The local names used in the f-string below (`id`, `ein`, `file_name`, ...)
    # are load-bearing: `{name=}` renders as `name='value'`, so each variable
    # name doubles as the SQL column name.
    id = entry['id']

    # Download the XML and reshape it from one column per payer to one row per
    # (line item, payer).
    long_format = (
        pl.from_pandas(pd.read_xml(stdchg_file_url))
        .drop('Facility')
        .rename(renaming)
        .melt(id_vars = id_vars, variable_name = 'payer', value_name = 'standard_charge')
    )

    # Derived / normalized columns; all expressions read the melted columns.
    derived = long_format.with_columns([
        setting(),
        rev_code(),
        ms_drg(),
        standard_charge(),
        payer_category(),
        hcpcs_cpt(),
    ])

    # `df` stays bound to the most recent hospital's frame after the loop.
    df = derived.with_columns([
        # some of the hcpcs codes are lowercased
        pl.col(['hcpcs_cpt', 'ndc']).str.to_uppercase().fill_null(""),
        pl.lit(id).alias('hospital_id')
    ])

    dfs.append(df)

    # File name is the last URL segment; its leading digits (before the first
    # underscore) are formatted as an EIN: NN-NNNNNNN.
    file_name = stdchg_file_url.rsplit('/', 1)[-1]
    ein = file_name.partition('_')[0]
    ein = ein[:2] + '-' + ein[2:]
    last_updated = '2023-01-01'

    query = f"""
    update hospital set 
    {ein=}, 
    {file_name=}, 
    {last_updated=}, 
    {stdchg_file_url=}, 
    {transparency_page=} 
    where {id=}
    """
    qs.append(query)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:16<00:00,  1.52s/it]


In [121]:
# Stack every hospital's frame before exporting. Without this, `df` is only
# the last frame built by the ingestion loop, so a fresh Restart & Run All
# would write a single hospital's rows to the CSV.
df = pl.concat(dfs)
df.write_csv('aurora.csv')

In [128]:
def correction_hcpcs():
    """Blank out ``hcpcs_cpt`` values that contain ``'RFEE'``.

    These values are not valid HCPCS/CPT codes and were rejected on import:

    cause: Check constraint "hcpcs_cpt_fmt" violated
    A bad row was encountered: [520113,CHARGE,RFEE 9,0300,,10006652,,,,RFEE9,,<nil>,<nil>,<nil>,,<nil>,<nil>,<nil>,1,1,gross,_1_1_23_Fee,,18.0,<nil>,<nil>,<nil>]: Check constraint "hcpcs_cpt_fmt" violated

    Returns:
        A polars expression aliased ``hcpcs_cpt`` that is an empty string
        where the original value contains ``'RFEE'`` and the original value
        everywhere else.
    """
    current = pl.col('hcpcs_cpt')
    return (
        pl.when(current.str.contains('RFEE'))
        # pl.lit is required: a bare '' would be parsed as a column named ''
        # by current polars instead of an empty-string literal.
        .then(pl.lit(''))
        .otherwise(current)
        .alias('hcpcs_cpt')
    )

In [130]:
# Overwrite hcpcs_cpt with the corrected expression (RFEE values blanked).
df = df.with_columns([correction_hcpcs()])

In [131]:
# Re-export after the hcpcs_cpt correction. Do not rebuild `df` with
# pl.concat(dfs) at this point: `dfs` still holds the uncorrected frames, so
# doing so would discard the correction.
df.write_csv('aurora.csv')

In [117]:
# Write the hospital metadata updates, one ';'-terminated statement each.
with open('aurora.sql', 'w+') as sql_file:
    sql_file.writelines(f'{statement};\n' for statement in qs)

# dolt sql < aurora.sql