In [190]:
import polars as pl
import json

### Loading the file

Try running this script on this file:

In [211]:
# Remote location of the standard-charges file. This cell only evaluates the
# string so it is echoed as output; nothing is downloaded here.
# NOTE(review): presumably the local copy opened in the next cell was fetched from this URL — confirm.
'https://www.dignityhealth.org/content/dam/dignity-health/documents/pricing-procedure-spreadsheets-2020/2023-mrf-price-transparency-files/465322209_St-Josephs-Behavioral-Health-Center_standardcharges.json'

'https://www.dignityhealth.org/content/dam/dignity-health/documents/pricing-procedure-spreadsheets-2020/2023-mrf-price-transparency-files/465322209_St-Josephs-Behavioral-Health-Center_standardcharges.json'

In [191]:
# Parse the standard-charges JSON from the local copy. The `with` block closes
# the file handle as soon as parsing finishes instead of leaving it open for
# the lifetime of the kernel; the encoding is pinned so the result does not
# depend on the platform's default.
with open(
    'notebooks/465322209_St-Josephs-Behavioral-Health-Center_standardcharges.json',
    encoding='utf-8',
) as file:
    data = json.load(file)

In [192]:
# Pad every standard-charge dict with the optional keys so that all of them
# share the same fields before the frame is built. 'discounted_cash' is only
# added when the dict has no key containing 'cash' at all.
for item in data['standard_charge_information']:
    for charge in item['standard_charges']:
        charge.setdefault('gross_charge', None)
        has_cash_key = any('cash' in name for name in charge)
        if not has_cash_key:
            charge.setdefault('discounted_cash', None)

The above step is necessary because all structs in a column must have the same fields. We give each struct (dict) a default value of `None` for any key that isn't given.

### Flattening the dataframe piece by piece

In [193]:
# Build the starting frame from the 'standard_charge_information' entries of
# the loaded JSON (already padded with default keys in the cell above).
df = pl.DataFrame(data['standard_charge_information'])

In [194]:
# Give every entry of the 'standard_charges' list its own row.
df = df.explode('standard_charges')

In [195]:
# Promote each field of the 'standard_charges' struct to a top-level column of
# the same name, then discard the struct column itself.
charge_fields = [
    'minimum',
    'maximum',
    'gross_charge',
    'discounted_cash',
    'setting',
    'payers_information',
    'billing_class',
]
charge_struct = pl.col('standard_charges').struct
df = df.with_columns(
    [charge_struct.field(name).alias(name) for name in charge_fields]
).drop('standard_charges')

In [196]:
# Give every entry of 'payers_information' its own row (one row per payer/plan entry).
df = df.explode('payers_information')

In [197]:
# Promote each field of the 'payers_information' struct to a top-level column
# of the same name, then discard the struct column itself.
payer_fields = [
    'payer_name',
    'plan_name',
    'standard_charge',
    'contracting_method',
    'standard_charge_percent',
]
payer_struct = pl.col('payers_information').struct
df = df.with_columns(
    [payer_struct.field(name).alias(name) for name in payer_fields]
).drop('payers_information')

In [198]:
# Pull the code and type out of the first 'billing_code_information' entry,
# and the code of the second entry as 'rev_code' (presumably the revenue code —
# confirm against the file's schema).
# NOTE(review): `.arr` is the list namespace of the older Polars API this
# notebook was run with; newer releases call it `.list` — confirm the pinned version.
first_entry = pl.col('billing_code_information').arr.get(0)
second_entry = pl.col('billing_code_information').arr.get(1)
df = df.with_columns(
    first_entry.struct.field('code'),
    first_entry.struct.field('type'),
    second_entry.struct.field('code').alias('rev_code'),
).drop(['billing_code_information'])

In [199]:
# Spread 'code' into one column per code type: each new column holds the code
# when 'type' matches and is null otherwise.
column_for_type = {
    'CPT': 'hcpcs_cpt',
    'MS-DRG': 'ms_drg',
    'ICD': 'icd',
}
df = df.with_columns([
    pl.when(pl.col('type') == code_type).then(pl.col('code')).alias(column)
    for code_type, column in column_for_type.items()
])

In [200]:
# Preview the first rows of the flattened frame.
df.head()

description,minimum,maximum,gross_charge,discounted_cash,setting,billing_class,payer_name,plan_name,standard_charge,contracting_method,standard_charge_percent,code,type,rev_code,hcpcs_cpt,ms_drg,icd
str,f64,f64,null,null,str,str,str,str,f64,str,i64,str,str,str,str,str,str
"""FNA BX W/US GD…",866.71,866.71,,,"""outpatient""","""facility""","""Blue Shield CA…","""Commercial | A…",,"""percent of tot…",94,"""10005""","""CPT""",,"""10005""",,
"""FNA BX W/US GD…",866.71,866.71,,,"""outpatient""","""facility""","""Blue Shield CA…","""Commercial | E…",,"""percent of tot…",71,"""10005""","""CPT""",,"""10005""",,
"""FNA BX W/US GD…",866.71,866.71,,,"""outpatient""","""facility""","""DHR""","""Commercial | A…",,"""percent of tot…",28,"""10005""","""CPT""",,"""10005""",,
"""FNA BX W/US GD…",866.71,866.71,,,"""outpatient""","""facility""","""First Health""","""Commercial | A…",,"""percent of tot…",72,"""10005""","""CPT""",,"""10005""",,
"""FNA BX W/US GD…",866.71,866.71,,,"""outpatient""","""facility""","""MultiPlan""","""Commercial | A…",,"""percent of tot…",77,"""10005""","""CPT""",,"""10005""",,


### Melting the dataframe to unite the fictitious payers with the real payers

In [201]:
# Columns that hold a charge but are not real payers; they are melted into
# rows further down. Each name must appear exactly once: a repeated entry
# would be passed twice to melt() and drop().
fictitious_payers = ['minimum', 'maximum', 'gross_charge', 'discounted_cash']

In [202]:
# Keep everything except the columns that only make sense for a real payer;
# this is the base for the fictitious-payer rows.
payer_only_columns = [
    'payer_name',
    'plan_name',
    'standard_charge',
    'contracting_method',
    'standard_charge_percent',
]
kept_columns = [name for name in df.columns if name not in payer_only_columns]
df_other = df.select(kept_columns)

In [203]:
# Turn each fictitious-payer column into rows: the former column name goes to
# 'payer_name' and its value to 'standard_charge'; all other columns are kept
# as identifiers.
identifier_columns = [
    name for name in df_other.columns if name not in fictitious_payers
]
df_other = df_other.melt(
    id_vars=identifier_columns,
    value_vars=fictitious_payers,
    variable_name='payer_name',
    value_name='standard_charge',
)

In [204]:
# Remove the fictitious-payer columns from the main frame; their values now
# live in df_other as melted rows.
df = df.drop(fictitious_payers)

In [205]:
# Give df_other every column df has: columns it lacks are added as all-null
# columns cast to df's dtype, then the columns are put in df's order.
absent_columns = [name for name in df.columns if name not in df_other.columns]
for name in absent_columns:
    null_column = pl.lit(None).cast(df[name].dtype).alias(name)
    df_other = df_other.with_columns(null_column)

df_other = df_other.select(df.columns)

In [206]:
# Append the melted fictitious-payer rows (df_other) to the real-payer rows (df).
df = pl.concat([df, df_other])

In [207]:
# Label each row by the kind of "payer" it represents. Melted rows carry the
# original column name in 'payer_name', so the comparisons must use those exact
# names: 'gross_charge', not 'gross_charges', otherwise gross rows fall through
# to 'payer'. pl.lit() makes every label an explicit string literal instead of
# a bare str, whose interpretation (literal vs. column name) depends on the
# Polars version.
df = df.with_columns(
    pl
    .when(pl.col('payer_name') == 'minimum').then(pl.lit('min'))
    .when(pl.col('payer_name') == 'maximum').then(pl.lit('max'))
    .when(pl.col('payer_name') == 'gross_charge').then(pl.lit('gross'))
    .when(pl.col('payer_name') == 'discounted_cash').then(pl.lit('cash'))
    .otherwise(pl.lit('payer'))
    .alias('payer_category')
)

In [208]:
# Drop fully duplicated rows. The fictitious-payer rows were melted from a
# frame that had already been exploded per payer, so they repeat.
df = df.unique()

In [210]:
# Convert the final frame to pandas; as the last expression it is also displayed.
df.to_pandas()

Unnamed: 0,description,setting,billing_class,payer_name,plan_name,standard_charge,contracting_method,standard_charge_percent,code,type,rev_code,hcpcs_cpt,ms_drg,icd,payer_category
0,FNA BX W/US GDN 1ST LES,outpatient,facility,DHR,Commercial | All Plans,,percent of total billed charge,28.0,10005,CPT,,10005,,,payer
1,FNA BX W/O IMG GDN 1ST LES,outpatient,facility,First Health,Commercial | All Plans,,percent of total billed charge,72.0,10021,CPT,,10021,,,payer
2,ACNE SURGERY,outpatient,facility,Blue Shield CA,Commercial | All Other Plans,,percent of total billed charge,94.0,10040,CPT,,10040,,,payer
3,DRAINAGE OF SKIN ABSCESS,outpatient,facility,Anthem,Medicare | All Plans,498.24,case rate,,10061,CPT,,10061,,,payer
4,DRAINAGE OF SKIN ABSCESS,outpatient,facility,Kaiser,Medicare | All Plans,498.24,case rate,,10061,CPT,,10061,,,payer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108090,EEG AWAKE AND DROWSY,inpatient,facility,discounted_cash,,,,,95816,CPT,0740,95816,,,cash
108091,ST ASSESS APHASIA PER HR,inpatient,facility,discounted_cash,,,,,96105,CPT,0444,96105,,,cash
108092,PT THERAPY MANUAL EA 15,outpatient,facility,discounted_cash,,,,,97140,CPT,0420,97140,,,cash
108093,PT DEBR SL SESSION ADD 20,outpatient,facility,discounted_cash,,,,,97598,CPT,0420,97598,,,cash
