In [58]:
import polars as pl
import json

### Loading the file

Try running this script on this file:

In [59]:
# URL of the standard-charges JSON file this notebook is meant to be run on.
# The bare string is the cell's last expression, so it is echoed as the cell output.
'https://www.dignityhealth.org/content/dam/dignity-health/documents/pricing-procedure-spreadsheets-2020/2023-mrf-price-transparency-files/465322209_St-Josephs-Behavioral-Health-Center_standardcharges.json'

'https://www.dignityhealth.org/content/dam/dignity-health/documents/pricing-procedure-spreadsheets-2020/2023-mrf-price-transparency-files/465322209_St-Josephs-Behavioral-Health-Center_standardcharges.json'

In [60]:
# Parse the standard-charges JSON from a local copy in the working directory.
# The context manager closes the file handle as soon as parsing is done, and the
# explicit encoding avoids depending on the platform's default text encoding.
with open('465322209_St-Josephs-Behavioral-Health-Center_standardcharges.json', encoding='utf-8') as file:
    data = json.load(file)

In [61]:
# Polars needs every struct in a column to expose the same fields, so pad each
# standard-charge record with None for the optional keys before building the frame.
for entry in data['standard_charge_information']:
    for charge_record in entry['standard_charges']:
        charge_record.setdefault('gross_charge', None)
        # Only add the cash column when no existing key mentions 'cash' at all.
        has_cash_key = any('cash' in field for field in charge_record)
        if not has_cash_key:
            charge_record.setdefault('discounted_cash', None)

The step above is necessary because all structs in a column must have the same fields. We give each struct (dict) a default value of `None` for any key that isn't present.

### Flattening the dataframe piece by piece

In [62]:
# Build the initial frame from the entries under 'standard_charge_information'.
# The nested columns (standard_charges, billing_code_information) are flattened
# step by step in the cells that follow.
df = pl.DataFrame(data['standard_charge_information'])

In [63]:
# Each row carries a list of standard_charges; explode to one row per list element.
df = df.explode('standard_charges')

In [64]:
# Lift every field of the standard_charges struct into its own top-level column
# (appended in the order listed), then discard the struct column itself.
charge_fields = (
    'minimum',
    'maximum',
    'gross_charge',
    'discounted_cash',
    'setting',
    'payers_information',
    'billing_class',
)
charges = pl.col('standard_charges')
df = df.with_columns(
    [charges.struct.field(name).alias(name) for name in charge_fields]
).drop('standard_charges')

In [65]:
# Explode payers_information so that each of its elements gets its own row.
df = df.explode('payers_information')

In [66]:
# Lift the payer-level struct fields into top-level columns (appended in the
# order listed), then discard the payers_information struct column.
payer_fields = (
    'payer_name',
    'plan_name',
    'standard_charge',
    'contracting_method',
    'standard_charge_percent',
)
payer_info = pl.col('payers_information')
df = df.with_columns(
    [payer_info.struct.field(name).alias(name) for name in payer_fields]
).drop('payers_information')

In [67]:
# Take the code and type of the first billing_code_information entry, and keep
# the code of the second entry as rev_code; then drop the list column.
# NOTE(review): assumes the second entry, when present, is a revenue code - confirm.
billing_codes = pl.col('billing_code_information')
first_entry = billing_codes.arr.get(0)
second_entry = billing_codes.arr.get(1)
df = df.with_columns(
    first_entry.struct.field('code'),
    first_entry.struct.field('type'),
    second_entry.struct.field('code').alias('rev_code'),
).drop('billing_code_information')

In [68]:
# Copy `code` into a dedicated column per code type; rows of any other type get
# null in all three columns because no otherwise() branch is given.
# NOTE(review): only 'CPT' feeds hcpcs_cpt; confirm whether 'HCPCS' rows should too.
code_type_columns = {'CPT': 'hcpcs_cpt', 'MS-DRG': 'ms_drg', 'ICD': 'icd'}
df = df.with_columns([
    pl.when(pl.col('type') == code_type).then(pl.col('code')).alias(column)
    for code_type, column in code_type_columns.items()
])

In [69]:
# Preview the first rows of the flattened frame.
df.head()

description,minimum,maximum,gross_charge,discounted_cash,setting,billing_class,payer_name,plan_name,standard_charge,contracting_method,standard_charge_percent,code,type,rev_code,hcpcs_cpt,ms_drg,icd
str,f64,f64,null,null,str,str,str,str,f64,str,i64,str,str,str,str,str,str
"""FNA BX W/US GD…",866.71,866.71,,,"""outpatient""","""facility""","""Blue Shield CA…","""Commercial | A…",,"""percent of tot…",94,"""10005""","""CPT""",,"""10005""",,
"""FNA BX W/US GD…",866.71,866.71,,,"""outpatient""","""facility""","""Blue Shield CA…","""Commercial | E…",,"""percent of tot…",71,"""10005""","""CPT""",,"""10005""",,
"""FNA BX W/US GD…",866.71,866.71,,,"""outpatient""","""facility""","""DHR""","""Commercial | A…",,"""percent of tot…",28,"""10005""","""CPT""",,"""10005""",,
"""FNA BX W/US GD…",866.71,866.71,,,"""outpatient""","""facility""","""First Health""","""Commercial | A…",,"""percent of tot…",72,"""10005""","""CPT""",,"""10005""",,
"""FNA BX W/US GD…",866.71,866.71,,,"""outpatient""","""facility""","""MultiPlan""","""Commercial | A…",,"""percent of tot…",77,"""10005""","""CPT""",,"""10005""",,


### Melting the dataframe to unite the fictitious payers with the real payers

In [70]:
# Charge columns that are not tied to a real payer. They are melted into
# pseudo-payer rows, with the column name becoming the payer_name.
# Each name is listed exactly once: the list is used both as the melt value
# columns and as the list of columns to drop from the payer-level frame.
fictitious_payers = ['minimum', 'maximum', 'gross_charge', 'discounted_cash']

In [71]:
# Start the pseudo-payer frame from every column except the payer-specific ones.
payer_only_columns = (
    'payer_name',
    'plan_name',
    'standard_charge',
    'contracting_method',
    'standard_charge_percent',
)
kept_columns = [name for name in df.columns if name not in payer_only_columns]
df_other = df.select(kept_columns)

In [72]:
# Turn the min/max/gross/cash columns into rows: the former column name lands in
# payer_name and its value in standard_charge; all other columns are identifiers.
melt_id_columns = [name for name in df_other.columns if name not in fictitious_payers]
df_other = df_other.melt(
    id_vars=melt_id_columns,
    value_vars=fictitious_payers,
    variable_name='payer_name',
    value_name='standard_charge',
)

In [73]:
# The min/max/gross/cash values now live as rows in df_other, so remove the
# corresponding wide columns from the payer-level frame.
df = df.drop(fictitious_payers)

In [74]:
# Give df_other every column df has (typed nulls for the ones it lacks) and put
# them in df's column order, so the two frames can be concatenated.
missing_columns = [
    pl.lit(None).cast(df[name].dtype).alias(name)
    for name in df.columns
    if name not in df_other.columns
]
df_other = df_other.with_columns(missing_columns).select(df.columns)

In [75]:
# Stack the real-payer rows and the pseudo-payer rows into one long frame;
# df_other was given df's columns and column order beforehand.
df = pl.concat([df, df_other])

In [76]:
# Label each row by where its charge came from. The pseudo-payer names are the
# melted column names, so the comparisons must match them exactly (note the
# singular 'gross_charge'); anything else is a real payer.
# pl.lit(...) makes the labels explicit string literals instead of relying on
# how bare strings passed to then()/otherwise() are interpreted.
df = df.with_columns(
    pl
    .when(pl.col('payer_name') == 'minimum').then(pl.lit('min'))
    .when(pl.col('payer_name') == 'maximum').then(pl.lit('max'))
    .when(pl.col('payer_name') == 'gross_charge').then(pl.lit('gross'))
    .when(pl.col('payer_name') == 'discounted_cash').then(pl.lit('cash'))
    .otherwise(pl.lit('payer'))
    .alias('payer_category')
)

In [77]:
# Drop exact duplicate rows. The pseudo-payer rows were derived from a frame
# with one row per payer, so the same min/max/gross/cash row repeats per payer.
df = df.unique()

In [78]:
# Replace a standard_charge of -1 with null and leave every other value as is.
# NOTE(review): -1 is presumably a "no value" sentinel in the source file - confirm.
charge = pl.col('standard_charge')
df = df.with_columns(
    pl.when(charge == -1).then(None).otherwise(charge).alias('standard_charge')
)

In [81]:
# Keep only rows that carry some price information: a dollar amount, a
# percentage, or both.
has_amount = pl.col('standard_charge').is_not_null()
has_percent = pl.col('standard_charge_percent').is_not_null()
df = df.filter(has_amount | has_percent)

In [82]:
# Show up to 100 random rows. The sample size is capped at the frame length so
# a short frame does not raise, and the fixed seed makes the displayed sample
# reproducible on re-run.
df.sample(n=min(100, len(df)), seed=0)

description,setting,billing_class,payer_name,plan_name,standard_charge,contracting_method,standard_charge_percent,code,type,rev_code,hcpcs_cpt,ms_drg,icd,payer_category
str,str,str,str,str,f64,str,i64,str,str,str,str,str,str,str
"""ALOGLIPTIN 6.2…","""outpatient""","""facility""","""DHR""","""Commercial | A…",14.0,"""fee schedule""",,"""6381752""","""CDM""","""0637""",,,,"""payer"""
"""RL-Q-COXSAK B …","""outpatient""","""facility""","""DHR""","""Commercial | A…",1.11,"""fee schedule""",,"""86658""","""CPT""","""0302""","""86658""",,,"""payer"""
"""IA INFLUENZA R…","""outpatient""","""facility""","""Anthem""","""Commercial | E…",329.42,"""fee schedule""",,"""87502""","""CPT""","""0306""","""87502""",,,"""payer"""
"""RL-A-PM/SCL 20…","""outpatient""","""facility""","""Blue Shield CA…","""Commercial | E…",46.15,"""fee schedule""",,"""86235""","""CPT""","""0302""","""86235""",,,"""payer"""
"""KIDNEY ENDOSCO…","""outpatient""","""facility""","""Blue Shield CA…","""Commercial | E…",,"""percent of tot…",71,"""50572""","""CPT""",,"""50572""",,,"""payer"""
"""SINUS SURGERY …","""outpatient""","""facility""","""DHR""","""Commercial | A…",,"""percent of tot…",28,"""31299""","""CPT""",,"""31299""",,,"""payer"""
"""INSERTION OF C…","""outpatient""","""facility""","""First Health""","""Commercial | A…",,"""percent of tot…",72,"""36800""","""CPT""",,"""36800""",,,"""payer"""
"""INJECTION TREA…","""outpatient""","""facility""","""Blue Shield CA…","""Commercial | E…",,"""percent of tot…",71,"""66030""","""CPT""",,"""66030""",,,"""payer"""
"""SJM-CULT BODY …","""inpatient""","""facility""","""maximum""",,19.25,,,"""87070""","""CPT""","""0306""","""87070""",,,"""max"""
"""H-FOLATE SERUM…","""outpatient""","""facility""","""MultiPlan""","""Commercial | A…",149.38,"""fee schedule""",,"""82746""","""CPT""","""0301""","""82746""",,,"""payer"""


In [83]:
# Number of rows in the final long-format frame.
df.height

87027

In [None]:
# Write the final frame to a CSV file in the current working directory.
df.write_csv('dignity_test.csv')