In [1]:
import polars as pl

In [254]:
url = '74-2730328_methodist-ambulatory-surgery-hospital_standardcharges.csv'

df = pl.read_csv(url, skip_rows = 1, infer_schema_length = 0)

In [274]:
def top_part(df):
    df = df.with_row_count()
    
    for row in df.iter_rows(named = True):
        if row['HCPCS/CPT Code'] == 'Coding':
            break_pt = row['row_nr'] - 1
            break
    
    df = (df
          .rename({
              'Procedure ID':'local_code',
              'HCPCS/CPT Code':'hcpcs_cpt',
              'Description':'description',
          })
          .drop('row_nr'))
            
    return df[:break_pt]

In [282]:
top_df = (
    top_part(df).with_columns(
        pl.col('hcpcs_cpt')
        .str.strip()
        .str.strip('0')
    )
    .with_columns(
        pl.when(pl.col('hcpcs_cpt').str.lengths() == 0)
        .then(None)
        .otherwise(pl.col('hcpcs_cpt'))
        .alias('hcpcs_cpt')
    )
    .melt(
        ['local_code', 'hcpcs_cpt', 'description'], 
        value_name = 'standard_charge', 
        variable_name = 'payer_name'
    )
    .with_columns(
        pl.when(pl.col('payer_name').str.contains('Gross'))
        .then('gross')
        .when(pl.col('payer_name').str.contains('Cash'))
        .then('cash')
        .alias('payer_category')
    )
)

In [283]:
top_df.sample(10)

local_code,hcpcs_cpt,description,payer_name,standard_charge,payer_category
str,str,str,str,str,str
"""707905""",,"""CATH BARD A 20…","""Discounted Cas…","""87.00""","""gross"""
"""676210""",,"""PLATE TIV 750 …","""Discounted Cas…","""11913.00""","""gross"""
"""641403""","""C1877""","""STENT VISIPRO …","""Gross Charge""","""5636.00""","""gross"""
"""300792""",,"""VARENICLINE TA…","""Discounted Cas…","""23.40""","""gross"""
"""641786""",,"""CAGE SPN BANAN…","""Discounted Cas…","""13976.00""","""gross"""
"""676883""",,"""SHEL ACET 44 …","""Discounted Cas…","""7195.00""","""gross"""
"""634539""","""C1768""","""GRAFT VECTRA 5…","""Discounted Cas…","""4741.00""","""gross"""
"""630110""","""C1772""","""PUMP #8627L18 …","""Discounted Cas…","""37492.00""","""gross"""
"""295954""",,"""LIDO 0.5% BUFF…","""Gross Charge""","""55.06""","""gross"""
"""630087""",,"""CLIP ANEURYSM …","""Discounted Cas…","""1081.00""","""gross"""


In [285]:
def payer_dfs(df):
    df = df.with_row_count()
    
    slice_ids = []
    for row in df.iter_rows(named = True):
        if row['HCPCS/CPT Code'] == 'Coding':
            slice_ids.append(row['row_nr'] - 1)
            
    slices = [(s, t-1) for s, t in zip(slice_ids, slice_ids[1:])]
    
    rename_dict = {
        'Procedure ID':'description',
        'HCPCS/CPT Code':'code',
        'Description':'standard_charge_',
    }
    
    payer_dfs = []
    for slice_ in slices:
        
        payer_df = (
            df.slice(slice_[0], slice_[1] - slice_[0] + 1)
            .rename(rename_dict)
        )
        
        payer_name = payer_df['description'][0]
        
        payer_df = (
            payer_df
            .slice(2,)
            .drop(['Gross Charge', 'Discounted Cash Price (Gross Charges)', 'row_nr'])
            .with_columns([
                pl.lit(payer_name).alias('payer_name'),
                pl.lit('payer').alias('payer_category'),
            ])
        )
        
        payer_dfs.append(payer_df)
        
    return payer_dfs

In [286]:
payer_dfs = df.pipe(payer_dfs)

In [288]:
def charges():
    
    standard_charge = pl.col('standard_charge_')
    percent_bc = standard_charge.str.contains('% of BC')
    percent_mcr = standard_charge.str.contains('% of MCR')
    percent_fs = standard_charge.str.contains('% of FS')
    percent_mcd = standard_charge.str.contains('% of MCR')
    
    contracting_method = (
        pl.when(percent_bc).then('percent of total billed charges')
        .when(percent_mcr).then('other')
        .when(percent_fs).then('other')
        .when(percent_mcd).then('other')
        .otherwise(None)
    ).alias('contracting_method')
    
    standard_charge_percent = (
        pl.when(percent_bc).then(standard_charge.str.extract('(\d+\.?\d+)\%'))
        .when(percent_mcr).then(standard_charge.str.extract('(\d+\.?\d+)\%'))
        .when(percent_fs).then(standard_charge.str.extract('(\d+\.?\d+)\%'))
        .when(percent_mcd).then(standard_charge.str.extract('(\d+\.?\d+)\%'))
        .otherwise(None)
    ).alias('standard_charge_percent')
    
    additional_generic_notes = (
        pl.when(percent_mcr).then('billed as % of MCR')
        .when(percent_fs).then('billed as % of FS')
        .when(percent_mcd).then('billed as % of MCD')
        .otherwise(None)
    ).alias('additional_generic_notes')
    
    standard_charge = (
        pl.when(percent_bc).then(None)
        .when(percent_mcr).then(None)
        .when(percent_fs).then(None)
        .when(percent_mcd).then(None)
        .otherwise(standard_charge.str.replace('\$', '').str.replace_all(',', ''))
    ).alias('standard_charge')
    
    return standard_charge, standard_charge_percent, contracting_method, additional_generic_notes

In [289]:
new_payer_dfs = []
for pdf in payer_dfs:
    pdf = pdf.with_columns([
        *charges()
    ])
    new_payer_dfs.append(pdf)

In [290]:
pdfs = pl.concat(new_payer_dfs)

In [291]:
begin = '(^|\\b)'
end = '($|\\b)'

cpt_pat1 = '[A-Z]\d{4}' #A1234
cpt_pat2 = '\d{4}[A-Z]' #1234A
cpt_pat3 = '\d{5}'      #12345

cpt_pats = '|'.join([f'{begin}{pat}{end}' for pat in [cpt_pat1, cpt_pat2, cpt_pat3]])

pdfs = pdfs.with_columns(
    pl.col('code').str.split(';').alias('code')
).explode('code').with_columns([
    pl.when(pl.col('code').str.contains('DRG')).then(pl.col('code')).otherwise(None).alias('ms_drg'),
    pl.when(pl.col('code').str.contains(f'CPT/HCPC|{cpt_pats}')).then(pl.col('code')).otherwise(None).alias('hcpcs_cpt'),
    pl.when(pl.col('code').str.contains('ICD')).then(pl.col('code')).otherwise(None).alias('icd'),
])

In [292]:
pdfs

description,code,standard_charge_,payer_name,payer_category,standard_charge,standard_charge_percent,contracting_method,additional_generic_notes,ms_drg,hcpcs_cpt,icd
str,str,str,str,str,str,str,str,str,str,str,str
"""Other Inpatien…",,"""75% of BC""","""Accountable He…","""payer""",,"""75""","""percent of tot…",,,,
"""Other Outpatie…",,"""75% of BC""","""Accountable He…","""payer""",,"""75""","""percent of tot…",,,,
"""Angioplasty""","""CPT/HCPC 92928…","""$17,586.40""","""Aetna""","""payer""","""17586.40""",,,,,"""CPT/HCPC 92928…",
"""Angioplasty""","""CPT/HCPC 35450…","""$17,473.60""","""Aetna""","""payer""","""17473.60""",,,,,"""CPT/HCPC 35450…",
"""Behavioral Hea…","""CPT/HCPC H0015…","""$278.00""","""Aetna""","""payer""","""278.00""",,,,,"""CPT/HCPC H0015…",
"""Behavioral Hea…","""CPT/HCPC H0035…","""$551.80""","""Aetna""","""payer""","""551.80""",,,,,"""CPT/HCPC H0035…",
"""Cardiac Cath""","""CPT/HCPC G0448…","""$9,108.80""","""Aetna""","""payer""","""9108.80""",,,,,"""CPT/HCPC G0448…",
"""Cardiac Cath""",""" 93563-93568""","""$9,108.80""","""Aetna""","""payer""","""9108.80""",,,,,""" 93563-93568""",
"""Cardiology""","""MS-DRG 222-225…","""$40,048.80""","""Aetna""","""payer""","""40048.80""",,,,"""MS-DRG 222-225…",,
"""Cardiology""","""MS-DRG 226, 22…","""$37,370.20""","""Aetna""","""payer""","""37370.20""",,,,"""MS-DRG 226, 22…",,


In [293]:
pdfs = (
    pdfs
    
    # Yes, we could write a function to do this, but it's fine.
    # Sometimes it's okay to repeat yourself.
    # Handle HCPCS case
    .with_columns([
        pl.col('hcpcs_cpt').str.replace('CPT/HCPC', '').str.split(','),
    ])
    .explode(['hcpcs_cpt'])
    .with_columns([
        pl.col('hcpcs_cpt').str.strip().str.split('-').alias('hcpcs_cpt_range_')
    ])
    .with_columns([
        pl.col('hcpcs_cpt_range_').arr.get(0).alias('hcpcs_cpt'),
        pl.col('hcpcs_cpt_range_').arr.get(1).alias('hcpcs_thru')
    ])
    
    # MS-DRG case
    .with_columns([
        pl.col('ms_drg').str.replace('MS-DRG', '').str.split(','),
    ])
    .explode(['ms_drg'])
    .with_columns([
        pl.col('ms_drg').str.strip().str.split('-').alias('ms_drg_range_')
    ])
    .with_columns([
        pl.col('ms_drg_range_').arr.get(0).alias('ms_drg'),
        pl.col('ms_drg_range_').arr.get(1).alias('ms_drg_thru')
    ])
    
    # ICD case
    .with_columns([
        pl.col('icd').str.replace('ICD 9/10', '').str.split(','),
    ])
    .explode(['icd'])
    .with_columns([
        pl.col('icd').str.strip().str.split('-').alias('icd_range_')
    ])
    .with_columns([
        pl.col('icd_range_').arr.get(0).alias('icd'),
        pl.col('icd_range_').arr.get(1).alias('icd_thru')
    ])
    
    # Create one combined column with all the "thru"s
    .with_columns(
        pl.col('hcpcs_thru').fill_null(pl.col('icd_thru')).fill_null(pl.col('ms_drg_thru')).alias('thru')
    )
    
    # Drop the unneeded columns
    .drop(['hcpcs_cpt_range_', 'hcpcs_thru', 'ms_drg_range_', 'ms_drg_thru', 'icd_range_', 'icd_thru'])

)

In [294]:
pdfs

description,code,standard_charge_,payer_name,payer_category,standard_charge,standard_charge_percent,contracting_method,additional_generic_notes,ms_drg,hcpcs_cpt,icd,thru
str,str,str,str,str,str,str,str,str,str,str,str,str
"""Other Inpatien…",,"""75% of BC""","""Accountable He…","""payer""",,"""75""","""percent of tot…",,,,,
"""Other Outpatie…",,"""75% of BC""","""Accountable He…","""payer""",,"""75""","""percent of tot…",,,,,
"""Angioplasty""","""CPT/HCPC 92928…","""$17,586.40""","""Aetna""","""payer""","""17586.40""",,,,,"""92928""",,
"""Angioplasty""","""CPT/HCPC 92928…","""$17,586.40""","""Aetna""","""payer""","""17586.40""",,,,,"""92929""",,
"""Angioplasty""","""CPT/HCPC 92928…","""$17,586.40""","""Aetna""","""payer""","""17586.40""",,,,,"""92933""",,
"""Angioplasty""","""CPT/HCPC 92928…","""$17,586.40""","""Aetna""","""payer""","""17586.40""",,,,,"""92934""",,
"""Angioplasty""","""CPT/HCPC 92928…","""$17,586.40""","""Aetna""","""payer""","""17586.40""",,,,,"""92937""",,
"""Angioplasty""","""CPT/HCPC 92928…","""$17,586.40""","""Aetna""","""payer""","""17586.40""",,,,,"""92938""",,
"""Angioplasty""","""CPT/HCPC 92928…","""$17,586.40""","""Aetna""","""payer""","""17586.40""",,,,,"""92941""",,
"""Angioplasty""","""CPT/HCPC 92928…","""$17,586.40""","""Aetna""","""payer""","""17586.40""",,,,,"""92943""",,


In [None]:
# def move_modifiers(df):
#     df = df.with_columns([
#         pl.col('hcpcs_cpt').str.slice(offset = 0, length = 5).keep_name(),
#         pl.col('hcpcs_cpt').str.slice(offset = 5, length = None).alias('modifiers'),
#     ])
#     return df

# def split_modifier_string(s):
#     chunks = [s[i:i+2] for i in range(0, len(s), 2)]
#     return "|".join(chunks)

# def split_modifier(df):
#     df = df.with_columns(
#         pl.col('modifiers').apply(split_modifier_string)
#     )
    
#     return df