In [1]:
import polars as pl

In [93]:
# Location of the hospital's standard-charges CSV, split across adjacent
# string literals purely for line length.
url = (
    'https://core.secure.ehc.com/'
    'src/util/detail-price-list/'
    '050631189_hca-houston-pearland_standardcharges.csv'
)
# Skip the first line of the file; infer_schema_length=0 turns off dtype
# inference, so every column is read as a string.
df = pl.read_csv(url, skip_rows=1, infer_schema_length=0)

In [94]:
def get_upper_chunk(df):
    """
    Get just the upper part of the dataframe
    with the HCPCS/CPT codes.

    The upper part is every row that precedes the first row whose
    'Description' value is empty (null or ''). If no such row exists the
    whole frame is returned instead of raising UnboundLocalError.

    Parameters
    ----------
    df : dataframe with a 'Description' column.

    Returns
    -------
    A dataframe with the same columns as ``df`` holding only the leading rows.
    """

    df = df.with_row_count()

    # Default to "everything" so a file without a terminating empty row
    # still produces a result.
    end_row = df.height

    for row in df.iter_rows(named=True):
        if not row['Description']:
            end_row = row['row_nr']
            break

    return df.drop('row_nr').slice(0, end_row)

In [95]:
def get_lower_chunks(df):
    """
    Return the lower section(s) as a list of dataframe chunks.

    Rows are scanned by position (after a row counter is prepended, so
    index 0 is the row number, index 1 the first data column and index 3
    the third data column):

    * a row whose third data column is 'Rate' is a section header; the
      section starts on the row just above it,
    * a row whose data columns are all empty closes the open section,
    * a row whose first data column is 'Service' closes the open section
      and starts the final one on that row,
    * the section still open after the last row runs to the end of the frame.

    A boundary met while no section is open (for example a blank row before
    the first 'Rate' header, or a blank last row) is ignored; previously it
    raised UnboundLocalError or TypeError. An empty frame yields an empty list.

    Returns
    -------
    list of dataframes, one per section, without the helper row counter.
    """

    df = df.with_row_count()

    slices = []
    start = None        # first row of the section currently open, if any
    last_row_nr = None  # row number of the last row seen

    for row in df.iter_rows():
        row_nr = row[0]
        last_row_nr = row_nr

        if row[3] == 'Rate':
            # the row above the 'Rate' header belongs to the section too
            start = row_nr - 1

        if not any(row[1:]):
            # blank separator row: close the open section (end is exclusive)
            if start is not None:
                slices.append((start, row_nr))
            start = None

        if row[1] == 'Service':
            # header of the final section: close what is open, start here
            if start is not None:
                slices.append((start, row_nr))
            start = row_nr

    if start is not None and last_row_nr is not None:
        slices.append((start, last_row_nr + 1))

    return [df.slice(s, e - s).drop('row_nr') for s, e in slices]

In [96]:
# Split the raw frame: the leading item-level price list, and the list of
# lower sections that follow it.
upper_chunk = get_upper_chunk(df)
lower_chunks = get_lower_chunks(df)

In [97]:
upper_chunk.sample(5)

Procedure ID,HCPCS/CPT Code,Description,Gross Charge,Discounted Cash Price (Gross Charges)
str,str,str,str,str
"""995324""",""" …","""WICK EAR POPE …","""94.00""","""94.00"""
"""101837""","""0C1876 …","""STENT VASC 6X1…","""8495.00""","""8495.00"""
"""901027""","""0J0330 …","""SUCCINYLCHOL U…","""29.43""","""29.43"""
"""956967""",""" …","""SUTURE SILK SZ…","""45.00""","""45.00"""
"""300171""","""0C1713 …","""PLATE HAND PRF…","""3737.00""","""3737.00"""


### Handling the upper chunk

In [98]:
upper_chunk.filter(pl.col('Procedure ID') == '999263')

Procedure ID,HCPCS/CPT Code,Description,Gross Charge,Discounted Cash Price (Gross Charges)
str,str,str,str,str
"""999263""","""094660 …","""CPAP VENT INT&…","""4931.30""","""4931.30"""


In [99]:
# Reshape the upper chunk to long format: one row per item and price column.
upper_chunk = (
    upper_chunk.rename({
        'Procedure ID':'local_code',
        'HCPCS/CPT Code':'hcpcs_cpt',
        'Description':'description'})
    .with_columns(
        # trim surrounding whitespace, then drop one leading zero
        # (the pattern is anchored, and str.replace only replaces the first match)
        pl.col('hcpcs_cpt').str.strip().str.replace('^0', '')
    )
    .with_columns(
        # codes that are empty after trimming become null
        pl.when(pl.col('hcpcs_cpt').str.lengths() == 0).then(None)
        .otherwise(pl.col('hcpcs_cpt')).keep_name()
    )
    # unpivot the remaining price columns: the column name goes to
    # payer_name and the value to standard_charge
    .melt(['local_code', 'hcpcs_cpt', 'description'], 
        value_name = 'standard_charge', 
        variable_name = 'payer_name'
    )
    .with_columns(
        # short category label for each of the two price columns
        pl.when(pl.col('payer_name') == 'Gross Charge').then('gross')
        .when(pl.col('payer_name') == 'Discounted Cash Price (Gross Charges)').then('cash')
        .alias('payer_category')
    )
)

In [100]:
upper_chunk.sample(10)

local_code,hcpcs_cpt,description,payer_name,standard_charge,payer_category
str,str,str,str,str,str
"""101505""","""C2623""","""CATH BLN 5F 4X…","""Discounted Cas…","""9747.00""","""cash"""
"""300347""","""C1713""","""PLATE BN 8HL 1…","""Discounted Cas…","""2945.00""","""cash"""
"""823295""",,"""DIVALPROEX 500…","""Gross Charge""","""22.89""","""gross"""
"""302310""","""82310""","""CALCIUM TOTAL …","""Gross Charge""","""615.85""","""gross"""
"""100961""","""C1713""","""STAPLE BN 20X2…","""Discounted Cas…","""11584.00""","""cash"""
"""101487""",,"""SIZER BREAST 3…","""Gross Charge""","""1052.00""","""gross"""
"""600735""","""C1776""","""CUP ACTB PNCL …","""Discounted Cas…","""7719.00""","""cash"""
"""999418""",,"""REM S/D ICD EL…","""Discounted Cas…","""14980.96""","""cash"""
"""601094""","""C1776""","""STEM FEM 15MM …","""Discounted Cas…","""7565.00""","""cash"""
"""288011""",,"""TIB/PER REVASC…","""Gross Charge""","""104444.89""","""gross"""


In [101]:
# Keep the first five characters as the code. When the value is longer than
# six characters, everything from position 5 onward is stored as `modifiers`;
# otherwise `modifiers` is null.
upper_chunk = upper_chunk.with_columns(
    pl.col('hcpcs_cpt').str.slice(0,5).keep_name(),
    pl.when(pl.col('hcpcs_cpt').str.lengths() > 6).then(pl.col('hcpcs_cpt').str.slice(5,)).alias('modifiers')
)

In [102]:
upper_chunk.sample(10)

local_code,hcpcs_cpt,description,payer_name,standard_charge,payer_category,modifiers
str,str,str,str,str,str,str
"""621461""",,"""TIP DLV NDL 12…","""Discounted Cas…","""1127.00""","""cash""",
"""605438""",,"""CUP OCCLDR 2.5…","""Discounted Cas…","""1507.00""","""cash""",
"""753982""",,"""NEEDLE BIOPSY …","""Gross Charge""","""1084.00""","""gross""",
"""992853""",,"""CATH CONNCTR U…","""Discounted Cas…","""329.00""","""cash""",
"""834160""","""J2930""","""METHYLPRED SUC…","""Discounted Cas…","""235.44""","""cash""",
"""601747""","""C1713""","""SCREW BN T10 3…","""Discounted Cas…","""766.00""","""cash""",
"""601540""","""C1887""","""CATH GD XB4.5 …","""Discounted Cas…","""1052.00""","""cash""",
"""904736""",,"""CATHETER XTRN …","""Discounted Cas…","""965.00""","""cash""",
"""620193""",,"""DEVICE SUT CPT…","""Gross Charge""","""4288.00""","""gross""",
"""800358""","""C1776""","""INS TIB 3X8 LT…","""Gross Charge""","""4968.00""","""gross""",


### Handling the lower chunks

In [104]:
# Normalise every section except the last one (the last has its own layout
# and is handled separately).
new_lower_chunks = []
for chunk in lower_chunks[:-1]:
    # the payer name is always the first field in the Procedure ID col
    payer = chunk['Procedure ID'][0]

    chunk = (
        chunk
        .with_columns([
            pl.lit(payer).alias('payer_name'),
            pl.lit('payer').alias('payer_category'),
        ])
        .rename({
            'Procedure ID':'description',
            'HCPCS/CPT Code':'code',
            'Description':'standard_charge_',
        })
        # these two price columns are not used for the payer sections
        .drop(['Gross Charge', 'Discounted Cash Price (Gross Charges)'])
        # skip the first two rows of the section (payer-name row, header row)
        .slice(2)
    )

    new_lower_chunks.append(chunk)

In [107]:
# The last section has a different layout: after renaming, its 'Description'
# and 'Gross Charge' columns are treated as Min and Max. Reshape it to the
# same five columns as the payer sections and append it.
new_lower_chunks.append(
    lower_chunks[-1]
    .drop('Discounted Cash Price (Gross Charges)')
    # drop the first row of the section (its header row)
    .slice(1,)
    .rename({
        'Procedure ID':'description',
        'HCPCS/CPT Code':'code',
        'Description':'Min',
        'Gross Charge':'Max',
    })
    # unpivot Min / Max so each becomes its own row
    .melt(['description', 'code'], variable_name = 'payer_name', value_name = 'standard_charge_')
    .with_columns(
        pl.when(pl.col('payer_name') == 'Min').then('min')
        .when(pl.col('payer_name') == 'Max').then('max')
        .alias('payer_category')
    )
    # same column order as the payer sections so the frames can be concatenated
    .select(['description', 'code', 'standard_charge_', 'payer_name', 'payer_category'])
)

In [108]:
# Stack all normalised lower sections into a single frame.
lower_chunk = pl.concat(new_lower_chunks)

In [111]:
lower_chunk.sample(10)

description,code,standard_charge_,payer_name,payer_category
str,str,str,str,str
"""Behavioral Hea…",,"""$506.00 ""","""BCBS HMO""","""payer"""
"""Cardiovascular…","""MS-DRG 228""","""$84,037.67 ""","""United""","""payer"""
"""Rehab""",,"""98% of CMG""","""Max""","""max"""
"""Oncology""","""MS-DRG 658""","""$16,322.33""","""Max""","""max"""
"""Lap Band""","""CPT/HCPC 43644…","""$11,681.00 ""","""Friday Health""","""payer"""
"""Diagnostic Ima…",,"""324.67% of FS""","""Cigna""","""payer"""
"""Rad Therapy""",,"""1117% of FS""","""Humana""","""payer"""
"""Gamma Knife""","""CPT/HCPC 61781…","""$40,178.00""","""Max""","""max"""
"""Cardiology""","""APR-DRG 160-16…","""108% of MCD""","""Superior HP MC…","""payer"""
"""Other Inpatien…",,"""105% of MCR""","""Superior HP MC…","""payer"""


In [112]:
def charge_cols():
    """
    Build the expressions that derive charge columns from the raw
    'standard_charge_' text and the 'description' column.

    Returns
    -------
    tuple of four polars expressions, in this order:

    standard_charge
        The raw text with the first '$', all commas and surrounding
        whitespace removed; null when the text is a recognised
        "<n>% of ..." rate.
    standard_charge_percent
        The number in front of '%' for recognised percentage rates,
        null otherwise.
    contracting_method
        'percent of total billed charge' for "% of BC" /
        "% of Billable Gross Charges", 'other' for the MCR / FS / MCD
        percentages, 'per diem' when the description mentions per diem,
        null otherwise.
    additional_generic_notes
        Which schedule (MCR, FS or MCD) a percentage refers to, else null.
    """

    # Keep a handle on the untouched column. The cleaned expression below
    # is null for percentage rows, so the percentage has to be extracted
    # from the raw text, not from the cleaned expression.
    raw_charge = pl.col('standard_charge_')

    percent_bc  = raw_charge.str.contains('% of BC|% of Billable Gross Charges')
    percent_mcr = raw_charge.str.contains('% of MCR')
    percent_fs  = raw_charge.str.contains('% of FS')
    percent_mcd = raw_charge.str.contains('% of MCD')
    is_percent  = percent_bc | percent_mcr | percent_fs | percent_mcd

    per_diem = pl.col('description').str.to_lowercase().str.contains('per diem')

    standard_charge = (
        pl.when(is_percent).then(None)
        .otherwise(raw_charge.str.replace(r'\$', '').str.replace_all(',', '').str.strip())
    ).alias('standard_charge')

    standard_charge_percent = (
        pl.when(is_percent)
        .then(raw_charge.str.extract(r'(\d+(?:\.\d+)?)%'))
    ).alias('standard_charge_percent')

    contracting_method = (
        pl.when(percent_bc).then(pl.lit('percent of total billed charge'))
        .when(percent_mcr | percent_fs | percent_mcd).then(pl.lit('other'))
        .when(per_diem).then(pl.lit('per diem'))
    ).alias('contracting_method')

    additional_generic_notes = (
        pl.when(percent_mcr).then(pl.lit('billed as % of MCR'))
        .when(percent_fs).then(pl.lit('billed as % of FS'))
        .when(percent_mcd).then(pl.lit('billed as % of MCD'))
    ).alias('additional_generic_notes')

    return standard_charge, standard_charge_percent, contracting_method, additional_generic_notes

In [113]:
lower_chunk.with_columns(*charge_cols()).to_pandas()

Unnamed: 0,description,code,standard_charge_,payer_name,payer_category,standard_charge,standard_charge_percent,contracting_method,additional_generic_notes
0,Additional days,"MS-DRG 768, 796-798, 805-807","$3,580.38",Aetna,payer,3580.38,,,
1,Additional days,MS-DRG 783-788,"$3,580.38",Aetna,payer,3580.38,,,
2,Angioplasty,CPT/HCPC 33967,"$14,536.92",Aetna,payer,14536.92,,,
3,Angioplasty,"CPT/HCPC 92920-92921, 92924-92925, 92928-92929...","$21,113.92",Aetna,payer,21113.92,,,
4,Cardiac Cath,"CPT/HCPC 0293T, 0294T, 0408T, 0410T, 0411T, 04...","$12,188.46",Aetna,payer,12188.46,,,
...,...,...,...,...,...,...,...,...,...
1713,Unlisted Grouper,,"$14,000.00",Max,max,14000.00,,,
1714,Urgent Care,,$241.00,Max,max,241.00,,,
1715,Urinary System,"CPT/HCPC 50080, 50081, 50543, 50544, 50590, 51...","$13,686.00",Max,max,13686.00,,,
1716,Urology,"MS-DRG 659-661, 693, 694, 707, 708; ICD 9/10 0...","$41,476.00",Max,max,41476.00,,,


### Extracting with multiple identifiers
Testing to see if any rows contain multiple identifiers

In [114]:
# Labels that can prefix a list of codes inside the `code` column.
identifiers = ['CPT/HCPC', 'REV', 'MS-DRG', 'ICD 9/10', 'APC', 'APR-DRG']
# id_ct = how many of those labels occur in each (upper-cased) code string.
lower_chunk.with_columns(
    pl.col('code').str.to_uppercase().apply(lambda x: sum([y in x for y in identifiers])).alias('id_ct')
).sort('id_ct').to_pandas()

Unnamed: 0,description,code,standard_charge_,payer_name,payer_category,id_ct
0,Detoxification,,"$1,889.31",Aetna,payer,
1,Grouper 1,,"$2,011.23",Aetna,payer,
2,Grouper 2,,"$2,187.31",Aetna,payer,
3,Grouper 3,,"$2,741.00",Aetna,payer,
4,Grouper 4,,"$2,976.85",Aetna,payer,
...,...,...,...,...,...,...
1713,Urology,"MS-DRG 659-661, 693, 694, 707, 708; ICD 9/10 0...","$41,476.00",Max,max,2.0
1714,Obstetrics,"MS-DRG 765-768, 774, 775, 783-788, 795-798, 80...",$680.00,Min,min,3.0
1715,Orthopedic,"MS-DRG 467-470, 495-499; CPT/HCPC 23470, 23472...","$1,580.00",Min,min,3.0
1716,Obstetrics,"MS-DRG 765-768, 774, 775, 783-788, 795-798, 80...","$12,724.85",Max,max,3.0


### Cleaning up the code column

In [115]:
# Split each code string on ';' into one row per piece, then recount the
# labels per piece.
lower_chunk.with_columns(
    # some codes have junk like \n in them
    pl.col('code').str.replace_all('\n', '').str.split(';')
).explode('code').with_columns(
    pl.col('code').str.to_uppercase().apply(lambda row: sum([id_ in row for id_ in identifiers])).alias('id_ct')
).sort('id_ct').to_pandas()

Unnamed: 0,description,code,standard_charge_,payer_name,payer_category,id_ct
0,Detoxification,,"$1,889.31",Aetna,payer,
1,Grouper 1,,"$2,011.23",Aetna,payer,
2,Grouper 2,,"$2,187.31",Aetna,payer,
3,Grouper 3,,"$2,741.00",Aetna,payer,
4,Grouper 4,,"$2,976.85",Aetna,payer,
...,...,...,...,...,...,...
1749,Ultrasound,"CPT/HCPC 75989, 76376, 76377, 76506, 76510-765...","$4,099.50",Max,max,1.0
1750,Urinary System,"CPT/HCPC 50080, 50081, 50543, 50544, 50590, 51...","$13,686.00",Max,max,1.0
1751,Urology,"MS-DRG 659-661, 693, 694, 707, 708","$41,476.00",Max,max,1.0
1752,Urology,"ICD 9/10 0TF3XZZ, 0TF4XZZ, 0TF6XZZ, 0TF7XZZ, ...","$41,476.00",Max,max,1.0


In [116]:
lower_chunk.filter(pl.col('code').str.contains('- ')).to_pandas()['code'][0]

'CPT/HCPC 0293T, 0294T, 0408T, 0410T, 0411T, 0413T, 0415T, 33340, 33477, C9741, 93451- 93462, 93503-93505, 93530-93533, 93590, 93591, 93592, G0448'

In [117]:
lower_chunk.filter(pl.col('code').str.contains('- ')).to_pandas()['code'][1]

"CPT/HCPC Cigna 2020 Grouper - lam and Lap Chole listed in Cigna's grouper schedule assigned to Grouper 9.  Groupers 79 and 99 removed from the grouper schedule."

We'll want to turn things like `93451- 93462` into a proper range. Here a range means two codes of the same length joined by either a dash or a dash followed by a space.

We also have rows like

```
'ICD 9/10 27125, 27130, 27132, 27134, 27137, 27138, 27445, 27446, 27447, 27486, 27487'
```
that are mislabeled: the values are CPT codes, but the row is labeled as ICD.

All rows where `code` is not null have an identifier:

In [118]:
# Sanity check: rows that have a code but a null label count.
# An empty result means every non-null code piece was counted.
lower_chunk.with_columns(
    pl.col('code').str.split(';')
).explode('code').with_columns(
    pl.col('code').str.to_uppercase().apply(lambda row: sum([id_ in row for id_ in identifiers])).alias('id_ct')
).filter(pl.col('id_ct').is_null()).filter(pl.col('code').is_not_null())

description,code,standard_charge_,payer_name,payer_category,id_ct
str,str,str,str,str,i64


In [119]:
# Make the split permanent: strip newlines, split `code` on ';' into one row
# per piece, and record how many identifier labels each piece contains.
lower_chunk = (
    lower_chunk
    .with_columns(pl.col('code').str.replace_all('\n', '').str.split(';'))
    .explode('code')
    .with_columns(pl.col('code').str.to_uppercase().apply(lambda row: sum([id_ in row for id_ in identifiers])).alias('id_ct'))
)

### Putting codes in their proper columns

In [120]:
# Maps a label pattern to the column that should receive the codes carrying
# that label. Keys are passed to str.contains / str.replace_all by the loop
# that consumes this mapping, so '|' in a key separates alternative labels.
col_map = {
    'CPT/HCPC|HCPC Codes':'hcpcs_cpt',
    'REV':'rev_code',
    'MS-DRG':'ms_drg',
    'ICD 9/10':'icd',
    'APC':'apc',
    'APR-DRG':'apr_drg',
    'CMG':'cmg', # for good measure
}

In [121]:
# For every label pattern, add a column holding the code text with the label
# removed and whitespace trimmed; rows whose code does not contain the label
# get null in that column.
# NOTE(review): the match here is on `code` as-is, whereas id_ct was computed
# on the upper-cased text; confirm no label appears in lower case.
for k, v in col_map.items():
    lower_chunk = lower_chunk.with_columns(
        pl.when(pl.col('code').str.contains(k)).then(pl.col('code').str.replace_all(k, '').str.strip()).alias(v)
    )

In [122]:
lower_chunk.sample(10)

description,code,standard_charge_,payer_name,payer_category,id_ct,hcpcs_cpt,rev_code,ms_drg,icd,apc,apr_drg,cmg
str,str,str,str,str,i64,str,str,str,str,str,str,str
"""Neurosurgery""","""MS-DRG 027""","""$49,896.67 ""","""United""","""payer""",1.0,,,"""027""",,,,
"""Medicine""","""CPT/HCPC 91037…","""$1,031.16 ""","""Cigna""","""payer""",1.0,"""91037""",,,,,,
"""Medicine""","""CPT/HCPC 96102…","""$754.16 ""","""Cigna""","""payer""",1.0,"""96102""",,,,,,
"""Female Genital…","""CPT/HCPC 58541…","""$15,854.00 ""","""Evry Health""","""payer""",1.0,"""58541-58544, 5…",,,,,,
"""Neonate""","""MS-DRG 789-795…","""$436.00""","""Min""","""min""",1.0,,,"""789-795""",,,,
"""Sleep Studies""","""CPT/HCPC 95782…","""$2,673.00""","""Max""","""max""",1.0,"""95782-95783, 9…",,,,,,
"""Cardiovascular…","""MS-DRG 229""","""$91,388.33 ""","""Cigna""","""payer""",1.0,,,"""229""",,,,
"""Other Outpatie…",,"""100% of MCR""","""HUMANA MCR PPO…","""payer""",,,,,,,,
"""Cardiovascular…","""MS-DRG 216-228…","""$13,058.00 ""","""BCBS HMO""","""payer""",1.0,,,"""216-228, 231, …",,,,
"""Medicine""","""CPT/HCPC 93325…","""$423.03 ""","""Cigna""","""payer""",1.0,"""93325""",,,,,,


In [123]:
# Anchors placed around every code pattern: start/end of string or a word
# boundary. Raw strings keep the regex escapes literal and avoid Python's
# invalid-escape-sequence warnings.
start = r'(^|\b)'
end = r'($|\b)'

def cpt_capture_template(pat):
    """
    Build a regex matching one CPT/HCPCS token whose code part is ``pat``.

    Alternatives, tried in this order:
      * a range:    ``pat``, a dash, an optional whitespace character, ``pat``
      * a modifier: ``pat``, a dash, exactly two word characters
      * a single code: ``pat`` alone

    Returns the pattern as a string.
    """
    range_grp = rf'{start}{pat}-\s?{pat}{end}'
    modifier_grp = rf'{start}{pat}-\w{{2}}{end}'
    singlet_grp = rf'{start}{pat}{end}'
    return f'(?:({range_grp})|({modifier_grp})|({singlet_grp}))'

def other_capture_template(pat):
    """
    Build a regex matching either a range of ``pat`` (dash, optional
    whitespace character) or a single ``pat``; there is no modifier form.

    Returns the pattern as a string.
    """
    range_grp = rf'{start}{pat}-\s?{pat}{end}'
    singlet_grp = rf'{start}{pat}{end}'
    return f'(?:({range_grp})|({singlet_grp}))'

In [124]:
# Regex fragments for a single code of each kind. Raw strings keep the
# escapes literal and avoid Python's invalid-escape-sequence warnings.
cpt_pats = [r'[A-Z]\d{4}',  #A1234
            r'\d{4}[A-Z]',  #1234A
            r'\d{5}',       #12345
]

icd_pats = [
    r'\w{7}',            # any 7 char sequence TODO
    r'\d{3}\.?\d{0,2}',  # ICD9: 123, 123.1, 123.12
    r'E\d{3}\.?\d?',     # ICD9: E123, E123.1
    r'V\d{2}\.?\d{0,2}', # ICD9: V12, V12.1, V12.12
]

# Plain alternations of the single-code fragments.
cpt_pat = '|'.join(cpt_pats)
icd_pat = '|'.join(icd_pats)
ms_drg_pat = r'\d{3}x?'
apr_drg_pat = r'\d{3}(\d|x)?'
apc_pat = r'\d{2,4}'
rev_pat = r'\d{3,4}'

# Full token templates (range / modifier / single code) used to pull every
# code out of a free-text list.
cpt_template = '|'.join(cpt_capture_template(pat) for pat in cpt_pats)
icd_template = '|'.join(other_capture_template(pat) for pat in icd_pats)
ms_drg_template = other_capture_template(ms_drg_pat)
apr_drg_template = other_capture_template(apr_drg_pat)
apc_template = other_capture_template(apc_pat)
rev_template = other_capture_template(rev_pat)

In [125]:
def split_through(colname, pat):
    """
    Split an extracted token in ``colname`` into its first code and, for a
    range, its end code.

    Parameters
    ----------
    colname : name of the string column holding one token per row.
    pat     : regex for a single code of that kind.

    Returns
    -------
    tuple of two expressions:
      * ``thru_<colname>``: the code after the dash when the token ends in
        one, else null. An optional whitespace character after the dash is
        accepted, so "93451- 93462" yields "93462" just like "93451-93462".
      * ``<colname>``: the code at the start of the token.
    """
    return (
        pl.col(colname).str.extract(rf'-\s?({pat})$').alias(f'thru_{colname}'),
        pl.col(colname).str.extract(f'^({pat})-?').alias(colname)
    )

In [126]:
# Preview of the full code-extraction pipeline on a random sample of rows.
(
    lower_chunk.with_columns([

        # we need to string capture the CPT codes when they're mislabeled as ICDs
        pl.concat_str([
            pl.col('hcpcs_cpt').fill_null(''), pl.col('icd').fill_null('')
        ]).str.extract_all(cpt_template).alias('hcpcs_cpt'),

        # remove the extracted CPT codes before we look for ICD codes.
        # The boundary-aware template is used (not the bare alternation) so a
        # 5-digit run inside a longer ICD code, e.g. 02703ZZ, is left intact.
        pl.col('icd').str.replace_all(cpt_template, '').str.extract_all(icd_template).keep_name(),

        # these appear to be OK
        pl.col('ms_drg').str.extract_all(ms_drg_template).keep_name(),
        pl.col('apr_drg').str.extract_all(apr_drg_template).keep_name(),
        pl.col('apc').str.extract_all(apc_template).keep_name(),
        pl.col('rev_code').str.extract_all(rev_template).keep_name(),
    ])

    # all these need to be exploded separately since they have different array lengths
    .explode('hcpcs_cpt').explode('ms_drg').explode('icd').explode('apc').explode('apr_drg').explode('rev_code')

    .with_columns([
        # each helper yields the range end (thru_<col>) and the leading code
        *split_through('hcpcs_cpt', cpt_pat),
        *split_through('icd', icd_pat),
        *split_through('ms_drg', ms_drg_pat),
        *split_through('apr_drg', apr_drg_pat),
        *split_through('apc', apc_pat),
        *split_through('rev_code', rev_pat),
        # a two-character suffix after the dash is a modifier
        pl.col('hcpcs_cpt').str.extract(r'-(\w{2})$').alias('modifiers'),
    ])
    .with_columns(
        # collapse the per-kind range ends into a single `thru` column
        pl.coalesce(pl.col(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])).alias('thru')
    )
    .select([pl.exclude('^thru_.*$')])

).sample(20)

description,code,standard_charge_,payer_name,payer_category,id_ct,hcpcs_cpt,rev_code,ms_drg,icd,apc,apr_drg,cmg,modifiers,thru
str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str
"""Laparoscopic""","""CPT/HCPC 43289…","""$10,131.33""","""Max""","""max""",1.0,"""58541""",,,,,,,,"""58546"""
"""Chemotherapy D…","""CPT/HCPC A9513…","""41.35% of BC""","""Max""","""max""",1.0,"""J9035""",,,,,,,,
"""Behavioral Hea…",,"""$386.00 ""","""BCBS Blue Adva…","""payer""",,,,,,,,,,
"""PT/OT/ST""","""CPT/HCPC 92507…","""634.58% of FS""","""Aetna""","""payer""",1.0,"""92507""",,,,,,,,"""92508"""
"""Trauma""",,"""28% of BC""","""Friday Health""","""payer""",,,,,,,,,,
"""Neurosurgery""","""CPT/HCPC 22532…","""$35,000.00 ""","""Evry Health""","""payer""",1.0,"""22610""",,,,,,,,
"""Other Surgical…","""CPT/HCPC 15002…","""$3,475.00 ""","""Molina MCR ""","""payer""",1.0,"""69631""",,,,,,,,"""69676"""
"""Orthopedics""",""" CPT/HCPC , 1…","""$8,313.67""","""Min""","""min""",1.0,"""23472""",,,,,,,,"""23474"""
"""Male Genital S…","""CPT/HCPC 54400…","""$21,079.33""","""Min""","""min""",1.0,"""54405""",,,,,,,,
"""Angioplasty""","""CPT/HCPC 92928…","""$25,132.00 ""","""United""","""payer""",1.0,"""92937""",,,,,,,,"""92938"""


### Putting it all together

In [127]:
# Final extraction: pull every code token into its own row, split ranges into
# start / thru, pick up modifiers, and derive the charge columns.
lower_chunk = (
    lower_chunk.with_columns([

        # we need to string capture the CPT codes when they're mislabeled as ICDs
        pl.concat_str([
            pl.col('hcpcs_cpt').fill_null(''), pl.col('icd').fill_null('')
        ]).str.extract_all(cpt_template).alias('hcpcs_cpt'),

        # remove the extracted CPT codes before we look for ICD codes.
        # The boundary-aware template is used (not the bare alternation) so a
        # 5-digit run inside a longer ICD code, e.g. 02703ZZ, is left intact.
        pl.col('icd').str.replace_all(cpt_template, '').str.extract_all(icd_template).keep_name(),

        # these appear to be OK
        pl.col('ms_drg').str.extract_all(ms_drg_template).keep_name(),
        pl.col('apr_drg').str.extract_all(apr_drg_template).keep_name(),
        pl.col('apc').str.extract_all(apc_template).keep_name(),
        pl.col('rev_code').str.extract_all(rev_template).keep_name(),
    ])

    # all these need to be exploded separately since they have different array lengths
    .explode('hcpcs_cpt').explode('ms_drg').explode('icd').explode('apc').explode('apr_drg').explode('rev_code')

    .with_columns([
        # each helper yields the range end (thru_<col>) and the leading code
        *split_through('hcpcs_cpt', cpt_pat),
        *split_through('icd', icd_pat),
        *split_through('ms_drg', ms_drg_pat),
        *split_through('apr_drg', apr_drg_pat),
        *split_through('apc', apc_pat),
        *split_through('rev_code', rev_pat),
        # a two-character suffix after the dash is a modifier
        pl.col('hcpcs_cpt').str.extract(r'-(\w{2})$').alias('modifiers'),
    ])
    .with_columns(
        # collapse the per-kind range ends into a single `thru` column
        pl.coalesce(pl.col(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])).alias('thru')
    ).drop(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])
    .with_columns(
        *charge_cols()
    )
)

In [128]:
# Give both frames the same set of columns before stacking them: a column
# that exists in only one frame is added to the other as an all-null column
# of the same dtype.
for name in lower_chunk.columns:
    if name not in upper_chunk.columns:
        null_col = pl.lit(None).cast(lower_chunk[name].dtype).alias(name)
        upper_chunk = upper_chunk.with_columns(null_col)

for name in upper_chunk.columns:
    if name not in lower_chunk.columns:
        null_col = pl.lit(None).cast(upper_chunk[name].dtype).alias(name)
        lower_chunk = lower_chunk.with_columns(null_col)

# Put the columns in the same order, then stack.
upper_chunk = upper_chunk.select(lower_chunk.columns)
df = pl.concat([upper_chunk, lower_chunk])

In [129]:
# Columns kept in the output, in their final order; any column of the
# combined frame that is not listed here is dropped.
output_columns = [
    # what the row describes
    'description', 'local_code', 'code',
    # parsed code columns
    'hcpcs_cpt', 'modifiers', 'rev_code', 'ms_drg', 'apr_drg', 'apc', 'icd', 'thru',
    # who the price applies to
    'payer_name', 'payer_category',
    # raw and derived charge columns
    'standard_charge_', 'standard_charge', 'standard_charge_percent',
    'contracting_method', 'additional_generic_notes',
]
df = df.select(output_columns)

In [142]:
def split_modifier_string(s):
    """
    Cut ``s`` into consecutive two-character pieces and join them with '|'.

    A trailing single character (odd length) is kept as its own piece;
    an empty string is returned unchanged.
    """
    pieces = []
    offset = 0
    while offset < len(s):
        pieces.append(s[offset:offset + 2])
        offset += 2
    return "|".join(pieces)

In [143]:
# Rewrite each modifiers string as '|'-separated two-character pieces.
df = df.with_columns(
    pl.col('modifiers').apply(split_modifier_string).alias('modifiers')
)

In [145]:
# Write the combined frame to a CSV file in the current working directory.
df.write_csv('hca_out.csv')