In [559]:
import polars as pl

In [560]:
url = 'https://core.secure.ehc.com/\
src/util/detail-price-list/\
050631189_hca-houston-pearland_standardcharges.csv'
df = pl.read_csv(url, skip_rows = 1, infer_schema_length = 0)

In [561]:
def get_upper_chunk(df):
    """
    Get just the upper part of the dataframe
    with the HCPCS/CPT codes
    """
    
    df = df.with_row_count()

    for row in df.iter_rows(named = True):
        if not row['Description']:
            end_row = row['row_nr']
            break
    
    df = df.drop('row_nr').slice(0, end_row)
    
    return df

In [590]:
def get_lower_chunks(df):
    """
    Return the lower section(s) as an array
    of dataframe chunks
    """
    
    df = df.with_row_count()
    
    slices = []
    
    for row in df.iter_rows():
        
        if row[3] == 'Rate':
            start = row[0] - 1
            
        if not any(row[1:]):
            end = row[0]
            slices.append((start, end))
            start, end = None, None
            
        if row[1] == 'Service':
            end = row[0]
            slices.append((start, end))
            
            start, end = row[0], None
        
    slices.append((start, row[0]))
        
    chunks = []
    for slice_ in slices:
        s, e = slice_
        chunks.append(df.slice(s, e - s).drop('row_nr'))
        
    return chunks

In [591]:
upper_chunk = get_upper_chunk(df)
lower_chunks = get_lower_chunks(df)

In [592]:
upper_chunk.sample(5)

Procedure ID,HCPCS/CPT Code,Description,Gross Charge,Discounted Cash Price (Gross Charges)
str,str,str,str,str
"""601260""","""0C1725 …","""CATH BLN EVRCR…","""2028.00""","""2028.00"""
"""303735""","""083735 …","""MAGNESIUM BLD …","""865.46""","""865.46"""
"""743333""","""0C1887 …","""CATH GD XB3.5 …","""1052.00""","""1052.00"""
"""319065""","""087081 …","""CULT LEGIONELL…","""888.35""","""888.35"""
"""904012""","""0C1713 …","""SCREW V-AX NLC…","""284.04""","""284.04"""


### Handling the upper chunk

In [593]:
upper_chunk = (
    upper_chunk.rename({
        'Procedure ID':'local_code',
        'HCPCS/CPT Code':'hcpcs_cpt',
        'Description':'description'})
    .with_columns(
        # remove extra spaces and leading zeros
        pl.col('hcpcs_cpt').str.strip().str.replace('^0', '')
    )
    .with_columns(
        pl.when(pl.col('hcpcs_cpt').str.lengths() == 0).then(None)
        .otherwise(pl.col('hcpcs_cpt')).keep_name()
    )
    .melt(['local_code', 'hcpcs_cpt', 'description'], 
        value_name = 'standard_charge', 
        variable_name = 'payer_name'
    )
    .with_columns(
        pl.when(pl.col('payer_name') == 'Gross Charge').then('gross')
        .when(pl.col('payer_name') == 'Discounted Cash Price (Gross Charges)').then('cash')
        .alias('payer_category')
    )
)

In [594]:
upper_chunk.sample(10)

local_code,hcpcs_cpt,description,payer_name,standard_charge,payer_category
str,str,str,str,str,str
"""100457""","""96375""","""IVP EA ADD SEQ…","""Gross Charge""","""815.32""","""gross"""
"""100747""","""C1769""","""GWIRE BN DISP …","""Discounted Cas…","""1507.00""","""cash"""
"""102569""","""C1813""","""RESERVOIR PENL…","""Discounted Cas…","""18663.00""","""cash"""
"""101221""",,"""MELATONIN 3MG …","""Gross Charge""","""2.18""","""gross"""
"""701879""",,"""PACKING 8CM NS…","""Discounted Cas…","""3380.00""","""cash"""
"""621464""","""C1713""","""PLATE BN ANKL …","""Discounted Cas…","""4320.00""","""cash"""
"""622196""","""C2623""","""CATH BLNDIL 5M…","""Discounted Cas…","""18419.00""","""cash"""
"""601976""","""C1713""","""SCREW BN 2.5X3…","""Discounted Cas…","""3563.00""","""cash"""
"""600820""",,"""SYS SPEC RTRVL…","""Gross Charge""","""1316.00""","""gross"""
"""621666""",,"""BRA POST-OP MD…","""Discounted Cas…","""965.00""","""cash"""


In [595]:
upper_chunk = upper_chunk.with_columns(
    pl.col('hcpcs_cpt').str.slice(0,5).keep_name(),
    pl.when(pl.col('hcpcs_cpt').str.lengths() > 6).then(pl.col('hcpcs_cpt').str.slice(5,)).alias('modifiers')
)

In [596]:
upper_chunk.sample(10)

local_code,hcpcs_cpt,description,payer_name,standard_charge,payer_category,modifiers
str,str,str,str,str,str,str
"""101298""","""C1713""","""PEG THREAD LCK…","""Discounted Cas…","""749.00""","""cash""",
"""601875""","""C1713""","""SCREW 2.3 22 H…","""Gross Charge""","""744.00""","""gross""",
"""825990""",,"""KETOROLAC TROM…","""Discounted Cas…","""66.49""","""cash""",
"""620178""",,"""FILLER BONE VO…","""Gross Charge""","""7423.00""","""gross""",
"""303557""","""84030""","""NEWBORN SCREEN…","""Discounted Cas…","""220.18""","""cash""",
"""286030""","""92960""","""CARDIOVERSION …","""Gross Charge""","""4651.03""","""gross""",
"""300306""","""C1713""","""PLATE MED STD …","""Discounted Cas…","""1520.00""","""cash""",
"""101930""","""C1713""","""SCREW SPN SD V…","""Gross Charge""","""769.00""","""gross""",
"""101650""",,"""FIXATOR KIT FR…","""Discounted Cas…","""18833.00""","""cash""",
"""672446""",,"""ELECTRODE ESUR…","""Gross Charge""","""101.00""","""gross""",


### Handling the lower chunks

In [597]:
new_lower_chunks = []
for chunk in lower_chunks[:-1]:
    chunk = (
        chunk
        .drop(['Gross Charge', 'Discounted Cash Price (Gross Charges)'])
        # the payer name is always the first field in the Procedure ID col
        .with_columns([
            pl.lit(chunk['Procedure ID'][0]).alias('payer_name'),
            pl.lit('payer').alias('payer_category')
        ])
        .rename({
            'Procedure ID':'description',
            'HCPCS/CPT Code':'code',
            'Description':'standard_charge_'
        })
        .slice(2,)
    )
    new_lower_chunks.append(chunk)

In [598]:
new_lower_chunks.append(
    lower_chunks[-1]
    .drop('Discounted Cash Price (Gross Charges)')
    .slice(1,)
    .rename({
        'Procedure ID':'description',
        'HCPCS/CPT Code':'code',
        'Description':'Min',
        'Gross Charge':'Max',
    })
    .melt(['description', 'code'], variable_name = 'payer_name', value_name = 'standard_charge_')
    .with_columns(
        pl.when(pl.col('payer_name') == 'Min').then('min')
        .when(pl.col('payer_name') == 'Max').then('max')
        .alias('payer_category')
    )
    .select(['description', 'code', 'standard_charge_', 'payer_name', 'payer_category'])
)

In [599]:
new_lower_chunks[6]

description,code,standard_charge_,payer_name,payer_category
str,str,str,str,str
"""ER""",,"""100% of MCR""","""BCBS TX MCR""","""payer"""
"""Observation """,,"""100% of MCR""","""BCBS TX MCR""","""payer"""
"""Other Inpatien…",,"""100% of MCR""","""BCBS TX MCR""","""payer"""
"""Other Outpatie…",,"""100% of MCR""","""BCBS TX MCR""","""payer"""
"""Other Surgical…",,"""100% of MCR""","""BCBS TX MCR""","""payer"""
"""Rehab""",,"""100% of MCR""","""BCBS TX MCR""","""payer"""


In [600]:
lower_chunk = pl.concat(new_lower_chunks)

In [601]:
lower_chunk.filter(pl.col('description').is_null())

description,code,standard_charge_,payer_name,payer_category
str,str,str,str,str


In [602]:
lower_chunk.sample(10)

description,code,standard_charge_,payer_name,payer_category
str,str,str,str,str
"""Grouper 1""","""CPT/HCPC lam, …","""$1,713.00""","""Min""","""min"""
"""Ultrasound""",,"""$185.50 ""","""Oscar""","""payer"""
"""Female Genital…","""CPT/HCPC 55866…","""$5,034.85""","""Min""","""min"""
"""All Outpatient…",,"""169% of MCR""","""Molina Healthc…","""payer"""
"""Sleep Studies""","""CPT/HCPC 95782…","""634.58% of FS""","""Aetna""","""payer"""
"""Lab/Path/Trans…",,"""541.33% of FS""","""Cigna""","""payer"""
"""Neurosurgery""","""CPT/HCPC 22800…","""$40,000.00 ""","""Evry Health""","""payer"""
"""Behavioral Hea…",,"""$554.00 ""","""BCBS PPO""","""payer"""
"""ICU/ CCU/ PICU…",,"""$1,331.00""","""Min""","""min"""
"""Medicine""","""CPT/HCPC 95813…","""$603.08 ""","""Cigna""","""payer"""


In [603]:
def charge_cols():
    
    standard_charge = pl.col('standard_charge_')
    
    percent_bc  = standard_charge.str.contains('% of BC|% of Billable Gross Charges')
    percent_mcr = standard_charge.str.contains('% of MCR')
    percent_fs  = standard_charge.str.contains('% of FS')
    percent_mcd = standard_charge.str.contains('% of MCR')
    
    description    = pl.col('description')
    
    per_diem    = description.str.to_lowercase().str.contains('per diem')
    
    standard_charge = (
        pl.when(percent_bc|percent_mcr|percent_fs|percent_mcd).then(None)
        .otherwise(standard_charge.str.replace('\$', '').str.replace_all(',', '').str.strip())
    ).alias('standard_charge')
    
    standard_charge_percent = (
        pl.when(percent_bc|percent_mcr|percent_fs|percent_mcd)
        .then(standard_charge.str.extract('(\d+(?:\.\d+)?)%'))
    ).alias('standard_charge_percent')
    
    contracting_method = (
        pl.when(percent_bc).then('percent of total billed charge')
        .when(percent_mcr|percent_fs|percent_mcd).then('other')
        .when(per_diem).then('per diem')
    ).alias('contracting_method')
    
    additional_generic_notes = (
        pl.when(percent_mcr).then('billed as % of MCR')
        .when(percent_fs).then('billed as % of FS')
        .when(percent_mcd).then('billed as % of MCD')
    ).alias('additional_generic_notes')
    
    return standard_charge, standard_charge_percent, contracting_method, additional_generic_notes

In [604]:
lower_chunk.with_columns(*charge_cols()).to_pandas()

Unnamed: 0,description,code,standard_charge_,payer_name,payer_category,standard_charge,standard_charge_percent,contracting_method,additional_generic_notes
0,Additional days,"MS-DRG 768, 796-798, 805-807","$3,580.38",Aetna,payer,3580.38,,,
1,Additional days,MS-DRG 783-788,"$3,580.38",Aetna,payer,3580.38,,,
2,Angioplasty,CPT/HCPC 33967,"$14,536.92",Aetna,payer,14536.92,,,
3,Angioplasty,"CPT/HCPC 92920-92921, 92924-92925, 92928-92929...","$21,113.92",Aetna,payer,21113.92,,,
4,Cardiac Cath,"CPT/HCPC 0293T, 0294T, 0408T, 0410T, 0411T, 04...","$12,188.46",Aetna,payer,12188.46,,,
...,...,...,...,...,...,...,...,...,...
1711,Ultrasound,"CPT/HCPC 75989, 76376, 76377, 76506, 76510-765...","$4,099.50",Max,max,4099.50,,,
1712,Unlisted Grouper,,"$14,000.00",Max,max,14000.00,,,
1713,Urgent Care,,$241.00,Max,max,241.00,,,
1714,Urinary System,"CPT/HCPC 50080, 50081, 50543, 50544, 50590, 51...","$13,686.00",Max,max,13686.00,,,


### Extracting with multiple identifiers
Testing to see if any rows contain multiple identifiers

In [605]:
identifiers = ['CPT/HCPC', 'REV', 'MS-DRG', 'ICD 9/10', 'APC', 'APR-DRG']
lower_chunk.with_columns(
    pl.col('code').str.to_uppercase().apply(lambda x: sum([y in x for y in identifiers])).alias('id_ct')
).sort('id_ct').to_pandas()

Unnamed: 0,description,code,standard_charge_,payer_name,payer_category,id_ct
0,Detoxification,,"$1,889.31",Aetna,payer,
1,Grouper 1,,"$2,011.23",Aetna,payer,
2,Grouper 2,,"$2,187.31",Aetna,payer,
3,Grouper 3,,"$2,741.00",Aetna,payer,
4,Grouper 4,,"$2,976.85",Aetna,payer,
...,...,...,...,...,...,...
1711,Urology,"MS-DRG 659-661, 693, 694, 707, 708; ICD 9/10 0...","$41,476.00",Max,max,2.0
1712,Obstetrics,"MS-DRG 765-768, 774, 775, 783-788, 795-798, 80...",$680.00,Min,min,3.0
1713,Orthopedic,"MS-DRG 467-470, 495-499; CPT/HCPC 23470, 23472...","$1,580.00",Min,min,3.0
1714,Obstetrics,"MS-DRG 765-768, 774, 775, 783-788, 795-798, 80...","$12,724.85",Max,max,3.0


### Cleaning up the code column

In [606]:
lower_chunk.with_columns(
    # some codes have junk like \n in them
    pl.col('code').str.replace_all('\n', '').str.split(';')
).explode('code').with_columns(
    pl.col('code').str.to_uppercase().apply(lambda row: sum([id_ in row for id_ in identifiers])).alias('id_ct')
).sort('id_ct').to_pandas()

Unnamed: 0,description,code,standard_charge_,payer_name,payer_category,id_ct
0,Detoxification,,"$1,889.31",Aetna,payer,
1,Grouper 1,,"$2,011.23",Aetna,payer,
2,Grouper 2,,"$2,187.31",Aetna,payer,
3,Grouper 3,,"$2,741.00",Aetna,payer,
4,Grouper 4,,"$2,976.85",Aetna,payer,
...,...,...,...,...,...,...
1747,Transplant,"ICD 9/10 02HA0QZ, 02HA0RZ, 02HA3QZ, 02HA4QZ, ...","$151,648.67",Max,max,1.0
1748,Ultrasound,"CPT/HCPC 75989, 76376, 76377, 76506, 76510-765...","$4,099.50",Max,max,1.0
1749,Urinary System,"CPT/HCPC 50080, 50081, 50543, 50544, 50590, 51...","$13,686.00",Max,max,1.0
1750,Urology,"MS-DRG 659-661, 693, 694, 707, 708","$41,476.00",Max,max,1.0


In [607]:
lower_chunk.filter(pl.col('code').str.contains('- ')).to_pandas()['code'][0]

'CPT/HCPC 0293T, 0294T, 0408T, 0410T, 0411T, 0413T, 0415T, 33340, 33477, C9741, 93451- 93462, 93503-93505, 93530-93533, 93590, 93591, 93592, G0448'

In [608]:
lower_chunk.filter(pl.col('code').str.contains('- ')).to_pandas()['code'][1]

"CPT/HCPC Cigna 2020 Grouper - lam and Lap Chole listed in Cigna's grouper schedule assigned to Grouper 9.  Groupers 79 and 99 removed from the grouper schedule."

We'll want to replace things like `93451- 93462` with the appropriate range. A range is when you have two values with the same length connected by either a dash or a dash + space.

We also have rows like

```
'ICD 9/10 27125, 27130, 27132, 27134, 27137, 27138, 27445, 27446, 27447, 27486, 27487'
```
that are totally mislabeled.

All rows where `code` is not null have an identifier:

In [609]:
lower_chunk.with_columns(
    pl.col('code').str.split(';')
).explode('code').with_columns(
    pl.col('code').str.to_uppercase().apply(lambda row: sum([id_ in row for id_ in identifiers])).alias('id_ct')
).filter(pl.col('id_ct').is_null()).filter(pl.col('code').is_not_null())

description,code,standard_charge_,payer_name,payer_category,id_ct
str,str,str,str,str,i64


In [610]:
lower_chunk = (
    lower_chunk
    .with_columns(pl.col('code').str.replace_all('\n', '').str.split(';'))
    .explode('code')
    .with_columns(pl.col('code').str.to_uppercase().apply(lambda row: sum([id_ in row for id_ in identifiers])).alias('id_ct'))
)

### Putting codes in their proper columns

In [611]:
col_map = {
    'CPT/HCPC|HCPC Codes':'hcpcs_cpt',
    'REV':'rev_code',
    'MS-DRG':'ms_drg',
    'ICD 9/10':'icd',
    'APC':'apc',
    'APR-DRG':'apr_drg',
    'CMG':'cmg', # for good measure
}

In [612]:
for k, v in col_map.items():
    lower_chunk = lower_chunk.with_columns(
        pl.when(pl.col('code').str.contains(k)).then(pl.col('code').str.replace_all(k, '').str.strip()).alias(v)
    )

In [613]:
lower_chunk.sample(10)

description,code,standard_charge_,payer_name,payer_category,id_ct,hcpcs_cpt,rev_code,ms_drg,icd,apc,apr_drg,cmg
str,str,str,str,str,i64,str,str,str,str,str,str,str
"""Medicine""","""CPT/HCPC 95864…","""$314.76 ""","""Cigna""","""payer""",1.0,"""95864""",,,,,,
"""Grouper 5""",,"""$7,000.00 ""","""Evry Health""","""payer""",,,,,,,,
"""PTCA""","""CPT/HCPC 35450…","""$7,566.00""","""Min""","""min""",1.0,"""35450-35476, 3…",,,,,,
"""Medicine""","""CPT/HCPC 95929…","""$314.76 ""","""Cigna""","""payer""",1.0,"""95929""",,,,,,
"""Table 5""",,"""26.04% of BC""","""United""","""payer""",,,,,,,,
"""Outpatient""",,"""20.67% of BC""","""Superior""","""payer""",,,,,,,,
"""Other Surgical…","""CPT/HCPC 19120…","""$2,369.00 ""","""Superior HP MC…","""payer""",1.0,"""19120, 19125-1…",,,,,,
"""All Other Outp…",,"""$9,470.00 ""","""Cigna""","""payer""",,,,,,,,
"""Rehab""",,"""100% of MCR""","""BCBS TX MCR""","""payer""",,,,,,,,
"""Neurosurgery""","""MS-DRG 457""","""$37,510.00 ""","""United""","""payer""",1.0,,,"""457""",,,,


In [614]:
start = '(^|\\b)'
end = '($|\\b)'

def cpt_capture_template(pat):
    range_grp = f'{start}{pat}-\s?{pat}{end}'
    modifier_grp = f'{start}{pat}-\w{{2}}{end}'
    singlet_grp = f'{start}{pat}{end}'
    return f'(?:({range_grp})|({modifier_grp})|({singlet_grp}))'

def other_capture_template(pat):
    range_grp = f'{start}{pat}-\s?{pat}{end}'
    singlet_grp = f'{start}{pat}{end}'
    return f'(?:({range_grp})|({singlet_grp}))'

In [615]:
cpt_pats = ['[A-Z]\d{4}',  #A1234
            '\d{4}[A-Z]',  #1234A
            '\d{5}',       #12345
]

icd_pats = [
    '\w{7}',            # any 7 char sequence TODO
    '\d{3}\.?\d{0,2}',  # ICD9: 123, 123.1, 123.12
    'E\d{3}\.?\d?',     # ICD9: E123, E123.1
    'V\d{2}\.?\d{0,2}', # ICD9: V12, V12.123
]

cpt_pat = '|'.join(cpt_pats)
icd_pat = '|'.join(icd_pats)
ms_drg_pat = '\d{3}x?'
apr_drg_pat = '\d{3}(\d|x)?'
apc_pat = '\d{2,4}'
rev_pat = '\d{3,4}'

cpt_template = '|'.join(cpt_capture_template(pat) for pat in cpt_pats)
icd_template = '|'.join(other_capture_template(pat) for pat in icd_pats)
ms_drg_template = other_capture_template(ms_drg_pat)
apr_drg_template = other_capture_template(apr_drg_pat)
apc_template = other_capture_template(apc_pat)
rev_template = other_capture_template(rev_pat)

In [616]:
def split_through(colname, pat):
    return (
        pl.col(colname).str.extract(f'-({pat})$').alias(f'thru_{colname}'),
        pl.col(colname).str.extract(f'^({pat})-?').alias(colname)
    )

In [617]:
(
    lower_chunk.with_columns([
    
        # we need to string capture the CPT codes when they're mislabled as ICDs
        pl.concat_str([
            pl.col('hcpcs_cpt').fill_null(''), pl.col('icd').fill_null('')
        ]).str.extract_all(cpt_template).alias('hcpcs_cpt'),
        
        # remove the extracted CPT codes before we look for ICD codes
        pl.col('icd').str.replace_all(cpt_pat, '').str.extract_all(icd_template).keep_name(),

        # these appear to be OK
        pl.col('ms_drg').str.extract_all(ms_drg_template).keep_name(),
        pl.col('apr_drg').str.extract_all(apr_drg_template).keep_name(),
        pl.col('apc').str.extract_all(apc_template).keep_name(),
        pl.col('rev_code').str.extract_all(rev_template).keep_name(),
    ])
    
    # all these need to be exploded separately since they have different array lengths
    .explode('hcpcs_cpt').explode('ms_drg').explode('icd').explode('apc').explode('apr_drg').explode('rev_code')
    
    .with_columns([
        *split_through('hcpcs_cpt', cpt_pat),
        *split_through('icd', icd_pat),
        *split_through('ms_drg', ms_drg_pat),
        *split_through('apr_drg', apr_drg_pat),
        *split_through('apc', apc_pat),
        *split_through('rev_code', rev_pat),
        pl.col('hcpcs_cpt').str.extract(f'-(\w{{2}})$').alias('modifiers'),
    ])    
    .with_columns(
        pl.coalesce(pl.col(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])).alias('thru')
    ).drop(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])

).sample(20)

description,code,standard_charge_,payer_name,payer_category,id_ct,hcpcs_cpt,rev_code,ms_drg,icd,apc,apr_drg,cmg,modifiers,thru
str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str
"""EP Study""","""CPT/HCPC 93600…","""263% of FS""","""BCBS HMO""","""payer""",1.0,"""93624""",,,,,,,,
"""Lab/Path/Trans…","""CPT/HCPC D0416…","""648.19% of FS""","""Aetna""","""payer""",1.0,"""Q0115""",,,,,,,,
"""ESWL Lithotrip…","""CPT/HCPC 50590…","""$4,450.00 ""","""Molina MCR ""","""payer""",1.0,"""50590""",,,,,,,,
"""Lab/Path/Trans…","""CPT/HCPC 58323…","""648.19% of FS""","""Max""","""max""",1.0,"""G0328""",,,,,,,,
"""Lab/Path/Trans…","""CPT/HCPC 58323…","""100% of FS""","""Min""","""min""",1.0,"""P3001""",,,,,,,,
"""Medicine""",""" CPT/HCPC 9076…","""$25,000.00""","""Max""","""max""",1.0,"""93278""",,,,,,,,
"""Transplant""","""MS-DRG 001, 00…","""35.8% of BC""","""BCBS Blue Adva…","""payer""",1.0,,,"""002""",,,,,,
"""Orthopedic""","""CPT/HCPC 23470…","""$15,467.00 ""","""Oscar""","""payer""",1.0,"""27132""",,,,,,,,
"""EP Studies""","""CPT/HCPC 93600…","""$17,084.00""","""Min""","""min""",1.0,"""93600""",,,,,,,,
"""Orthopedic""","""CPT/HCPC 27702…","""125% of MCR""","""Devoted Health…","""payer""",1.0,"""27130""",,,,,,,,


### Putting it all together

In [618]:
lower_chunk = (
  lower_chunk.with_columns([
    
        # we need to string capture the CPT codes when they're mislabled as ICDs
        pl.concat_str([
            pl.col('hcpcs_cpt').fill_null(''), pl.col('icd').fill_null('')
        ]).str.extract_all(cpt_template).alias('hcpcs_cpt'),
        
        # remove the extracted CPT codes before we look for ICD codes
        pl.col('icd').str.replace_all(cpt_pat, '').str.extract_all(icd_template).keep_name(),

        # these appear to be OK
        pl.col('ms_drg').str.extract_all(ms_drg_template).keep_name(),
        pl.col('apr_drg').str.extract_all(apr_drg_template).keep_name(),
        pl.col('apc').str.extract_all(apc_template).keep_name(),
        pl.col('rev_code').str.extract_all(rev_template).keep_name(),
    ])
    
    # all these need to be exploded separately since they have different array lengths
    .explode('hcpcs_cpt').explode('ms_drg').explode('icd').explode('apc').explode('apr_drg').explode('rev_code')
    
    .with_columns([
        *split_through('hcpcs_cpt', cpt_pat),
        *split_through('icd', icd_pat),
        *split_through('ms_drg', ms_drg_pat),
        *split_through('apr_drg', apr_drg_pat),
        *split_through('apc', apc_pat),
        *split_through('rev_code', rev_pat),
        pl.col('hcpcs_cpt').str.extract(f'-(\w{{2}})$').alias('modifiers'),
    ])    
    .with_columns(
        pl.coalesce(pl.col(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])).alias('thru')
    ).drop(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])
    .with_columns(
        *charge_cols()
    )
)

In [619]:
for c in lower_chunk.columns + upper_chunk.columns:
    if c not in upper_chunk.columns:
        dtype = lower_chunk[c].dtype
        upper_chunk = upper_chunk.with_columns(pl.lit(None).cast(dtype).alias(c))
        
    if c not in lower_chunk.columns:
        dtype = upper_chunk[c].dtype
        lower_chunk = lower_chunk.with_columns(pl.lit(None).cast(dtype).alias(c))
    
upper_chunk = upper_chunk.select(lower_chunk.columns)
df = pl.concat([upper_chunk, lower_chunk])

In [620]:
df = df.select([
    'description',
    'local_code',
    'code',
    'hcpcs_cpt',
    'modifiers',
    'rev_code',
    'ms_drg',
    'apr_drg',
    'apc',
    'icd',
    'thru',
    'payer_name',
    'payer_category',
    'standard_charge_',
    'standard_charge',
    'standard_charge_percent',
    'contracting_method',
    'additional_generic_notes',
    ])

In [621]:
df.filter(pl.col('description').is_null())

description,local_code,code,hcpcs_cpt,modifiers,rev_code,ms_drg,apr_drg,apc,icd,thru,payer_name,payer_category,standard_charge_,standard_charge,standard_charge_percent,contracting_method,additional_generic_notes
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str


In [622]:
df.write_csv('HCAtest.csv')

In [623]:
# def move_modifiers(df):
#     df = df.with_columns([
#         pl.col('hcpcs_cpt').str.slice(offset = 0, length = 5).keep_name(),
#         pl.col('hcpcs_cpt').str.slice(offset = 5, length = None).alias('modifiers'),
#     ])
#     return df

# def split_modifier_string(s):
#     chunks = [s[i:i+2] for i in range(0, len(s), 2)]
#     return "|".join(chunks)

# def split_modifier(df):
#     df = df.with_columns(
#         pl.col('modifiers').apply(split_modifier_string)
#     )
    
#     return df