In [1]:
import polars as pl

In [2]:
url = 'https://core.secure.ehc.com/\
src/util/detail-price-list/\
050631189_hca-houston-pearland_standardcharges.csv'
df = pl.read_csv(url, skip_rows = 1, infer_schema_length = 0)

In [3]:
def get_upper_chunk(df):
    """
    Get just the upper part of the dataframe
    with the HCPCS/CPT codes
    """
    
    df = df.with_row_count()

    for row in df.iter_rows(named = True):
        if not row['Description']:
            end_row = row['row_nr']
            break
    
    df = df.drop('row_nr').slice(0, end_row)
    
    return df

In [4]:
def get_lower_chunks(df):
    """
    Return the lower section(s) as an array
    of dataframe chunks
    """
    
    df = df.with_row_count()
    
    slice_idxs = []
    for row in df.iter_rows():
        if row[1] == 'Service Description':
            slice_idxs.append(row[0] - 1)
            
    slices = [(s, t-1) for s, t in zip(slice_idxs, slice_idxs[1:])]
    
    chunks = []
    for slice_ in slices:
        chunk = df.slice(slice_[0], slice_[1] - slice_[0])
        chunk = chunk.drop('row_nr')
        chunks.append(chunk)
        
    return chunks

In [5]:
upper_chunk = get_upper_chunk(df)
lower_chunks = get_lower_chunks(df)

In [6]:
upper_chunk.sample(5)

Procedure ID,HCPCS/CPT Code,Description,Gross Charge,Discounted Cash Price (Gross Charges)
str,str,str,str,str
"""981056""","""073610LT …","""XR ANKLE 3 + V…","""1434.44""","""1434.44"""
"""602016""","""0C1817 …","""OCLDR HLX ASD …","""1083.00""","""1083.00"""
"""102248""",""" …","""DEVICE WNDCLS …","""1052.00""","""1052.00"""
"""601252""","""0C1725 …","""CATH BLN 6X200…","""1934.00""","""1934.00"""
"""861180""",""" …","""CALCITONIN NAS…","""307.38""","""307.38"""


### Handling the upper chunk

In [7]:
upper_chunk = (
    upper_chunk.rename({
        'Procedure ID':'local_code',
        'HCPCS/CPT Code':'hcpcs_cpt',
        'Description':'description'})
    .with_columns(
        # remove extra spaces and leading zeros
        pl.col('hcpcs_cpt').str.strip().str.replace('^0', '')
    )
    .with_columns(
        pl.when(pl.col('hcpcs_cpt').str.lengths() == 0).then(None)
        .otherwise(pl.col('hcpcs_cpt')).keep_name()
    )
    .melt(['local_code', 'hcpcs_cpt', 'description'], 
        value_name = 'standard_charge', 
        variable_name = 'payer_name'
    )
    .with_columns(
        pl.when(pl.col('payer_name') == 'Gross Charge').then('gross')
        .when(pl.col('payer_name') == 'Discounted Cash Price (Gross Charges)').then('cash')
        .alias('payer_category')
    )
)

In [8]:
upper_chunk.sample(10)

local_code,hcpcs_cpt,description,payer_name,standard_charge,payer_category
str,str,str,str,str,str
"""621686""",,"""CATH ANGIO 5FR…","""Discounted Cas…","""1507.00""","""cash"""
"""800695""","""C1713""","""SCREW BNE 3.5X…","""Gross Charge""","""490.00""","""gross"""
"""259005""","""70545""","""MRA HD W/CONTR…","""Discounted Cas…","""10120.65""","""cash"""
"""622594""",,"""DRSG WND 5LYR …","""Gross Charge""","""191.00""","""gross"""
"""100583""","""96376""","""IVP EA ADD SEQ…","""Discounted Cas…","""815.32""","""cash"""
"""5682""","""Q9967""","""LOCM 300-399 I…","""Discounted Cas…","""26.16""","""cash"""
"""622673""","""C1874""","""STENT COR 3.00…","""Gross Charge""","""7040.00""","""gross"""
"""903339""","""C1725""","""CATH BLDL 5X15…","""Discounted Cas…","""1961.00""","""cash"""
"""622392""",,"""NEEDLE IO 45MM…","""Discounted Cas…","""1354.00""","""cash"""
"""153""",,"""CATH ANGIO .04…","""Discounted Cas…","""883.00""","""cash"""


In [9]:
upper_chunk = upper_chunk.with_columns(
    pl.col('hcpcs_cpt').str.slice(0,5).keep_name(),
    pl.when(pl.col('hcpcs_cpt').str.lengths() > 6).then(pl.col('hcpcs_cpt').str.slice(5,)).alias('modifiers')
)

In [10]:
upper_chunk.sample(10)

local_code,hcpcs_cpt,description,payer_name,standard_charge,payer_category,modifiers
str,str,str,str,str,str,str
"""102067""",,"""SLING PELVIC O…","""Gross Charge""","""1425.00""","""gross""",
"""620082""",,"""WIRE FX 2.5MM …","""Discounted Cas…","""1052.00""","""cash""",
"""300345""","""C1713""","""PLATE BN ORTH …","""Discounted Cas…","""2945.00""","""cash""",
"""900009""",,"""ENDO ROOM ADD …","""Discounted Cas…","""6596.68""","""cash""",
"""200461""","""C1876""","""STENT VASC 6X4…","""Discounted Cas…","""6950.00""","""cash""",
"""622592""",,"""DRSG WND 5LYR …","""Discounted Cas…","""112.00""","""cash""",
"""836""",,"""CTH SET RADLXA…","""Gross Charge""","""274.00""","""gross""",
"""101723""",,"""SYS SUTLASSO M…","""Gross Charge""","""1893.00""","""gross""",
"""61047""","""C1713""","""SCREW BN 3.5MM…","""Gross Charge""","""264.00""","""gross""",
"""621292""","""C2617""","""STNT URET SIL …","""Discounted Cas…","""1472.00""","""cash""",


### Handling the lower chunks

In [11]:
new_lower_chunks = []
for chunk in lower_chunks:
    chunk = (
        chunk
        .drop(['Gross Charge', 'Discounted Cash Price (Gross Charges)'])
        # the payer name is always the first field in the Procedure ID col
        .with_columns([
            pl.lit(chunk['Procedure ID'][0]).alias('payer_name'),
            pl.lit('payer').alias('payer_category')
        ])
        .rename({
            'Procedure ID':'description',
            'HCPCS/CPT Code':'code',
            'Description':'standard_charge_'
        })
        .slice(2,)
    )
    new_lower_chunks.append(chunk)

In [12]:
lower_chunk = pl.concat(new_lower_chunks)

In [13]:
lower_chunk.sample(10)

description,code,standard_charge_,payer_name,payer_category
str,str,str,str,str
"""Neonate""",,"""$847.00 ""","""Humana""","""payer"""
"""Orthopedics""","""MS-DRG 504""","""$17,015.67 ""","""United""","""payer"""
"""Drug""",,"""304% of FS""","""BCBS HMO""","""payer"""
"""General Surger…","""MS-DRG 927-929…","""35.8% of BC""","""BCBS Blue Adva…","""payer"""
"""OP Other""","""CPT/HCPC 80000…","""216% of FS""","""Healthcare Hig…","""payer"""
"""Observation """,,"""19% of BC""","""Molina MCD""","""payer"""
"""Neonate""","""MS-DRG 795""","""$775.00 ""","""United""","""payer"""
"""Obstetrics""",,"""$3,824.00 ""","""Superior HP MC…","""payer"""
"""Observation """,,"""$183.00 ""","""Kelsey Seybold…","""payer"""
"""Cardiology""","""MS-DRG 246""","""$32,836.33 ""","""United""","""payer"""


In [14]:
def charge_cols():
    
    standard_charge = pl.col('standard_charge_')
    
    percent_bc  = standard_charge.str.contains('% of BC|% of Billable Gross Charges')
    percent_mcr = standard_charge.str.contains('% of MCR')
    percent_fs  = standard_charge.str.contains('% of FS')
    percent_mcd = standard_charge.str.contains('% of MCR')
    
    description    = pl.col('description')
    
    per_diem    = description.str.to_lowercase().str.contains('per diem')
    
    standard_charge = (
        pl.when(percent_bc|percent_mcr|percent_fs|percent_mcd).then(None)
        .otherwise(standard_charge.str.replace('\$', '').str.replace_all(',', '').str.strip())
    ).alias('standard_charge')
    
    standard_charge_percent = (
        pl.when(percent_bc|percent_mcr|percent_fs|percent_mcd)
        .then(standard_charge.str.extract('(\d+(?:\.\d+)?)%'))
    ).alias('standard_charge_percent')
    
    contracting_method = (
        pl.when(percent_bc).then('percent of total billed charge')
        .when(percent_mcr|percent_fs|percent_mcd).then('other')
        .when(per_diem).then('per diem')
    ).alias('contracting_method')
    
    additional_generic_notes = (
        pl.when(percent_mcr).then('billed as % of MCR')
        .when(percent_fs).then('billed as % of FS')
        .when(percent_mcd).then('billed as % of MCD')
    ).alias('additional_generic_notes')
    
    return standard_charge, standard_charge_percent, contracting_method, additional_generic_notes

In [15]:
lower_chunk.with_columns(*charge_cols()).to_pandas()

Unnamed: 0,description,code,standard_charge_,payer_name,payer_category,standard_charge,standard_charge_percent,contracting_method,additional_generic_notes
0,Additional days,"MS-DRG 768, 796-798, 805-807","$3,580.38",Aetna,payer,3580.38,,,
1,Additional days,MS-DRG 783-788,"$3,580.38",Aetna,payer,3580.38,,,
2,Angioplasty,CPT/HCPC 33967,"$14,536.92",Aetna,payer,14536.92,,,
3,Angioplasty,"CPT/HCPC 92920-92921, 92924-92925, 92928-92929...","$21,113.92",Aetna,payer,21113.92,,,
4,Cardiac Cath,"CPT/HCPC 0293T, 0294T, 0408T, 0410T, 0411T, 04...","$12,188.46",Aetna,payer,12188.46,,,
...,...,...,...,...,...,...,...,...,...
1351,ER,,$695.00,WellCare TX GCD,payer,695.00,,,
1352,Inpatient Services Add on Per diem,,$650.00,WellCare TX GCD,payer,650.00,,per diem,
1353,Observation,,"$4,150.00",WellCare TX GCD,payer,4150.00,,,
1354,Other Outpatient,,103% of MCR,WellCare TX GCD,payer,,,other,billed as % of MCR


### Extracting with multiple identifiers
Testing to see if any rows contain multiple identifiers

In [16]:
identifiers = ['CPT/HCPC', 'REV', 'MS-DRG', 'ICD 9/10', 'APC', 'APR-DRG']
lower_chunk.with_columns(
    pl.col('code').str.to_uppercase().apply(lambda x: sum([y in x for y in identifiers])).alias('id_ct')
).sort('id_ct').to_pandas()

Unnamed: 0,description,code,standard_charge_,payer_name,payer_category,id_ct
0,Detoxification,,"$1,889.31",Aetna,payer,
1,Grouper 1,,"$2,011.23",Aetna,payer,
2,Grouper 2,,"$2,187.31",Aetna,payer,
3,Grouper 3,,"$2,741.00",Aetna,payer,
4,Grouper 4,,"$2,976.85",Aetna,payer,
...,...,...,...,...,...,...
1351,Obstetrics,"MS-DRG 817-819, 831-833; ICD 9/10 04700, 04701...","$1,793.00",United,payer,2.0
1352,Transplant,"MS-DRG 001; ICD 9/10 02HA0QZ, 02HA0RZ, 02HA3QZ...","$151,648.67",United,payer,2.0
1353,Transplant,"MS-DRG 002; ICD 9/10 02HA0QZ, 02HA0RZ, 02HA3QZ...","$151,648.67",United,payer,2.0
1354,Urology,"MS-DRG 694; ICD 9/10 0TF3XZZ, 0TF4XZZ, 0TF6XZZ...","$8,676.00",United,payer,2.0


### Cleaning up the code column

In [17]:
lower_chunk.with_columns(
    # some codes have junk like \n in them
    pl.col('code').str.replace_all('\n', '').str.split(';')
).explode('code').with_columns(
    pl.col('code').str.to_uppercase().apply(lambda row: sum([id_ in row for id_ in identifiers])).alias('id_ct')
).sort('id_ct').to_pandas()

Unnamed: 0,description,code,standard_charge_,payer_name,payer_category,id_ct
0,Detoxification,,"$1,889.31",Aetna,payer,
1,Grouper 1,,"$2,011.23",Aetna,payer,
2,Grouper 2,,"$2,187.31",Aetna,payer,
3,Grouper 3,,"$2,741.00",Aetna,payer,
4,Grouper 4,,"$2,976.85",Aetna,payer,
...,...,...,...,...,...,...
1359,Vascular Surgery,MS-DRG 035,"$23,499.67",United,payer,1.0
1360,Vascular Surgery,MS-DRG 254,"$21,713.67",United,payer,1.0
1361,Vascular Surgery,MS-DRG 263,"$41,552.33",United,payer,1.0
1362,Vascular Surgery,MS-DRG 036,"$23,499.67",United,payer,1.0


In [18]:
lower_chunk.filter(pl.col('code').str.contains('- ')).to_pandas()['code'][0]

'CPT/HCPC 0293T, 0294T, 0408T, 0410T, 0411T, 0413T, 0415T, 33340, 33477, C9741, 93451- 93462, 93503-93505, 93530-93533, 93590, 93591, 93592, G0448'

In [19]:
lower_chunk.filter(pl.col('code').str.contains('- ')).to_pandas()['code'][1]

"CPT/HCPC Cigna 2020 Grouper - lam and Lap Chole listed in Cigna's grouper schedule assigned to Grouper 9.  Groupers 79 and 99 removed from the grouper schedule."

We'll want to replace things like `93451- 93462` with the appropriate range. A range is when you have two values with the same length connected by either a dash or a dash + space.

We also have rows like

```
'ICD 9/10 27125, 27130, 27132, 27134, 27137, 27138, 27445, 27446, 27447, 27486, 27487'
```
that are totally mislabeled.

All rows where `code` is not null have an identifier:

In [20]:
lower_chunk.with_columns(
    pl.col('code').str.split(';')
).explode('code').with_columns(
    pl.col('code').str.to_uppercase().apply(lambda row: sum([id_ in row for id_ in identifiers])).alias('id_ct')
).filter(pl.col('id_ct').is_null()).filter(pl.col('code').is_not_null())

description,code,standard_charge_,payer_name,payer_category,id_ct
str,str,str,str,str,i64


In [21]:
lower_chunk = (
    lower_chunk
    .with_columns(pl.col('code').str.replace_all('\n', '').str.split(';'))
    .explode('code')
    .with_columns(pl.col('code').str.to_uppercase().apply(lambda row: sum([id_ in row for id_ in identifiers])).alias('id_ct'))
)

### Putting codes in their proper columns

In [22]:
col_map = {
    'CPT/HCPC|HCPC Codes':'hcpcs_cpt',
    'REV':'rev_code',
    'MS-DRG':'ms_drg',
    'ICD 9/10':'icd',
    'APC':'apc',
    'APR-DRG':'apr_drg',
    'CMG':'cmg', # for good measure
}

In [23]:
for k, v in col_map.items():
    lower_chunk = lower_chunk.with_columns(
        pl.when(pl.col('code').str.contains(k)).then(pl.col('code').str.replace_all(k, '').str.strip()).alias(v)
    )

In [24]:
lower_chunk.sample(10)

description,code,standard_charge_,payer_name,payer_category,id_ct,hcpcs_cpt,rev_code,ms_drg,icd,apc,apr_drg,cmg
str,str,str,str,str,i64,str,str,str,str,str,str,str
"""Cardiovascular…","""MS-DRG 232""","""$104,009.00 ""","""Cigna""","""payer""",1.0,,,"""232""",,,,
"""Cardiovascular…","""MS-DRG 274""","""$18,951.00 ""","""Coventry Natio…","""payer""",1.0,,,"""274""",,,,
"""Medicine""","""CPT/HCPC 95961…","""$720.16 ""","""Cigna""","""payer""",1.0,"""95961""",,,,,,
"""Vascular Surge…","""MS-DRG 036""","""$23,499.67 ""","""United""","""payer""",1.0,,,"""036""",,,,
"""Observation """,,"""$5,000.00 ""","""American Healt…","""payer""",,,,,,,,
"""Cardiology""","""MS-DRG 223""","""$81,911.33 ""","""United""","""payer""",1.0,,,"""223""",,,,
"""Medicine""","""CPT/HCPC 95972…","""$512.43 ""","""Cigna""","""payer""",1.0,"""95972""",,,,,,
"""Cardiovascular…","""MS-DRG 233""","""$52,529.33 ""","""United""","""payer""",1.0,,,"""233""",,,,
"""Medicine""","""CPT/HCPC 92977…","""$548.93 ""","""Cigna""","""payer""",1.0,"""92977""",,,,,,
"""Other Outpatie…",,"""106% of MCR""","""Global Health""","""payer""",,,,,,,,


In [25]:
start = '(^|\\b)'
end = '($|\\b)'

def cpt_capture_template(pat):
    range_grp = f'{start}{pat}-\s?{pat}{end}'
    modifier_grp = f'{start}{pat}-\w{{2}}{end}'
    singlet_grp = f'{start}{pat}{end}'
    return f'(?:({range_grp})|({modifier_grp})|({singlet_grp}))'

def other_capture_template(pat):
    range_grp = f'{start}{pat}-\s?{pat}{end}'
    singlet_grp = f'{start}{pat}{end}'
    return f'(?:({range_grp})|({singlet_grp}))'

In [26]:
cpt_pats = ['[A-Z]\d{4}',  #A1234
            '\d{4}[A-Z]',  #1234A
            '\d{5}',       #12345
]

icd_pats = [
    '\w{7}',            # any 7 char sequence TODO
    '\d{3}\.?\d{0,2}',  # ICD9: 123, 123.1, 123.12
    'E\d{3}\.?\d?',     # ICD9: E123, E123.1
    'V\d{2}\.?\d{0,2}', # ICD9: V12, V12.123
]

cpt_pat = '|'.join(cpt_pats)
icd_pat = '|'.join(icd_pats)
ms_drg_pat = '\d{3}x?'
apr_drg_pat = '\d{3}(\d|x)?'
apc_pat = '\d{2,4}'
rev_pat = '\d{3,4}'

cpt_template = '|'.join(cpt_capture_template(pat) for pat in cpt_pats)
icd_template = '|'.join(other_capture_template(pat) for pat in icd_pats)
ms_drg_template = other_capture_template(ms_drg_pat)
apr_drg_template = other_capture_template(apr_drg_pat)
apc_template = other_capture_template(apc_pat)
rev_template = other_capture_template(rev_pat)

In [27]:
def split_through(colname, pat):
    return (
        pl.col(colname).str.extract(f'-({pat})$').alias(f'thru_{colname}'),
        pl.col(colname).str.extract(f'^({pat})-?').alias(colname)
    )

In [28]:
(
    lower_chunk.with_columns([
    
        # we need to string capture the CPT codes when they're mislabled as ICDs
        pl.concat_str([
            pl.col('hcpcs_cpt').fill_null(''), pl.col('icd').fill_null('')
        ]).str.extract_all(cpt_template).alias('hcpcs_cpt'),
        
        # remove the extracted CPT codes before we look for ICD codes
        pl.col('icd').str.replace_all(cpt_pat, '').str.extract_all(icd_template).keep_name(),

        # these appear to be OK
        pl.col('ms_drg').str.extract_all(ms_drg_template).keep_name(),
        pl.col('apr_drg').str.extract_all(apr_drg_template).keep_name(),
        pl.col('apc').str.extract_all(apc_template).keep_name(),
        pl.col('rev_code').str.extract_all(rev_template).keep_name(),
    ])
    
    # all these need to be exploded separately since they have different array lengths
    .explode('hcpcs_cpt').explode('ms_drg').explode('icd').explode('apc').explode('apr_drg').explode('rev_code')
    
    .with_columns([
        *split_through('hcpcs_cpt', cpt_pat),
        *split_through('icd', icd_pat),
        *split_through('ms_drg', ms_drg_pat),
        *split_through('apr_drg', apr_drg_pat),
        *split_through('apc', apc_pat),
        *split_through('rev_code', rev_pat),
        pl.col('hcpcs_cpt').str.extract(f'-(\w{{2}})$').alias('modifiers'),
    ])    
    .with_columns(
        pl.coalesce(pl.col(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])).alias('thru')
    ).drop(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])

).sample(20)

description,code,standard_charge_,payer_name,payer_category,id_ct,hcpcs_cpt,rev_code,ms_drg,icd,apc,apr_drg,cmg,modifiers,thru
str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str
"""Radiology""","""CPT/HCPC 0348T…","""952.58% of FS""","""Aetna""","""payer""",1.0,"""0349T""",,,,,,,,
"""Medicine""","""CPT/HCPC 93304…","""$423.03 ""","""Cigna""","""payer""",1.0,"""93304""",,,,,,,,
"""ER""","""CPT/HCPC 99281…","""$2,098.26 ""","""Aetna""","""payer""",1.0,"""99283""",,,,,,,,
"""Medicine""","""CPT/HCPC 92586…","""$314.76 ""","""Cigna""","""payer""",1.0,"""92586""",,,,,,,,
"""Cardiac Cath""","""CPT/HCPC 0293T…","""$12,188.46 ""","""Aetna""","""payer""",1.0,"""C9741""",,,,,,,,
"""Grouper 2""",,"""$4,401.00 ""","""Humana""","""payer""",,,,,,,,,,
"""Hemic/Lymph Sy…","""CPT/HCPC 0312T…","""$5,034.85 ""","""Aetna""","""payer""",1.0,"""58573""",,,,,,,,
"""Lab/Path/Trans…","""CPT/HCPC D0416…","""648.19% of FS""","""Aetna""","""payer""",1.0,"""0001M""",,,,,,,,
"""Orthopedic""","""CPT/HCPC 27702…","""125% of MCR""","""Devoted Health…","""payer""",1.0,"""27125""",,,,,,,,
"""Medicine""","""CPT/HCPC 94250…","""$164.92 ""","""Cigna""","""payer""",1.0,"""94250""",,,,,,,,


### Putting it all together

In [29]:
lower_chunk = (
  lower_chunk.with_columns([
    
        # we need to string capture the CPT codes when they're mislabled as ICDs
        pl.concat_str([
            pl.col('hcpcs_cpt').fill_null(''), pl.col('icd').fill_null('')
        ]).str.extract_all(cpt_template).alias('hcpcs_cpt'),
        
        # remove the extracted CPT codes before we look for ICD codes
        pl.col('icd').str.replace_all(cpt_pat, '').str.extract_all(icd_template).keep_name(),

        # these appear to be OK
        pl.col('ms_drg').str.extract_all(ms_drg_template).keep_name(),
        pl.col('apr_drg').str.extract_all(apr_drg_template).keep_name(),
        pl.col('apc').str.extract_all(apc_template).keep_name(),
        pl.col('rev_code').str.extract_all(rev_template).keep_name(),
    ])
    
    # all these need to be exploded separately since they have different array lengths
    .explode('hcpcs_cpt').explode('ms_drg').explode('icd').explode('apc').explode('apr_drg').explode('rev_code')
    
    .with_columns([
        *split_through('hcpcs_cpt', cpt_pat),
        *split_through('icd', icd_pat),
        *split_through('ms_drg', ms_drg_pat),
        *split_through('apr_drg', apr_drg_pat),
        *split_through('apc', apc_pat),
        *split_through('rev_code', rev_pat),
        pl.col('hcpcs_cpt').str.extract(f'-(\w{{2}})$').alias('modifiers'),
    ])    
    .with_columns(
        pl.coalesce(pl.col(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])).alias('thru')
    ).drop(['thru_hcpcs_cpt', 'thru_rev_code', 'thru_icd', 'thru_ms_drg', 'thru_apr_drg', 'thru_apc'])
    .with_columns(
        *charge_cols()
    )
)

In [30]:
for c in lower_chunk.columns:
    if c not in upper_chunk.columns:
        dtype = lower_chunk[c].dtype
        upper_chunk = upper_chunk.with_columns(pl.lit(None).cast(dtype).alias(c))

upper_chunk = upper_chunk.select(lower_chunk.columns)
df = pl.concat([upper_chunk, lower_chunk])

In [35]:
df = df.select([
    'description',
    'code',
    'hcpcs_cpt',
    'modifiers',
    'rev_code',
    'ms_drg',
    'apr_drg',
    'apc',
    'icd',
    'thru',
    'payer_name',
    'payer_category',
    'standard_charge_',
    'standard_charge',
    'standard_charge_percent',
    'contracting_method',
    'additional_generic_notes',
    ])

In [36]:
df.write_csv('HCAtest.csv')

In [None]:
# def move_modifiers(df):
#     df = df.with_columns([
#         pl.col('hcpcs_cpt').str.slice(offset = 0, length = 5).keep_name(),
#         pl.col('hcpcs_cpt').str.slice(offset = 5, length = None).alias('modifiers'),
#     ])
#     return df

# def split_modifier_string(s):
#     chunks = [s[i:i+2] for i in range(0, len(s), 2)]
#     return "|".join(chunks)

# def split_modifier(df):
#     df = df.with_columns(
#         pl.col('modifiers').apply(split_modifier_string)
#     )
    
#     return df