In [1]:
import polars as pl

In [379]:
# Local copy of the hospital's standard-charges file.
path = '74-2730328_northeast-methodist-hospital_standardcharges.csv'

# Read from `path` (the original call referenced an undefined name, `url`).
# skip_rows = 1 drops the first line of the file; infer_schema_length = 0
# makes polars read every column as a string, so no value is re-typed on load.
df = pl.read_csv(path, skip_rows = 1, infer_schema_length = 0)

In [380]:
def get_main_df(df):
    """Return the leading charge table that sits above the payer sections.

    The first row whose 'HCPCS/CPT Code' cell equals 'Coding' marks the
    start of the payer sections. The returned frame stops before the row
    directly above that marker (the row `get_payer_dfs` reads the payer
    name from), and has its first three columns renamed.

    Args:
        df: raw frame with 'Procedure ID', 'HCPCS/CPT Code' and
            'Description' columns.

    Returns:
        The rows above the first payer section, with columns renamed to
        'internal_code', 'code_orig' and 'description'.

    Raises:
        ValueError: if no 'Coding' marker row is present.
    """
    # Locate the marker rows in one vectorised pass instead of a Python loop.
    marker_rows = (
        df.with_row_count()
        .filter(pl.col('HCPCS/CPT Code') == 'Coding')['row_nr']
    )
    if len(marker_rows) == 0:
        raise ValueError("no 'Coding' marker row found in column 'HCPCS/CPT Code'")

    # Stop one row above the first marker. Clamp at 0 so a marker on the very
    # first row yields an empty table rather than a negative slice bound.
    break_pt = max(marker_rows[0] - 1, 0)

    rename_dict = {
        'Procedure ID':'internal_code',
        'HCPCS/CPT Code':'code_orig',
        'Description':'description',
    }

    df = df.rename(rename_dict)

    return df[:break_pt]

In [381]:
def get_payer_dfs(df):
    """Split the payer-specific part of the raw frame into one frame per payer.

    Every row whose 'HCPCS/CPT Code' cell equals 'Coding' is treated as the
    header row of a payer section; the row directly above it supplies the
    payer name (read from the 'Procedure ID' column). Each section runs from
    that name row up to, but not including, the row just before the next
    section's name row.

    Args:
        df: raw frame with 'Procedure ID', 'HCPCS/CPT Code', 'Description',
            'Gross Charge' and 'Discounted Cash Price (Gross Charges)' columns.

    Returns:
        A list of frames, one per payer section, with columns 'description',
        'code_orig', 'rate', 'payer_orig' and 'payer_category'.
    """
    df = df.with_row_count()

    # Row numbers of the section header rows, found in one vectorised pass.
    marker_rows = df.filter(pl.col('HCPCS/CPT Code') == 'Coding')['row_nr'].to_list()

    # Each section starts one row above its header (the payer-name row).
    section_starts = [row_nr - 1 for row_nr in marker_rows]

    # (start, end) with `end` exclusive; a section ends one row before the
    # next section's name row.
    # NOTE(review): pairing consecutive starts means the section after the
    # last marker is never emitted -- confirm whether the final payer in the
    # file is being dropped unintentionally.
    bounds = [(start, nxt - 1) for start, nxt in zip(section_starts, section_starts[1:])]

    rename_dict = {
        'Procedure ID':'description',
        'HCPCS/CPT Code':'code_orig',
        'Description':'rate',
    }

    payer_dfs = []
    for start, end in bounds:
        # DataFrame.slice takes (offset, length), not (start, end). Passing
        # the end index as the length made each frame run on into the rows
        # of the following payers.
        payer_df = df.slice(start, end - start)
        payer_df = payer_df.rename(rename_dict)
        payer_name = payer_df['description'][0]

        # Skip the payer-name row and the 'Coding' header row.
        payer_df = payer_df[2:]
        payer_df = payer_df.drop(['Gross Charge', 'Discounted Cash Price (Gross Charges)'])

        payer_df = payer_df.with_columns([
            pl.lit(payer_name).alias('payer_orig'),
            pl.lit('payer').alias('payer_category'),
        ])

        payer_df = payer_df.drop('row_nr')

        payer_dfs.append(payer_df)

    return payer_dfs

In [382]:
df = df.pipe(remove_empty_strings)

In [383]:
def remove_empty_strings(df):
    df = df.with_columns(
        pl.when(pl.col(pl.Utf8).str.strip() == '').then(None).otherwise(pl.col(pl.Utf8)).keep_name()
    )
    df = df.with_columns(
        pl.col(pl.Utf8).str.strip().keep_name()
    )
    return df

In [384]:
def remove_leading_zeros(df):
    """Add a `code` column holding `code_orig` without its leading '0' characters.

    NOTE(review): this strips every leading zero, not just one; if a code can
    legitimately start with 0 it would be shortened here -- confirm against
    the data.
    """
    stripped_code = pl.col('code_orig').str.lstrip('0').alias('code')
    return df.with_columns(stripped_code)

In [385]:
def move_modifiers(df):
    """Split `code` into a 5-character code and a trailing `modifier`.

    The first five characters stay in `code`, everything after them goes to
    a new `modifier` column, and `code_prefix` is set to 'hcpcs_cpt' on
    every row.
    """
    base_code = pl.col('code').str.slice(offset = 0, length = 5).alias('code')
    trailing = pl.col('code').str.slice(offset = 5, length = None).alias('modifier')
    prefix = pl.lit('hcpcs_cpt').alias('code_prefix')

    return df.with_columns([base_code, trailing, prefix])

In [386]:
def split_modifier_string(s):
    """Break `s` into consecutive 2-character pieces joined by single spaces.

    A trailing odd character becomes its own piece; an empty string is
    returned unchanged.
    """
    pieces = []
    for start in range(0, len(s), 2):
        pieces.append(s[start:start + 2])
    return " ".join(pieces)

In [387]:
def split_modifier(df):
    """Rewrite `modifier` so its 2-character modifiers are space separated."""
    spaced_modifier = pl.col('modifier').apply(split_modifier_string)
    return df.with_columns(spaced_modifier)

In [388]:
# Build the main charge table from the cleaned raw frame.
main_df = (
    df.pipe(get_main_df)           # rows above the first payer section
    .pipe(remove_leading_zeros)    # code_orig -> code
    .pipe(move_modifiers)          # code -> code + modifier
    .pipe(remove_empty_strings)    # empty modifier strings become null
    .pipe(split_modifier)          # space-separate stacked modifiers
)

In [389]:
# Preview the first rows of the main charge table.
main_df.head()

internal_code,code_orig,description,Gross Charge,Discounted Cash Price (Gross Charges),code,modifier,code_prefix
str,str,str,str,str,str,str,str
"""211901""","""0J0248""","""REMDESIVIR 100...","""1950.00""","""1950.00""","""J0248""",,"""hcpcs_cpt"""
"""216434""","""0C9399""","""OLUMIANT 1 MG ...","""535.14""","""535.14""","""C9399""",,"""hcpcs_cpt"""
"""216436""","""0C9399""","""OLUMIANT 2 MG ...","""535.14""","""535.14""","""C9399""",,"""hcpcs_cpt"""
"""226271""","""0Q0249""","""TOCILIZUMAB 1M...","""21.08""","""21.08""","""Q0249""",,"""hcpcs_cpt"""
"""229888""",,"""HEATED HIGH FL...","""4.32""","""4.32""",,,"""hcpcs_cpt"""


In [390]:
# One frame per payer section found in the raw frame.
payer_dfs = df.pipe(get_payer_dfs)

In [391]:
# First payer frame, for ad-hoc inspection.
# NOTE: `pdf` is rebound to the concatenated payer table in a later cell.
pdf = payer_dfs[0]

In [392]:
def handle_pct_billed(df):
    """Move percentage-style rates out of `rate` into a new `rate_method` column.

    A rate matching one of '% of BC', '% of MCR', '% of FS' or '% of MCD'
    is copied to `rate_method` and nulled in `rate`; every other rate stays
    in `rate` with a null `rate_method`.
    """
    rate_method_str = '% of BC|% of MCR|% of FS|% of MCD'

    rate = pl.col('rate')
    is_pct_rate = rate.str.contains(rate_method_str)

    return df.with_columns([
        pl.when(is_pct_rate).then(rate).otherwise(None).alias('rate_method'),
        pl.when(is_pct_rate).then(None).otherwise(rate).alias('rate'),
    ])

In [393]:
def get_code_prefix(df):
    """Add a `code_prefix` column derived from the text of `code_orig`.

    Rows whose `code_orig` contains 'CPT/HCPC' get 'hcpcs_cpt', rows
    containing 'MS-DRG' get 'ms-drg', and all other rows get null.
    """
    code_pat = pl.col('code_orig').str.contains

    # Wrap the labels in pl.lit: a bare string passed to `.then()` is a
    # literal only on older polars releases and a column name on newer ones.
    df = df.with_columns(
        (pl
         .when(code_pat('CPT/HCPC')).then(pl.lit('hcpcs_cpt'))
         .when(code_pat('MS-DRG')).then(pl.lit('ms-drg'))
         .alias('code_prefix')
    ))

    return df

In [416]:
def explode_codes(df):
    """Explode `code_orig` into one row per code or code range.

    `code_orig` is split on spaces into a new `code` column, one token per
    row. A token is kept only if it looks like a code of the row's
    `code_prefix`: a CPT/HCPCS-shaped token for 'hcpcs_cpt', a token with
    three consecutive digits for 'ms-drg'. Surrounding ',' and ';' are then
    stripped from the kept tokens.

    Args:
        df: frame with `code_orig` and `code_prefix` columns.

    Returns:
        The exploded, filtered frame with an added `code` column.
    """
    df = df.with_columns(
        pl.col('code_orig').str.strip(';').str.split(' ').alias('code')
    ).explode('code')

    # Raw strings, so `\d` reaches the regex engine without relying on
    # Python passing an invalid escape sequence through unchanged.
    # The patterns are unanchored, so a range such as '35450-35476' already
    # matches on its first code; separate `X-X` alternatives are not needed.
    cpt_pat = r'\d{5}|[A-Z]\d{4}|\d{4}[A-Z]'
    drg_pat = r'[0-9]{3}'

    cpt_match = pl.col('code').str.contains(cpt_pat)
    drg_match = pl.col('code').str.contains(drg_pat)

    df = df.filter(
        (cpt_match & (pl.col('code_prefix') == 'hcpcs_cpt')) |
        (drg_match & (pl.col('code_prefix') == 'ms-drg'))
    )

    df = df.with_columns(pl.col('code').str.strip(',').str.strip(';'))

    return df

In [417]:
def check_range(s):
    """Tell whether `s` is a single code or a range that can be interpolated.

    A string without '-' passes. A string with exactly one '-' passes only
    when both endpoints have the same length, which rejects ranges like
    ABC-ABCD. Anything with more than one '-' fails.
    """
    endpoints = s.split('-')

    if len(endpoints) > 2:
        return False
    if len(endpoints) == 1:
        return True

    lo, hi = endpoints
    return len(lo) == len(hi)

In [420]:
# Combine all payer frames and normalise rates and codes.
pdf = (
    pl.concat(payer_dfs)
    .pipe(handle_pct_billed)    # split percentage-style rates into rate_method
    .pipe(get_code_prefix)      # label rows as hcpcs_cpt / ms-drg
    .pipe(explode_codes)        # one row per code or code range
    .filter(pl.col('code').apply(check_range))    # drop ranges that cannot be interpolated
)

In [421]:
# Display the combined payer table.
pdf

description,code_orig,rate,payer_orig,payer_category,rate_method,code_prefix,code
str,str,str,str,str,str,str,str
"""Angioplasty""","""CPT/HCPC 35450...","""$6,426.00""","""Access 2 WFD""","""payer""",,"""hcpcs_cpt""","""35450-35476"""
"""Angioplasty""","""CPT/HCPC 35450...","""$6,426.00""","""Access 2 WFD""","""payer""",,"""hcpcs_cpt""","""37285"""
"""Angioplasty""","""CPT/HCPC 35450...","""$6,426.00""","""Access 2 WFD""","""payer""",,"""hcpcs_cpt""","""92980-92997"""
"""Angioplasty""","""CPT/HCPC 35450...","""$6,426.00""","""Access 2 WFD""","""payer""",,"""hcpcs_cpt""","""93600-93662"""
"""Angioplasty""","""CPT/HCPC 35450...","""$6,426.00""","""Access 2 WFD""","""payer""",,"""hcpcs_cpt""","""C9600-C9604"""
"""Cardiac Cath""","""CPT/HCPC 93451...","""$2,940.00""","""Access 2 WFD""","""payer""",,"""hcpcs_cpt""","""93451-93462"""
"""Cardiac Cath""","""CPT/HCPC 93451...","""$2,940.00""","""Access 2 WFD""","""payer""",,"""hcpcs_cpt""","""93503-93533"""
"""Cyber Knife""","""CPT/HCPC 77372...","""$10,920.00""","""Access 2 WFD""","""payer""",,"""hcpcs_cpt""","""77372"""
"""Cyber Knife""","""CPT/HCPC 77373...","""$3,780.00""","""Access 2 WFD""","""payer""",,"""hcpcs_cpt""","""77373"""
"""ESWL Lithotrip...","""CPT/HCPC 50590...","""$3,311.00""","""Access 2 WFD""","""payer""",,"""hcpcs_cpt""","""50590"""


In [422]:
def interpint(lo, hi):
    """List every integer between two numeric strings, zero-padded.

    Args:
        lo, hi: digit-only strings; their order does not matter.

    Returns:
        The inclusive run of integers as strings, each left-padded with
        zeros to the length of `lo`.
    """
    width = len(lo)
    first, last = sorted((int(lo), int(hi)))
    return [str(n).zfill(width) for n in range(first, last + 1)]

def interp(s):
    """Expand a code or a 'lo-hi' code range into the individual codes.

    Supported forms:
      * a single code (no '-'): returned as a one-element list;
      * digits-digits, e.g. '35450-35452';
      * letter + digits on both ends with the same letter, e.g. 'C9600-C9604';
      * digits + letter on both ends with the same letter, e.g. '0001T-0003T'.

    Anything else returns an empty list instead of raising: more than one
    '-', an empty endpoint, a purely numeric endpoint paired with an
    alphanumeric one, differing letters, or a non-numeric tail.

    Args:
        s: the code or range string.

    Returns:
        A list of code strings; empty when the range cannot be interpolated.
    """
    parts = s.split('-')

    if len(parts) == 1:
        return [parts[0]]

    # Unpacking below needs exactly two endpoints.
    if len(parts) != 2:
        return []

    lo, hi = parts

    if lo.isdigit() and hi.isdigit():
        return interpint(lo, hi)

    # One numeric endpoint and one alphanumeric endpoint cannot be paired.
    if lo.isdigit() or hi.isdigit():
        return []

    # The slices lo[:1] and lo[-1:] are safe on an empty endpoint, where
    # plain indexing would raise IndexError.
    if lo[:1].isalpha() and lo[1:].isdigit():
        # Letter prefix: both ends must share it and `hi` must be numeric after it.
        if hi[:1] != lo[0] or not hi[1:].isdigit():
            return []
        alph = lo[0]
        return [alph + c for c in interpint(lo[1:], hi[1:])]

    if lo[-1:].isalpha() and lo[:-1].isdigit():
        # Letter suffix: both ends must share it and `hi` must be numeric before it.
        if hi[-1:] != lo[-1] or not hi[:-1].isdigit():
            return []
        alph = lo[-1]
        return [c + alph for c in interpint(lo[:-1], hi[:-1])]

    return []

In [423]:
def interpolate_codes(df):
    """Add an `interp` column with one row per individual code.

    Each `code` value is expanded with `interp` into a list of codes, and
    the frame is then exploded on that list.
    """
    expanded_codes = pl.col('code').apply(interp).alias('interp')
    return df.with_columns(expanded_codes).explode('interp')

In [429]:
def get_range_len(s):
    """Return how many individual codes `interp` produces for `s`."""
    expanded = interp(s)
    return len(expanded)

In [425]:
# Expand every code range in `pdf` into one row per individual code.
t = pdf.pipe(interpolate_codes)

In [436]:
# Show the 20 widest code ranges, to spot ranges that expand to thousands of codes.
# Deduplicate first so each (description, code) pair is expanded only once,
# and sort after unique() so tail() does not depend on the row order that
# unique() happens to return.
(
    pdf
    .select(['description', 'code'])
    .unique()
    .with_columns(pl.col('code').apply(get_range_len).alias('rangelen'))
    .sort('rangelen')
    .tail(20)
    .to_pandas()
)

Unnamed: 0,description,code,rangelen
0,Other Surgical Services,59026-62269,3244
1,Other Surgical Services,54251-59019,4769
2,Other Surgical Services,64531-69999,5469
3,Other Surgical Services,23351-29064,5714
4,Other Surgical Services,29581-36399,6819
5,Lab/Path/Transfusions,80047-89356,9310
6,Lab/Path/Transfusions,80047-89398,9352
7,OP Other,J0120-J9999,9880
8,CT,70000-79999,10000
9,MRI,70000-79999,10000
