In [1]:
from glob import glob

In [5]:
files = glob('csvs/*.csv')

In [6]:
files

['csvs/62-6010402_JMCGH_standardcharges.csv',
 'csvs/58-1884314_CGH_standardcharges.csv',
 'csvs/82-5179383_Dyersburg_standardcharges.csv',
 'csvs/82-5187448_Volunteer_standardcharges.csv',
 'csvs/62-1753289_MGH_standardcharges.csv',
 'csvs/62-1624171_BGH_standardcharges.csv']

In [7]:
def match_ccn(filename):
    ccns = [
        ('JMCGH', '440002'),
        ('CGH', '441316'),
        ('Dyersburg', '440072'),
        ('Volunteer', '440061'),
        ('MGH', '440060'),
        ('BGH', '441320'),
    ]
    for string, ccn in ccns:
        if string in filename:
            return ccn

In [13]:
data_rows = []
for file in files:
    filename = file.split('/')[1]
    data_rows.append(
        (
            file,
            filename.split('_')[0],
            match_ccn(filename),
            filename,
            'https://www.wth.org/wp-content/uploads/2022/11/' + filename,
        )
    )

In [42]:
import polars as pl

def pipeline(data_row):
    file, hospital_ein, hospital_ccn, filename, url = data_row
    df = pl.read_csv(file, encoding = 'latin-1', skip_rows = 2, infer_schema_length = 0, null_values = ["N/A"])
    
    df = df.rename({
        'LINE TYPE':'code_type', 
        'CHARGE CODE/ PACKAGE':'internal_code',
        'CHARGE DESCRIPTION':'description',
        'DRG':'ms-drg', 
        'APC':'apc', 
        'CPT®/HCPCS':'hcpcs_cpt', 
        'ALTERNATE CPT®/HCPCS':'hcpcs_cpt_2', 
        'REV CODE':'rev_code', 
        'MODIFIER':'modifier', 
        'NDC':'ndc'
    })
    payer_cols = df.columns[10:]
    
    df_rev  = df.filter(pl.col('hcpcs_cpt').is_null() & pl.col('hcpcs_cpt_2').is_null() & pl.col('ms-drg').is_null()).drop(['hcpcs_cpt', 'hcpcs_cpt_2', 'ms-drg'])
    df_cpt  = df.filter(pl.col('hcpcs_cpt').is_not_null()).drop(['hcpcs_cpt_2', 'ms-drg'])
    df_cpt2 = df.filter(pl.col('hcpcs_cpt_2').is_not_null()).drop(['hcpcs_cpt', 'ms-drg'])
    df_drg  = df.filter(pl.col('ms-drg').is_not_null()).drop(['hcpcs_cpt', 'hcpcs_cpt_2'])
    
    def melt_rates(df):
        id_vars = [c for c in df.columns if c not in payer_cols]
        value_vars = payer_cols
        variable_name = 'payer_orig'
        value_name = 'rate'
        return df.melt(
            id_vars = id_vars,
            value_vars = value_vars,
            variable_name = variable_name,
            value_name = value_name
        )
    
    df_rev = melt_rates(df_rev)
    df_cpt = melt_rates(df_cpt)
    df_cpt2 = melt_rates(df_cpt2)
    df_drg = melt_rates(df_drg)
    
    df_cpt = df_cpt.rename({'hcpcs_cpt':'code'})
    df_cpt2 = df_cpt2.rename({'hcpcs_cpt_2':'code'})
    df_drg = df_drg.rename({'ms-drg':'code'})
    df_rev = df_rev.with_column(pl.lit('na').alias('code'))
    
    df_cpt = df_cpt.with_columns(
    pl.lit('hcpcs_cpt').alias('code_prefix')
    )
    
    df_cpt2 = df_cpt2.with_columns(
        pl.lit('hcpcs_cpt').alias('code_prefix')
    )
    
    df_drg = df_drg.with_columns(
        pl.lit('ms-drg').alias('code_prefix')
    )
    
    df_rev = df_rev.with_columns(
        pl.lit('none').alias('code_prefix')
    )
    
    df_cpt2 = df_cpt2.select(df_cpt.columns)
    df_drg = df_drg.select(df_cpt.columns)
    df_rev = df_rev.select(df_cpt.columns)
    
    df = pl.concat([df_rev, df_cpt, df_cpt2, df_drg])
    
    df = df.filter(pl.col('rate').is_not_null())
    df = df.filter(
        (pl.col('rate') != "Paid by Report") &
        (pl.col('rate') != "Paid By Report") &
        (pl.col('rate') != "Not Billed to Insurance") &
        (pl.col('rate') != "Paid per CPT") &
        (pl.col('rate') != "Not billed to Insurance") &
        (pl.col('rate') != "Paid per DRG")
        
    )
    
    df = df.with_columns([
        
        (pl.when(pl.col('payer_orig').str.contains('GROSS CHARGE')).then('gross')
         .when(pl.col('payer_orig').str.contains('MINIMUM')).then('min')
         .when(pl.col('payer_orig').str.contains('MAXIMUM')).then('max')
         .when(pl.col('payer_orig').str.contains('CASH')).then('cash')
         .otherwise('payer')
        ).alias('payer_category'),
        
        (
            pl.when(pl.col('rate').str.contains('per day')).then('per day')
            .when(pl.col('rate').str.contains('Per Day')).then('per day')
            .otherwise(None)
        ).alias('rate_method'),
        
        pl.col('modifier').fill_null('na'),
        
        pl.col('ndc').fill_null('na'),
        
        pl.col('rev_code').str.zfill(4).fill_null('na'),
        
        pl.when(pl.col('payer_orig').str.contains('All Plans')).then('all plans').otherwise(None).alias('plan_name'),
        
        pl.when(pl.col('payer_orig').str.contains('All Plans')).then(pl.col('payer_orig').str.replace(' All Plans', '')).otherwise(None).alias('plan_name'),
        
        
        (
            pl.col('rate').str.strip('$')
            .str.replace(',', '')
            .str.replace(' per day', '')
            .str.replace(' Per Day', '')
        ).cast(float),
        
    ])
    
    df = df.with_columns([
        pl.lit(hospital_ccn).alias('hospital_ccn'),
        pl.lit(hospital_ein).alias('hospital_ein'),
        pl.lit(url).alias('url'),
        pl.lit('2022-09-30').alias('file_last_updated'),
        pl.lit(filename).alias('filename'),
    ])
    
    df.write_csv(f'{hospital_ccn}.csv')

In [43]:
for row in data_rows:
    pipeline(row)

In [44]:
df = pl.concat([pl.read_csv(file, infer_schema_length = 0) for file in glob('4*.csv')])



In [47]:
df.sample(10).to_pandas()

Unnamed: 0,code_type,internal_code,description,apc,code,rev_code,modifier,ndc,payer_orig,rate,code_prefix,payer_category,rate_method,plan_name,hospital_ccn,hospital_ein,url,file_last_updated,filename
0,CDM,957300,AMPHETAMINE CONFIRMATION,,82542,0301,na,na,UHC Community Plan Dual Complete DSNP All Plans,0.0,hcpcs_cpt,payer,,UHC Community Plan Dual Complete DSNP,440002,62-6010402,https://www.wth.org/wp-content/uploads/2022/11...,2022-09-30,62-6010402_JMCGH_standardcharges.csv
1,CPT,25310,TRANSPLANT FOREARM TENDON,5113.0,25310,0360,na,na,BCBS Network S All Plans,2109.0,hcpcs_cpt,payer,,BCBS Network S,441316,58-1884314,https://www.wth.org/wp-content/uploads/2022/11...,2022-09-30,58-1884314_CGH_standardcharges.csv
2,CPT,63047,REMOVE SPINE LAMINA 1 LMBR,5114.0,63047,0360,na,na,Ambetter of TN All Plans,8667.16,hcpcs_cpt,payer,,Ambetter of TN,440061,82-5187448,https://www.wth.org/wp-content/uploads/2022/11...,2022-09-30,82-5187448_Volunteer_standardcharges.csv
3,CPT,6635838,COLONOSCOPY SUBMUCOUS NJX,5312.0,45381,0750,na,na,UHC Community Plan TennCare All Plans,927.0,hcpcs_cpt,payer,,UHC Community Plan TennCare,440061,82-5187448,https://www.wth.org/wp-content/uploads/2022/11...,2022-09-30,82-5187448_Volunteer_standardcharges.csv
4,CDM,6743581,NAIL T2 ANKLE ARTHRODESIS RT 11X300MM 1819-1130S,,C1713,0278,na,na,HS Technology All Plans,0.0,hcpcs_cpt,payer,,HS Technology,440061,82-5187448,https://www.wth.org/wp-content/uploads/2022/11...,2022-09-30,82-5187448_Volunteer_standardcharges.csv
5,CDM,6951792,SCREW FA 17MM--3125317,,C1713,0278,na,na,UHC Community Plan Dual Complete DSNP All Plans,134.16,hcpcs_cpt,payer,,UHC Community Plan Dual Complete DSNP,441320,62-1624171,https://www.wth.org/wp-content/uploads/2022/11...,2022-09-30,62-1624171_BGH_standardcharges.csv
6,DRG,MS-DRG 139,SALIVARY GLAND PROCEDURES,,139,na,na,na,BCBS Network S All Plans,16387.92,ms-drg,payer,,BCBS Network S,440002,62-6010402,https://www.wth.org/wp-content/uploads/2022/11...,2022-09-30,62-6010402_JMCGH_standardcharges.csv
7,CDM,7504798-20672,acetaminophen 500 mg Tab,,A9270,0250,na,50580-0457-11,BCBS Network P All Plans,0.0,hcpcs_cpt,payer,,BCBS Network P,440072,82-5179383,https://www.wth.org/wp-content/uploads/2022/11...,2022-09-30,82-5179383_Dyersburg_standardcharges.csv
8,CDM,7509579-6239,FLUoxetine 20 mg oral capsule,,A9270,0250,na,00904-5785-61,DISCOUNTED CASH PRICE (UNINSURED),2.9,hcpcs_cpt,cash,,,441316,58-1884314,https://www.wth.org/wp-content/uploads/2022/11...,2022-09-30,58-1884314_CGH_standardcharges.csv
9,CDM,6336300,THORACENTESIS WO IMAGING,5181.0,32554,0360,na,na,Cigna HealthSpring Medicare Advantage All Plans,483.42,hcpcs_cpt,payer,,Cigna HealthSpring Medicare Advantage,440061,82-5187448,https://www.wth.org/wp-content/uploads/2022/11...,2022-09-30,82-5187448_Volunteer_standardcharges.csv


In [48]:
df.write_csv('alltenn.csv')

In [38]:
sorted(list(df['payer_orig'].unique()))

['AARP UHC Medicare Advantage All Plans',
 'Aetna All Plans',
 'Ambetter of TN All Plans',
 'AmeriVantage Medicare Advantage All Plans',
 'American Health Advantage CNSP All Plans',
 'Amerigroup DSNP All Plans',
 'Amerigroup Tenncare All Plans',
 'BCBS Blue Advantage All Plans',
 'BCBS Network P All Plans',
 'BCBS Network S All Plans',
 'Bluecare Plus DSNP All Plans',
 'Bluecare TennCare All Plans',
 'Cigna HealthSpring Medicare Advantage All Plans',
 'Cigna Local Plus All Plans',
 'Cigna Open Access All Plans',
 'DE-IDENTIFIED MAXIMUM NEGOTIATED RATE',
 'DE-IDENTIFIED MINIMUM NEGOTIATED RATE',
 'DISCOUNTED CASH PRICE (INSURED, SERVICE NOT COVERED BY INSURANCE)',
 'DISCOUNTED CASH PRICE (UNINSURED)',
 'GROSS CHARGE',
 'HS Technology All Plans',
 'Humana ChoiceCare All Plans',
 'Humana Medicare Advantage All Plans',
 'UHC Community Plan Dual Complete DSNP All Plans',
 'UHC Community Plan TennCare All Plans',
 'UHC-Optum VA-CCN All Plans',
 'United Healthcare All Plans']