### Sutter Health

We have quite a few of these in our database and I thought it might be instructive to pull some of them.

I'm gonna use the (unauthenticated) DoltHub API to get the files. I already manually checked that the files matched the right hospitals.

In [3]:
import requests

# Coordinates of the DoltHub database that is queried.
owner = 'dolthub'
repo = 'standard-charge-files'
branch = 'main'
standard_charge_file_indirect_url = 'https://www.sutterhealth.org/for-patients/healthcare-cost-transparency'

# The f-string `=` specifier renders as name='value', so the variable name
# doubles as the SQL column name and its repr as the quoted SQL literal.
query = f"""
SELECT ccn, doing_business_as_name, standard_charge_file_url 
FROM hospitals 
where {standard_charge_file_indirect_url=}
"""

# The URL template used to have two placeholders for three arguments, so
# `branch` was silently dropped. Include it so the query is pinned to the
# branch named above.
res = requests.get(
    "https://www.dolthub.com/api/v1alpha1/{}/{}/{}".format(owner, repo, branch),
    params={"q": query},
)
# Fail here on an HTTP error rather than later on a missing 'rows' key.
res.raise_for_status()

Let's put these in a dataframe:

In [4]:
import polars as pl
# Build a frame from the 'rows' entry of the API's JSON response
# (presumably one dict per result row, keyed by column name — TODO confirm).
files = pl.DataFrame(res.json()['rows'])

Sometimes the last_updated is in the first row of the file, in the second column.

In [8]:
def get_last_updated(df):
    """Return the name of the second column of ``df``.

    The caller uses this on a freshly read file: when the file's first line
    is a metadata line, that line becomes the header, so the "last updated"
    value shows up as the second column name.
    """
    column_names = df.columns
    return column_names[1]

Sometimes there's metadata above the real header. This looks for the first row that contains
the header columns, then renames the columns to match that row. Then we slice off the rows
that contain the metadata. We limit ourselves to searching the first 10 rows.

In [20]:
def find_header_row(df, max_rows=10):
    """Promote the row holding the real header to be the column names.

    Some files carry metadata rows above the actual header. While the
    expected header columns are not all present in ``df.columns``, the first
    data row is promoted to be the column names and removed from the data.

    Args:
        df: Frame as read from the file, possibly with metadata rows on top.
        max_rows: Maximum number of leading rows to promote before giving up.

    Returns:
        The frame with the real header as its column names and every row
        above the header removed.

    Raises:
        ValueError: If the header columns are not found within ``max_rows``
            rows, or the frame runs out of rows first.
    """
    header_row_cols = ['ID', 'SERVICE_SETTING', 'DESCRIPTION', 'CPT', 'NDC', 'REVENUE_CODE']

    promoted = 0
    while not all(c in df.columns for c in header_row_cols):
        # Bound the search instead of looping until the frame is exhausted
        # and failing with an opaque IndexError.
        if promoted >= max_rows or df.height == 0:
            raise ValueError(
                f'header columns {header_row_cols} not found '
                f'within the first {max_rows} rows'
            )
        # Only the first row is needed, so read just that row rather than
        # converting the whole frame to dicts on every pass.
        first_row = df.row(0)
        df = df.rename(dict(zip(df.columns, first_row)))
        df = df.slice(1)
        promoted += 1

    return df

Rename the cols to fit the unified schema.

In [21]:
# Source header name -> column name in the unified schema.
# Insertion order matters: the values are used, in order, as the id columns
# when the frame is melted.
rename_dict = dict(
    ID='local_code',
    SERVICE_SETTING='setting',
    DESCRIPTION='description',
    CPT='hcpcs_cpt',
    NDC='ndc',
    REVENUE_CODE='rev_code',
)

The MSDRG and APR-DRG codes are actually hidden in the "local_code" column, so we extract them.

In [22]:
def extract_codes():
    """Build expressions that pull DRG codes out of the ``local_code`` column.

    Returns:
        A ``(ms_drg, apr_drg)`` tuple of polars expressions. ``ms_drg``
        captures three digits following ``MSDRG-``; ``apr_drg`` captures
        ``ddd-d`` following ``APRDRG-``.
    """
    col = pl.col('local_code')

    # Raw strings: `\d` in a plain literal is an invalid escape sequence,
    # which Python warns about and newer versions reject more loudly.
    ms_drg = col.str.extract(r'MSDRG-(\d{3})').alias('ms_drg')
    apr_drg = col.str.extract(r'APRDRG-(\d{3}-\d{1})').alias('apr_drg')

    return ms_drg, apr_drg

In [23]:
def clean_stdchg():
    """Build an expression that strips currency formatting from ``standard_charge``.

    Returns:
        A polars expression that removes one ``$``, removes all commas, and
        strips the value, keeping the ``standard_charge`` column name.
    """
    return (
        pl.col('standard_charge')
        # Raw string: `\$` in a plain literal is an invalid escape sequence.
        .str.replace(r'\$', '')
        .str.replace_all(',', '')
        .str.strip()
        .keep_name()
    )

In [24]:
def payer_cat():
    """Build the ``payer_category`` expression from the ``payer`` column.

    The lower-cased payer value is tested, in order, for 'gross ', 'cash ',
    'minimum ' and 'maximum ' (each with a trailing space). The first hit
    yields 'gross', 'cash', 'min' or 'max'; anything else yields 'payer'.
    """
    payer_contains = pl.col('payer').str.to_lowercase().str.contains

    # (pattern to look for, category to assign), checked top to bottom.
    pattern_to_category = (
        ('gross ', 'gross'),
        ('cash ', 'cash'),
        ('minimum ', 'min'),
        ('maximum ', 'max'),
    )

    (first_pattern, first_category), *remaining = pattern_to_category
    branches = pl.when(payer_contains(first_pattern)).then(first_category)
    for pattern, category in remaining:
        branches = branches.when(payer_contains(pattern)).then(category)

    return branches.otherwise('payer').alias('payer_category')

In [25]:
def fill_null_pks(df):
    """Replace nulls with '' in every primary-key column present in ``df``.

    Columns of ``df`` that are not in the primary-key list are left as they
    are; primary-key names missing from ``df`` are ignored.
    """
    primary_keys = [
        'hospital_id', 'local_code', 'code', 'ms_drg',
        'apr_drg', 'hcpcs_cpt', 'modifiers', 'ndc',
        'rev_code', 'billing_class', 'setting', 'payer', 'plan',
    ]

    # Keep the frame's own column order for the selection.
    present_keys = [name for name in df.columns if name in primary_keys]
    blank_out_nulls = pl.col(present_keys).fill_null('').keep_name()

    return df.with_columns(blank_out_nulls)

In [52]:
def clean_hcpcs():
    """Build an expression that upper-cases ``hcpcs_cpt`` and nulls generic codes.

    There are generic HCPCS codes like 30XX0 that appear to stand for code
    ranges, but it's hard to tell. There are around 600 of them in the entire
    dataset. They don't fit the schema, so they are nulled. The code is
    repeated in the local_code column regardless, as is the description.

    Some HCPCS codes are lower-cased in the source, so every check runs on
    the upper-cased value.

    Returns:
        A polars expression aliased ``hcpcs_cpt``: null when the code has an
        'X' but neither starts nor ends with one, or contains 'XX'; otherwise
        the upper-cased code.
    """
    upper = pl.col('hcpcs_cpt').str.to_uppercase()

    # The start/end exclusion previously ran on the raw column, so a
    # lower-case code such as 'x1234' was wrongly treated as a "middle X".
    middle_x = upper.str.contains('X') & ~upper.str.contains('^X|X$')
    double_x = upper.str.contains('XX')

    return (
        pl.when(middle_x | double_x)
        .then(None)
        .otherwise(upper)
        .alias('hcpcs_cpt')
    )

In [53]:
def extract_ein(url):
    """Return the leading token of the URL's file name, dashed after two characters.

    The file name is everything after the last '/'; the token (assumed to be
    the EIN) is everything in it before the first '-'. A '-' is inserted
    after the token's first two characters.
    """
    file_name = url.rpartition('/')[2]
    token, _, _ = file_name.partition('-')
    return f'{token[:2]}-{token[2:]}'

In [54]:
def extract_filename(url):
    """Return the part of ``url`` after its last '/' (all of it if there is none)."""
    _, _, tail = url.rpartition('/')
    return tail

In [80]:
from tqdm import tqdm

# Page from which the standard-charge files are linked; written into every
# generated UPDATE statement below.
transparency_page = 'https://www.sutterhealth.org/for-patients/healthcare-cost-transparency'

# queries: one SQL UPDATE statement per processed hospital.
# dfs: one cleaned, long-format frame per processed hospital.
queries = []
dfs = []

for row in tqdm(files.rows()):
    
    # Positional unpacking: relies on the column order of `files`
    # (presumably ccn, name, file URL as selected in the query — TODO confirm).
    # `dba` is not used below.
    id, dba, stdchg_file_url = row
    ein = extract_ein(stdchg_file_url)
    file_name = extract_filename(stdchg_file_url)
    
    # this one's busted, so it is skipped entirely (no query, no frame)
    if id == '124001':
        continue
    
    # infer_schema_length = 0 disables schema inference, so every column is
    # read as a string; the literal text NULL is read as a null value.
    df = pl.read_csv(
        stdchg_file_url, 
        encoding = 'latin-1', 
        infer_schema_length = 0, 
        null_values = ['NULL']
    )
    
    # Convert mm/dd/yy to yy-mm-dd. If the value does not split into exactly
    # three '/'-separated parts, fall back to a fixed placeholder date.
    last_updated = get_last_updated(df)
    try:
        mm, dd, yy = last_updated.split('/')
        last_updated = f'{yy}-{mm}-{dd}'
    except ValueError:
        last_updated = '2021-01-01'
    
    # NOTE: `{name=}` renders as name='value' (variable name, '=', repr of the
    # value), so the local variable names double as the SQL column names.
    # Renaming any of them (including `id`, which shadows the builtin)
    # changes the generated SQL.
    query = f"""
    update hospital set 
    {ein=}, 
    {file_name=}, 
    {last_updated=}, 
    {stdchg_file_url=}, 
    {transparency_page=} 
    where {id=}
    """
    queries.append(query)

    # Unified-schema names of the descriptive columns; every other column is
    # melted into (payer, standard_charge) pairs.
    id_vars = list(rename_dict.values())

    df = (
        df
        # Promote the real header row and drop any rows above it.
        .pipe(find_header_row)
        .rename(rename_dict)
        # Wide -> long: one row per (id columns, payer column) combination.
        .melt(
            id_vars = id_vars,
            variable_name = 'payer',
            value_name = 'standard_charge',                       
        )
        .with_columns([
            *extract_codes(),
            clean_stdchg(),
            payer_cat(),
            pl.col('setting').str.to_lowercase(),
            pl.col('hcpcs_cpt').str.to_uppercase(),
            # Left-pad revenue codes with zeros to four characters.
            pl.col('rev_code').str.zfill(4),
            pl.lit(id).alias('hospital_id')
        ])
        # Separate step, so clean_hcpcs sees the already upper-cased codes.
        .with_columns(
            clean_hcpcs()
        )
        # Keep only rows that actually carry a charge, then drop duplicates.
        .filter(pl.col('standard_charge').is_not_null())
        .unique()
    )
    
    dfs.append(df)
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:47<00:00,  1.97s/it]


In [81]:
# Stack the per-hospital frames collected in `dfs` into one frame.
df = pl.concat(dfs)

In [82]:
# Blank out nulls in the primary-key columns of the combined frame.
df = fill_null_pks(df)

I had to correct some rows upon importing:

In [83]:
def correction_singlet():
    """Build an expression that rewrites the HCPCS value 'J90739' to '90739'.

    Background: the import rejected a hospital 050108 row (HEPATITIS B
    RECOMBINANT 20MCG/0.5ML SOSY) with
    "string 'J90739' is too large for column 'hcpcs_cpt'".
    A Google search showed that J90739 is a miscoding of 90739.

    Returns:
        A polars expression that keeps the ``hcpcs_cpt`` column name; every
        other value passes through unchanged.
    """
    hcpcs = pl.col('hcpcs_cpt')
    is_miscoded = hcpcs == 'J90739'

    return pl.when(is_miscoded).then('90739').otherwise(hcpcs).keep_name()

In [84]:
def correction_modifiers():
    """Build expressions that split a 7-character ``hcpcs_cpt`` into code and modifier.

    Background: the import rejected a hospital 050766 row (AEROBIC CULTURE -
    SEPARATE) with "string '8707059' is too large for column 'hcpcs_cpt'".
    That value should be code 87070 with modifier 59.

    Returns:
        A ``(modifiers, hcpcs_cpt)`` tuple of polars expressions. For
        7-character values, ``modifiers`` is everything from the sixth
        character on and ``hcpcs_cpt`` is the first five characters.
        Otherwise ``modifiers`` is '' and ``hcpcs_cpt`` is unchanged.
    """
    hcpcs = pl.col('hcpcs_cpt')
    has_fused_modifier = hcpcs.str.lengths() == 7

    modifiers = (
        pl.when(has_fused_modifier)
        .then(hcpcs.str.slice(5,))
        .otherwise('')
        .alias('modifiers')
    )
    code_only = (
        pl.when(has_fused_modifier)
        .then(hcpcs.str.slice(0, 5))
        .otherwise(hcpcs)
        .alias('hcpcs_cpt')
    )

    return modifiers, code_only

In [86]:
# Both corrections write to 'hcpcs_cpt', and all expressions in a single
# with_columns call are evaluated against the same input frame. In one call
# they would produce two outputs with the same name and could not build on
# each other. Apply them one after the other so the singlet fix is kept and
# the modifier split sees the already-corrected codes.
df = (
    df
    .with_columns(correction_singlet())
    .with_columns(list(correction_modifiers()))
)

In [88]:
# Write the combined, corrected frame to a CSV file in the working directory.
df.write_csv('sutter.csv')

This is what I used to modify the hospitals table:

In [63]:
# Write every collected statement, each terminated by ';' and a newline.
# Plain write mode is enough because nothing is read back, and the encoding
# is pinned so the file does not depend on the platform default.
with open('sutter.sql', 'w', encoding='utf-8') as f:
    f.writelines(q + ';\n' for q in queries)

# dolt sql < sutter.sql