In [1]:
import polars as pl

### Expressions for defining columns

In [2]:
def payer_category():
    """Build the expression that classifies each ``payer_name`` value.

    The four summary columns of the charge sheet ('Gross_Charge',
    'Cash_Charge', 'Min_Negotiated_Rate', 'Max_Negotiated_Rate') map to the
    labels 'gross', 'cash', 'min' and 'max'. Every other ``payer_name`` is
    labelled 'payer'.

    Returns:
        A polars expression aliased to ``payer_category``.
    """
    payer_name = pl.col('payer_name')

    # The labels are wrapped in pl.lit() so they are always string literals:
    # newer polars releases parse a bare string given to .then()/.otherwise()
    # as a column name, while pl.lit() means the same thing in every release.
    return (
        pl.when(payer_name == 'Gross_Charge').then(pl.lit('gross'))
        .when(payer_name == 'Cash_Charge').then(pl.lit('cash'))
        .when(payer_name == 'Min_Negotiated_Rate').then(pl.lit('min'))
        .when(payer_name == 'Max_Negotiated_Rate').then(pl.lit('max'))
        .otherwise(pl.lit('payer'))
        .alias('payer_category')
    )

def hcpcs_cpt_and_drg():
    """Build the expressions that split ``code`` according to ``line_type``.

    Returns:
        A 2-tuple of polars expressions:

        * ``hcpcs_cpt`` -- ``code`` on '2-CPT' lines, null on all other lines.
        * ``ms_drg`` -- ``code`` left-padded with zeros to 3 characters on
          '3-DRG' lines, null on all other lines.
    """
    line_type = pl.col('line_type')
    code = pl.col('code')

    # The null fallback is spelled out; it is what a when/then without an
    # otherwise branch produces.
    cpt_expr = pl.when(line_type == '2-CPT').then(code).otherwise(None)
    drg_expr = pl.when(line_type == '3-DRG').then(code).otherwise(None).str.zfill(3)

    return cpt_expr.alias('hcpcs_cpt'), drg_expr.alias('ms_drg')

def all_charges():
    """Build the expressions that separate amounts from percent-of-gross text.

    Some ``standard_charge`` cells hold text such as "85% of gross charges"
    instead of an amount. For those rows the percentage is moved to its own
    column, the contracting method is recorded, and ``standard_charge`` is
    nulled out.

    Returns:
        A 3-tuple of polars expressions, in this order:

        * ``standard_charge`` -- the original value, or null when the cell
          contains "% of gross charges".
        * ``standard_charge_percent`` -- the digits captured immediately
          before "% of gross charges".
        * ``contracting_method`` -- 'percent of total billed charge' on the
          percent rows, null elsewhere.
    """
    raw_charge = pl.col('standard_charge')

    # Raw strings keep the regex backslashes intact instead of relying on
    # Python passing unknown escape sequences through (which emits a warning).
    # '%' is not a regex metacharacter, so it is matched without an escape.
    is_percent = raw_charge.str.contains(r'% of gross charges')

    standard_charge_percent = (
        raw_charge
        .str.extract(r'(\d+)% of gross charges')
        .alias('standard_charge_percent')
    )

    contracting_method = (
        pl.when(is_percent)
        # pl.lit() guarantees a string literal; newer polars would otherwise
        # look for a column with this name.
        .then(pl.lit('percent of total billed charge'))
        .otherwise(None)
        .alias('contracting_method')
    )

    standard_charge = (
        pl.when(is_percent)
        .then(None)
        .otherwise(raw_charge)
        .alias('standard_charge')
    )

    return standard_charge, standard_charge_percent, contracting_method

In [3]:
# Workbook used while developing the pipeline on a single file.
file = '474063406_Ascension Saint Thomas Highlands_standardcharges.xlsx'

# Reader settings, found by trial and error with these files and by
# examining them in Numbers.
read_csv_options = dict(
    skip_rows=1,              # skip the first row of the sheet
    infer_schema_length=0,    # no dtype inference: columns come back as strings
    null_values=['N/A'],      # treat 'N/A' cells as null
)

### Trying a single file: an easy and fast workflow

My workflow is to build a single parenned object like
```py
(
    df
    .transformation
    .transformation
    ...
)
```
and then keep adding transformations until I get the dataframe I like. I do this in a single cell.

So I started like

```py
(
    pl.read_excel(file, sheet_id = 2, read_csv_options = read_csv_options)
)
```
and noticed that I could slice off the first 3 rows, so then I did
```py
(
    pl.read_excel(file, sheet_id = 2, read_csv_options = read_csv_options)
    .slice(3,)
)
```
and so on. What I ended up with was this:

In [4]:
(
    pl.read_excel(file, sheet_id=2, read_csv_options=read_csv_options)
    # slice off the first 3 rows
    .slice(3)
    # snake_case names for the identifying columns
    .rename({
        'Code_Type': 'line_type',
        'Code': 'code',
        'Description': 'description',
        'UB_Revenue_Code': 'rev_code',
    })
    .drop(['Facility_BU_ID', 'UB_Revenue_Description'])
    # wide -> long: every remaining column becomes a (payer_name, standard_charge) pair
    .melt(
        id_vars=['line_type', 'code', 'description', 'rev_code'],
        variable_name='payer_name',
        value_name='standard_charge',
    )
    # keep only the pairs that actually carry a charge
    .filter(pl.col('standard_charge').is_not_null())
    .with_columns([
        payer_category(),
        # zero-pad in place; the expression keeps the 'rev_code' name
        pl.col('rev_code').str.zfill(4),
        *hcpcs_cpt_and_drg(),
        *all_charges(),
    ])
).sample(10)

line_type,code,description,rev_code,payer_name,standard_charge,payer_category,hcpcs_cpt,ms_drg,standard_charge_percent,contracting_method
str,str,str,str,str,str,str,str,str,str,str
"""3-DRG""","""820""","""Lymphoma And L…",,"""U57_UNITED_WC_…","""75123.55""","""payer""",,"""820""",,
"""1-CDM""","""42616672""","""CTA HEAD""","""0351""","""Min_Negotiated…","""813""","""min""",,,,
"""1-CDM""","""1626345""","""CLONIDINE 0.1 …","""0637""","""Max_Negotiated…","""50.94""","""max""",,,,
"""1-CDM""","""1630474""","""HYDROMORPHONE …","""0637""","""Gross_Charge""","""1.8""","""gross""",,,,
"""3-DRG""","""737""","""Uterine And Ad…",,"""H83_HUMANA_MED…","""15466.8""","""payer""",,"""737""",,
"""3-DRG""","""156""","""Other Ear, Nos…",,"""H02_CIGNA_MEDI…","""4941.68""","""payer""",,"""156""",,
"""1-CDM""","""1629069""","""NALOXONE 0.4 M…","""0251""","""Max_Negotiated…","""83.84""","""max""",,,,
"""1-CDM""","""37805971""","""SPLINT ORTHOGL…","""0270""","""Max_Negotiated…","""459.06""","""max""",,,,
"""3-DRG""","""787""","""Cesarean Secti…",,"""H46_AETNA_MCAR…","""8145.61""","""payer""",,"""787""",,
"""3-DRG""","""806""","""Vaginal Delive…",,"""H50_SMARTHEALT…","""7774.94""","""payer""",,"""806""",,


Let's turn this into a function so that we can process any Excel file (assuming they're all formatted the same way). We'll inspect this later.

In [22]:
def process(url):
    """Read one standard-charges workbook and reshape it to long form.

    Args:
        url: Location of the ``.xlsx`` workbook; passed unchanged to
            ``pl.read_excel`` together with the module-level
            ``read_csv_options``.

    Returns:
        A polars DataFrame with one row per (line, payer column) pair whose
        ``standard_charge`` is not null, holding the columns ``line_type``,
        ``code``, ``description``, ``rev_code``, ``payer_name``,
        ``standard_charge``, ``payer_category``, ``hcpcs_cpt``, ``ms_drg``,
        ``standard_charge_percent`` and ``contracting_method``.
    """
    # Notes collected while working through the files:
    #
    # Around the 20th file in, trying to cast 'standard_charge' to float
    # failed with:
    #
    #   ComputeError: strict conversion from `str` to `f64` failed for value(s)
    #   ["85% of gross charges", "65% of gross charges", … "82% of gross charges"];
    #   if you were trying to cast Utf8 to temporal dtypes, consider using `strptime`
    #
    # So this tells me that this is a relatively uncommon thing, but that it
    # does happen.

    sheet = pl.read_excel(url, sheet_id=2, read_csv_options=read_csv_options).slice(3)

    # 'Facility_BU_ID' is present in most of the files, but not all, so it is
    # only dropped when the sheet actually has it.
    unwanted = ['UB_Revenue_Description']
    if 'Facility_BU_ID' in sheet.columns:
        unwanted.insert(0, 'Facility_BU_ID')

    return (
        sheet
        .drop(unwanted)
        .rename({
            'Code_Type': 'line_type',
            'Code': 'code',
            'Description': 'description',
            'UB_Revenue_Code': 'rev_code',
        })
        # wide -> long: one row per (line, payer column)
        .melt(
            id_vars=['line_type', 'code', 'description', 'rev_code'],
            variable_name='payer_name',
            value_name='standard_charge',
        )
        .filter(pl.col('standard_charge').is_not_null())
        .with_columns([
            payer_category(),
            # zero-pad in place; the expression keeps the 'rev_code' name
            pl.col('rev_code').str.zfill(4),
            *hcpcs_cpt_and_drg(),
            *all_charges(),
        ])
    )

### Getting the MRF links from the Ascension page

In [23]:
from bs4 import BeautifulSoup
import requests

# Browser-like request headers, sent with every request to the Ascension site.
headers = {
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

mrf_page = 'https://healthcare.ascension.org/price-transparency/price-transparency-files'

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as the listing.
resp = requests.get(mrf_page, headers=headers, timeout=60)
resp.raise_for_status()

In [24]:
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(resp.content, 'html.parser')
base_url = 'https://healthcare.ascension.org'

# One (absolute url, link text) tuple per anchor that points at an .xlsx file.
# href=True skips anchors without an href attribute; for those,
# link.get('href') is None and the `in` test would raise a TypeError.
mrfs = [
    (base_url + link['href'], link.text)
    for link in soup.find_all('a', href=True)
    if '.xlsx' in link['href']
]

print(f'The number of MRFs found was {len(mrfs)}')

The number of MRFs found was 116


Here's an example of what's in the list of MRF tuples: it's just the link to the MRF, plus the hospital name as shown on the Ascension website.

In [25]:
# Show one (url, link text) tuple from the list as an example.
mrfs[12]

('https://healthcare.ascension.org/-/media/project/ascension/healthcare/price-transparency-files/fl/300577249_ascension-sacred-heart-gulf_standardcharges.xlsx',
 'Ascension Sacred Heart Gulf')

### Looping through all the Ascension MRFs

In [26]:
from tqdm import tqdm
from glob import glob
import os

In [28]:
# Create the cache directory once, up front, instead of re-checking it on
# every iteration.
os.makedirs('data', exist_ok=True)

for mrf_url, hospital_name in tqdm(mrfs):
    # Process each Excel file and save the processed data
    # in a .parquet file

    basename = os.path.basename(mrf_url)

    # We want to omit files without the EIN for now
    if not basename[:9].isdigit():
        continue

    path = f'data/{basename}'
    filename, _ = os.path.splitext(basename)
    parquet_path = f'data/{filename}.parquet'

    # os.path.exists checks the one path directly, rather than listing the
    # whole directory with glob and comparing path strings.
    if os.path.exists(parquet_path):
        # We've already processed the file
        continue

    try:
        # Ascension won't serve you the binary unless you have the
        # right headers. So it's necessary to download the file with
        # requests and then open it up in a dataframe library.
        # This is fine, since we can then cache the files for later.
        if not os.path.exists(path):
            # A separate name keeps the listing-page `resp` intact.
            mrf_resp = requests.get(mrf_url, headers=headers, timeout=120)
            # Refuse to cache an HTTP error page under an .xlsx name; once
            # cached it would be re-used on every later run.
            mrf_resp.raise_for_status()
            with open(path, 'wb') as output:
                output.write(mrf_resp.content)

        df = process(path)
        df.write_parquet(parquet_path)
    except Exception as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"An error occurred: {e}")
        print(f"The problematic url was: {mrf_url}")

 47%|██████████████████████████████████████████████████████████████████▍                                                                         | 55/116 [00:02<00:02, 22.00it/s]

An error occurred: Error: potential invalid date format.
The problematic url was: https://healthcare.ascension.org/-/media/project/ascension/healthcare/price-transparency-files/mi/381358212_ascension-providence-hospital-southfield-campus_standardcharges.xlsx


 50%|██████████████████████████████████████████████████████████████████████                                                                      | 58/116 [00:04<00:05, 10.40it/s]Exception ignored in: <function Xlsx2csv.__del__ at 0x7f9bf0181940>
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.9/site-packages/xlsx2csv.py", line 219, in __del__
    self.ziphandle.close()
AttributeError: 'Xlsx2csv' object has no attribute 'ziphandle'


An error occurred: Error: potential invalid date format.
The problematic url was: https://healthcare.ascension.org/-/media/project/ascension/healthcare/price-transparency-files/mi/381358212_ascension-providence-hospital-novi-campus_standardcharges.xlsx
An error occurred: Invalid xlsx file: data/381360526_ascension-borgess-hospital-and-ascension-borgess-pipp-hospital_standardcharges.xlsx
The problematic url was: https://healthcare.ascension.org/-/media/project/ascension/healthcare/price-transparency-files/mi/381360526_ascension-borgess-hospital-and-ascension-borgess-pipp-hospital_standardcharges.xlsx


 55%|█████████████████████████████████████████████████████████████████████████████▏                                                              | 64/116 [00:06<00:06,  8.49it/s]Exception ignored in: <function Xlsx2csv.__del__ at 0x7f9bf0181940>
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.9/site-packages/xlsx2csv.py", line 219, in __del__
    self.ziphandle.close()
AttributeError: 'Xlsx2csv' object has no attribute 'ziphandle'
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 116/116 [00:06<00:00, 18.64it/s]

An error occurred: Error: potential invalid date format.
The problematic url was: https://healthcare.ascension.org/-/media/project/ascension/healthcare/price-transparency-files/mi/381359180_ascension-borgess-allegan-hospital_standardcharges.xlsx
An error occurred: Bad magic number for file header
The problematic url was: https://healthcare.ascension.org/-/media/project/ascension/healthcare/price-transparency-files/mi/381359247_asccension-providence-rochester-hospital_standardcharges.xlsx
An error occurred: Invalid xlsx file: data/380997730_ascension-st-marys-hospital_standardcharges.xlsx
The problematic url was: https://healthcare.ascension.org/-/media/project/ascension/healthcare/price-transparency-files/mi/380997730_ascension-st-marys-hospital_standardcharges.xlsx
An error occurred: Error: potential invalid date format.
The problematic url was: https://healthcare.ascension.org/-/media/project/ascension/healthcare/price-transparency-files/tx/741109643_ascension-seton-shoal-creek-ip-ps




### Some problematic files:

* 272192831_ascension-st-vincent-dunn_standardcharges.xlsx doesn't have a Facility_BU_ID column (everything else is the same)
* ascension-medical-group-st-vincent-urgent-care.xlsx doesn't match the schema at all

### Looking at the files as a whole

In [29]:
import os
from glob import glob

import polars as pl

In [40]:
# Stem (file name without directory or extension) of every processed file.
[os.path.splitext(os.path.basename(f))[0] for f in glob('data/*.parquet')]

['630288861_ascension-providence_standardcharges',
 '474063232_ascension-saint-thomas-stones-river_standardcharges',
 '741109643_ascension-seton-edgar-b-davis_standardcharges',
 '362235165_amita-health-resurrection-medical-center-chicago_standardcharges',
 '593650609_ascension-st-vincents-st-johns-county_standardcharges',
 '474063406_ascension-saint-thomas-highlands_standardcharges',
 '461523194_ascension-st-vincents-clay-county_standardcharges',
 '562592868_midwest-orthopedic-specialty-hospital_standardcharges',
 '721526400_wamego-health-center_standardcharges',
 '741109643_ascension-seton-hays_standardcharges',
 '454243702_ascension-st-vincent-fishers_standardcharges',
 '630288864_saint-vincents-birmingham_standardcharges',
 '391264986_ascension-all-saints-hospital-wisconsin-avenue-campus_standardcharges',
 '350992717_ascension-st-vincent-kokomo-_standardcharges',
 '810935368_saint-vincents-chilton_standardcharges',
 '741109636_ascension-providence-depaul-distinct-part-unit_standardc

In [49]:
# Stack every processed file into one frame, tagging each row with the stem
# of the file it came from. os.path handles the path separator and any extra
# dots in a file name; sorting makes the row order repeatable across runs.
df = pl.concat([
    pl.read_parquet(f).with_columns(
        pl.lit(os.path.splitext(os.path.basename(f))[0]).alias('file')
    )
    for f in sorted(glob('data/*.parquet'))
])

In [50]:
# Sample rows whose hcpcs_cpt matches none of the expected shapes:
# letter + 4 digits, 5 digits, or 4 digits + letter.
# The pattern is a raw string so the `\d` escapes reach the regex engine as
# written, without relying on Python passing unknown escapes through.
df.filter(
    ~pl.col('hcpcs_cpt').str.contains(r'^[A-Z]\d{4}$|^\d{5}$|^\d{4}[A-Z]$')
).sample(10).to_pandas()

Unnamed: 0,line_type,code,description,rev_code,payer_name,standard_charge,payer_category,hcpcs_cpt,ms_drg,standard_charge_percent,contracting_method,file
0,2-CPT,20690.0,APPLICATION UNIPLANE EXTERNAL FIXATION SYSTEM,,Min_Negotiated_Rate,966.8,min,20690.0,,,,390905385_ascension-calumet-hospital_standardc...
1,2-CPT,812.0,Anes lwr intst scr colsc,,BCKSFED_BLUE_CROSS_FEP_BLUE_FOCUS_544,540.92,payer,812.0,,,,721526400_wamego-health-center_standardcharges
2,2-CPT,12031.0,REPAIR INTERMEDIATE S/A/T/E 2.5 CM/<,,NETPRESNE_NETWORK_HEALTH_PRESTIGENE_764,513.01,payer,12031.0,,,,390905385_ascension-calumet-hospital_standardc...
3,2-CPT,15738.0,MUSC MYOCUTANEOUS/FASCIOCUTANEOUS FLAP LXTR,,Min_Negotiated_Rate,2880.74,min,15738.0,,,,390905385_ascension-calumet-hospital_standardc...
4,2-CPT,22310.0,CLTX VRT BDY FX W/O MANJ REQ&W/CSTING/BRACING,,Max_Negotiated_Rate,923.24,max,22310.0,,,,390905385_ascension-calumet-hospital_standardc...
5,2-CPT,21282.0,LATERAL CANTHOPEXY,,EOSUMR8_UMRBOX_30541EOS_795,1257.37,payer,21282.0,,,,390905385_ascension-calumet-hospital_standardc...
6,2-CPT,21268.0,ORBITAL REPOSITIONING W/BONE GRAFTS ICRA & XTRC,,Min_Negotiated_Rate,3639.56,min,21268.0,,,,390905385_ascension-calumet-hospital_standardc...
7,2-CPT,11305.0,SHAVING SKIN LESION 1 S/N/H/F/G DIAM 0.5 CM/<,,UHCCP_DO_NOT_USE_2272020_795,67.0,payer,11305.0,,,,390905385_ascension-calumet-hospital_standardc...
8,2-CPT,15758.0,FREE FASCIAL FLAP W/MICROVASCULAR ANASTOMOSIS,,UHCCPALL_ALL_SAVERSAMSBX31375UHC_795,7423.41,payer,15758.0,,,,390905385_ascension-calumet-hospital_standardc...
9,2-CPT,21282.0,LATERAL CANTHOPEXY,,UHCCPMEDI_MEDICABX_30990UHC_795,1257.37,payer,21282.0,,,,390905385_ascension-calumet-hospital_standardc...


Findings:
* some CPT codes look like: **21552.000000** | EXC TUMOR SOFT TIS NECK/ANT THORAX SUBQ 3 CM. We can safely remove these 6 zeros. These all come from the file: 390905385_ascension-calumet-hospital_standardcharges
* some rows have description = `- None -`. We can safely remove these rows.
* some lines look like: 2-CPT | **0346T+** │ Ultrasound elastography. These are add-on codes (see https://www.medicalcodemode.com/cpt-code-symbols). We can extract the CPT from them.
* a line like 2-CPT │ **368X8** │ Stent plmt ctr dialysis seg reveals the existence of temporary or filler codes.
* 2-CPT │ **ATP23**   Auto.Test Panel Pricing Code, 23 Tests -- these are not HCPCS codes

I also found these codes, which I didn't understand:

     PB111       | REDUCED PHYSICAL FUNCTION , ADL INDEX 2-5/5-DAY OR READMISSION RETURN AND SCSA OR 5-DAY OR READMISSION RETURN AND SCPA OR 5-DAY OR READMISSION RETURN AND CCA.
     PA107       │ PHYSICAL FUNCTION REDUCED, ADL INDEX 4 - 5, NOT RECEIVING NURSING REHABILITATION/MEDICARE 14 DAY ASSESSMENT (FULL OR COMPREHENSIVE)
     CC101       │ CLINICALLY COMPLEX , ADL INDEX 6-10/STAND-ALONE OBRA SCSA OR SCPA OR CCA.
     RLX11       │ LOW REHAB PLUS EXTENSIVE , ADL INDEX 2-16/5-DAY OR READMISSION RETURN AND SCSA OR 5-DAY OR READMISSION RETURN AND SCPA OR 5-DAY OR READMISSION RETURN AND CCA.
     RHL32       │ HIGH REHAB PLUS EXTENSIVE , ADL INDEX 2-10/30-DAY AND SOT OMRA (NOT MEDICARE SHORT STAY).
     BB107       │ BEHAVIOR ONLY, ADL INDEX 6 - 10, NOT RECEIVING NURSING REHABILITATION/MEDICARE 14 DAY ASSESSMENT (FULL OR COMPREHENSIVE)
     PE217       │ REDUCED PHYSICAL FUNCTION , ADL INDEX 15-16/5-DAY OR READMISSION RETURN AND MEDICARE SHORT STAY ASSESSMENT AND SCSA OR 5-DAY OR READMISSION RETURN AND MEDICARE SHORT STAY ASSESSMENT AND …
     IA211       │ IMPAIRED COGNITION, ADL INDEX 4 - 5/MEDICARE 5 DAY ASSESSMENT (COMPREHENSIVE) AND INITIAL ADMISSION ASSESSMENT
    
However I did learn that a lot of Anesthesia codes, which start with 0, were improperly truncated:

    820         │ Anesth abdominal wall surg
    830         │ Anesth repair of hernia
    832         │ Anesth repair of hernia
    834         │ Anesth hernia repair < 1 yr
    836         │ Anesth hernia repair preemie
    840         │ Anesth surg lower abdomen
    842         │ Anesth amniocentesis
    844         │ Anesth pelvis surgery
    846         │ Anesth hysterectomy
    848         │ Anesth pelvic organ surg
    862         │ Anesth kidney/ureter surg
    864         │ Anesth removal of bladder
    865         │ Anesth removal of prostate
    866         │ Anesth removal of adrenal
    868         │ Anesth kidney transplant
    870         │ Anesth bladder stone surg
    872         │ Anesth kidney stone destruct
    880         │ Anesth abdomen vessel surg
    882         │ Anesth major vein ligation
    902         │ Anesth anorectal surgery
    904         │ Anesth perineal surgery
    906         │ Anesth removal of vulva
    908         │ Anesth removal of prostate

In [51]:
# Codes seen in the hcpcs_cpt column that are not treated as HCPCS/CPT codes.
non_cpts = ['PB111', 'PA107', 'CC101', 'RLX11', 'RHL32', 'BB107', 'PE217', 'IA211']

def clean_hcpcs_cpt():
    """Build the expression that normalises the ``hcpcs_cpt`` column.

    The value is first tidied: surrounding whitespace is stripped, the first
    '+' is removed and a trailing '.000000' is removed. The tidied value is
    then:

    * set to null when it starts with 'ATP', contains the pattern
      three digits + 'X' + one word character, is listed in ``non_cpts``,
      or equals '-----';
    * left-padded with zeros to 5 characters when ``description`` starts with
      'Anes' or 'Hosp manage cont drug admin' and the tidied value is shorter
      than 5 characters;
    * kept as is otherwise.

    Returns:
        A polars expression aliased to ``hcpcs_cpt``.
    """
    cleaned = (
        pl.col('hcpcs_cpt')
        .str.strip()
        # some add-on codes have a plus sign
        .str.replace(r'\+', '')
        # 390905385_ascension-calumet-hospital_standardcharges has these.
        # The dot is escaped so that only a literal '.000000' suffix is
        # removed, not any character followed by six zeros.
        .str.replace(r'\.000000$', '')
    )

    # Anesthesia codes lost their leading zeros. The comparison is
    # parenthesised because `&` binds tighter than `<` in Python:
    # `a & b < 5` would be evaluated as `(a & b) < 5`.
    # The length is taken from the tidied value, which is the one being padded.
    needs_zero_padding = (
        pl.col('description').str.contains('^Anes|^Hosp manage cont drug admin')
        & (cleaned.str.lengths() < 5)
    )

    return (
        # ATP codes
        pl.when(cleaned.str.contains('^ATP')).then(None)

        # temporary or placeholder CPTs
        .when(cleaned.str.contains(r'\d{3}X(\d|\w)')).then(None)

        .when(cleaned.is_in(non_cpts)).then(None)

        .when(cleaned == '-----').then(None)

        # Anesthesia codes
        .when(needs_zero_padding).then(cleaned.str.zfill(5))

        .otherwise(cleaned)
        .alias('hcpcs_cpt')
    )

In [None]:
# The expression is aliased to 'hcpcs_cpt', so this overwrites that column
# with its cleaned version.
df = df.with_columns(clean_hcpcs_cpt())

In [None]:
# Write the combined frame to a single parquet file.
df.write_parquet('ascension.parquet')