In [1]:
import psycopg2
import requests
import pandas as pd
import io
import urllib3

First we need to establish the database connection. If you haven't already, get it running using docker compose

In [2]:
conn = psycopg2.connect(
    dbname="postgres",
    user="postgres",
    password="postgres",
    host="localhost",
    port="5432",
)

cur = conn.cursor()

Now to make any requests to the database for data we can use cur.execute()

In [3]:
cur.execute("SELECT * FROM machine_links INNER JOIN features ON machine_links.ccn = features.ccn") # query to get joined data
res = cur.fetchall()
df = pd.DataFrame(res)

df = df.iloc[:, 0:7] # remove the double ccn column
df.columns = ["ccn", "state_or_region", "reporting_entity_name_common", "machine_readable_url", "bed_count", "zip_code", "medicare_medicaid_eligible"]

df.set_index("ccn", inplace=True) # index by ccn

df

Unnamed: 0_level_0,state_or_region,reporting_entity_name_common,machine_readable_url,bed_count,zip_code,medicare_medicaid_eligible
ccn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10001,AL,Southeast Health Medical Center,https://www.southeasthealth.org/wp-content/upl...,"[Procedure, Code Type, Code, Hcpcs/CPT4, NDC, ...",420,36301
10005,AL,Marshall Medical Centers South Campus,https://pricetransparency.blob.core.windows.ne...,"[RecordType, FileInformation, Description, Pro...",240,35957
10006,AL,North Alabama Medical Center,https://namccares.com/sites/namc/assets/upload...,"[Facility, CDM#, Description, RevCode, CPT, HC...",338,35630
10011,AL,Saint Vincent's East,https://healthcare.ascension.org/-/media/proje...,,362,35235
10012,AL,Dekalb Regional Medical Center,https://dekalbregional.com/wp-content/uploads/...,[<html>],134,35968
...,...,...,...,...,...,...
673074,TX,Everest Rehabilitation Hospital Temple,https://www.everestrehab.com/Everest_Rehabilit...,,36,76504
673076,TX,Clearsky Rehabilitation Hospital Of Flower Mound,https://clearskyhealth.com/flowermound/wp-cont...,,41,75022
673078,TX,"Shannon Rehabilitation Hospital, An Affiliate ...",https://encompasshealth.com/-/media/healthsout...,,40,76901
673079,TX,Encompass Health Rehabilitation Hospital Of Waco,https://encompasshealth.com/-/media/healthsout...,,40,76706


To add some more insight to the provided url, we can parse the file to see which headers are provided by each hospital

In [4]:
def get_headers(f):
    # f: csv file object
    # returns list of csv headers
    f.readline()
    d_reader = csv.DictReader(f)
    headers = d_reader.fieldnames

    return headers


def parse_mrf(url):
    # url: link to machine readable file
    response = requests.get(url)
    print(response.text)
    return get_headers(response)

In [60]:
# Put column in database if it doesn't exist
cur.execute("ALTER TABLE machine_links ADD COLUMN IF NOT EXISTS csv_headers text[]")
conn.commit()

In [5]:
for row in res:
    session = requests.Session()
    retry = urllib3.util.retry.Retry(connect=3, backoff_factor=.5, raise_on_status=False)
    adapter = requests.adapters.HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    try:
        r = session.get(row[3])
    except:
        continue
    ccn = row[0]
    print(r.status_code)
    print(row[3])
    if r.status_code == 404 or 403:
        continue
    if row[3][-4:] == "xlsx":
        headers = list(pd.read_excel(r.content, engine='openpyxl').columns)
    elif row[3][-3:] == "xls":
        headers = list(pd.read_excel(r.content, engine="xlrd").columns)
    elif row[3][-3:] == "csv":
        headers = list(pd.read_csv(io.StringIO(r.content.decode('utf-8', errors="ignore"))).columns)

    cur.execute(
        "UPDATE machine_links SET csv_headers=%s WHERE ccn=%s", (headers, ccn, )
    )
    conn.commit()

200
https://www.southeasthealth.org/wp-content/uploads/Southeast-Health-Standard-Charges-2022.xlsx
200
https://pricetransparency.blob.core.windows.net/mmc/83-1651180_marshallmedicalcenters_standardcharges.csv
200
https://namccares.com/sites/namc/assets/uploads/CDM%20-%20North%20Alabama%20MC%2023200.xlsx
406
https://healthcare.ascension.org/-/media/project/ascension/healthcare/price-transparency-files/al/630578923_ascension-saint-vincents-east_standardcharges.xlsx
403
https://dekalbregional.com/wp-content/uploads/2022/04/DeKalb_CMList_01_10_2022.csv
200
https://cdn.hs.uab.edu/static/46-1468253_callahaneyehospital_standardcharges.zip
404
https://www.huntsvillehospital.org/images/pricetransparency/472323163_Helen-Keller_standardcharges.xls
200
https://www.dalemedical.org/docs/Chargemaster.xlsx
200
https://res.cloudinary.com/baptisthealth/raw/upload/v1659101684/BaptistJax/price-transparency/590747311_baptist-medical-center-south_standardcharges_xml.csv
200
https://pricetransparency.blob.co