In [1]:
import csv
import io
import os

import pandas as pd
import psycopg2
import requests
import urllib3

First we need to establish the database connection. If you haven't already, start the database with Docker Compose.

In [2]:
# Connection settings are read from the standard libpq environment variables
# so credentials do not have to be edited into the notebook. Each one falls
# back to the value this notebook previously hard-coded (a local "postgres"
# database on localhost:5432).
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)

# Single cursor shared by the query and update cells below.
cur = conn.cursor()

Now we can query the database for data using `cur.execute()`.

In [3]:
# Fetch every machine_links row, joined to its features row, for which
# cardinality(csv_headers) is NULL. `res` keeps the raw tuples; later cells
# index into them by position.
cur.execute(
    "SELECT * FROM machine_links INNER JOIN features ON machine_links.ccn = features.ccn WHERE cardinality(csv_headers) is NULL"
)  # query to get joined data
res = cur.fetchall()

# Keep only the first nine columns (removes the double ccn column) and label them.
df = pd.DataFrame(res).iloc[:, :9]
df.columns = [
    "ccn",
    "state_or_region",
    "reporting_entity_name_common",
    "machine_readable_url",
    "csv_headers",
    "meets_standard",
    "bed_count",
    "zip_code",
    "medicare_medicaid_eligible",
]

df = df.set_index("ccn")  # index by ccn
df

Unnamed: 0_level_0,state_or_region,reporting_entity_name_common,machine_readable_url,csv_headers,meets_standard,bed_count,zip_code,medicare_medicaid_eligible
ccn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
010011,AL,Saint Vincent's East,https://healthcare.ascension.org/-/media/proje...,,False,362,35235,True
010012,AL,Dekalb Regional Medical Center,https://dekalbregional.com/wp-content/uploads/...,,True,134,35968,True
010019,AL,Helen Keller Memorial Hospital,https://www.huntsvillehospital.org/images/pric...,,False,185,35660,True
010029,FL,East Alabama Medical Center,https://www.eastalabamahealth.org/assets/docum...,,True,429,36801,True
010035,AL,Cullman Regional Medical Center,https://cullmansite.wpenginepowered.com/wp-con...,,False,115,35058,True
...,...,...,...,...,...,...,...,...
673074,TX,Everest Rehabilitation Hospital Temple,https://www.everestrehab.com/Everest_Rehabilit...,,True,36,76504,True
673076,TX,Clearsky Rehabilitation Hospital Of Flower Mound,https://clearskyhealth.com/flowermound/wp-cont...,,False,41,75022,True
673078,TX,"Shannon Rehabilitation Hospital, An Affiliate ...",https://encompasshealth.com/-/media/healthsout...,,True,40,76901,True
673079,TX,Encompass Health Rehabilitation Hospital Of Waco,https://encompasshealth.com/-/media/healthsout...,,True,40,76706,True


To learn more about each provided URL, we can parse the linked file and see which column headers each hospital provides.

In [4]:
def get_headers(f):
    """Return the CSV header row that follows the first line of ``f``.

    The first line of the file is read and discarded; the next CSV record is
    returned as the header row.
    NOTE(review): presumably the files carry a one-line preamble above the
    real header row -- confirm against sample files.

    Args:
        f: text-mode csv file object, positioned at the start of the file.

    Returns:
        list of csv headers; an empty list when nothing follows the first
        line (previously ``None`` was returned in that case).
    """
    f.readline()  # skip the first line; the header row is the record after it
    # csv.reader handles quoted header names that contain commas.
    return next(csv.reader(f), [])


def parse_mrf(url):
    """Download a machine readable file and return its CSV headers.

    Args:
        url: link to machine readable file

    Returns:
        The result of ``get_headers`` applied to the downloaded text.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=60)
    # Fail loudly instead of parsing an HTML error page as if it were CSV.
    response.raise_for_status()
    # get_headers needs a file-like object with .readline(); a Response has
    # none, so wrap the decoded body in an in-memory text buffer.
    return get_headers(io.StringIO(response.text))

In [5]:
# Files that use a ".csv" extension but are pipe-delimited.
PIPE_DELIMITED_URLS = {
    "https://app.peacehealth.org/FileRepository/Financial/StandardCharges/KEMC/92-0016490_Ketchikan_Medical_Center_StandardCharges.csv",
    "https://apps.scripps.org/pricetransparency/951684089_ScrippsMercyHospitalSanDiego_standardcharges.csv",
}
# Payload types that are skipped rather than parsed for headers.
UNSUPPORTED_CONTENT_TYPES = {"application/pdf", "application/zip", "application/json"}
# (connect, read) timeouts in seconds so one unresponsive host cannot hang the run.
REQUEST_TIMEOUT = (10, 120)


def _read_headers(url, content):
    """Return the column names of a downloaded file, or None if unsupported.

    The parser is chosen from the end of ``url`` (xlsx, xls or csv,
    case-insensitive). Only the header row is parsed (``nrows=0``).

    Args:
        url: address the file was downloaded from.
        content: raw response body as bytes.

    Returns:
        list of column names as strings, or None when the URL does not end in
        a supported extension (this includes zip).
    """
    lowered = url.lower()
    if lowered.endswith("xlsx"):
        frame = pd.read_excel(io.BytesIO(content), engine="openpyxl", nrows=0)
    elif lowered.endswith("xls"):
        frame = pd.read_excel(io.BytesIO(content), engine="xlrd", nrows=0)
    elif lowered.endswith("csv"):
        text = content.decode("utf-8", errors="ignore")
        delimiter = "|" if url in PIPE_DELIMITED_URLS else ","
        frame = pd.read_csv(io.StringIO(text), delimiter=delimiter, nrows=0)
    else:
        return None
    # Spreadsheet column labels are not always strings; normalise them so the
    # list can be stored as one homogeneous array.
    return [str(name) for name in frame.columns]


# The retry policy and adapter do not depend on the row, so build them once
# and share a single session for the whole run.
retry = urllib3.util.retry.Retry(connect=3, backoff_factor=.5, raise_on_status=False)
adapter = requests.adapters.HTTPAdapter(max_retries=retry)

with requests.Session() as session:
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    for row in res:
        ccn, url = row[0], row[3]

        try:
            r = session.get(url, timeout=REQUEST_TIMEOUT)
        except Exception as exc:
            # Best effort: report unreachable or invalid links and move on.
            # `Exception` (not a bare except) lets Ctrl-C still stop the loop.
            print(f"skipping {url}: {exc}")
            continue

        # Ignore any "; charset=..." parameter and tolerate a missing header.
        content_type = r.headers.get("Content-Type", "").split(";")[0].strip().lower()
        if r.status_code >= 400 or content_type in UNSUPPORTED_CONTENT_TYPES:
            continue

        print(r.status_code)
        print(url)

        try:
            headers = _read_headers(url, r.content)
        except Exception as exc:
            # One malformed file should not abort the remaining rows.
            print(f"could not parse {url}: {exc}")
            continue

        # Unsupported extension: skip instead of reusing the previous row's
        # headers for this ccn.
        if headers is None:
            continue

        cur.execute(
            "UPDATE machine_links SET csv_headers=%s WHERE ccn=%s", (headers, ccn, )
        )
        conn.commit()

200
https://s3.amazonaws.com/ycubaa-production-marlin-1-charge-management-public/facilities/3da9ca85-20c5-41a0-bdcd-1c72cbc6cce2/716042625_MCHS-GREAT%20RIVER%20MEDICAL%20CENTER_standardcharges.zip
200
https://sites.bmhcc.org/-/media/Pricing-Documents/AR/26-1214372_NEA-Baptist_STANDARDCHARGES.zip
200
https://sites.bmhcc.org/-/media/Pricing-Documents/AR/82-3844150_BMH-Crittenden_STANDARDCHARGES.zip
