In [None]:
import csv
import io
import os

import pandas as pd
import psycopg2
import requests
import urllib3

First, we need to establish the database connection. If you haven't already, start the database using Docker Compose.

In [2]:
# Connection settings default to the local Docker Compose database, but each
# one can be overridden through the standard libpq environment variables so
# that real credentials never have to be written into the notebook.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)

# Single cursor reused by the cells below to run queries and updates.
cur = conn.cursor()

Now we can query the database for data using `cur.execute()`.

In [3]:
# Names for the first nine columns of the join, in SELECT * order. Anything
# after them (the double ccn column) is dropped.
JOINED_COLUMNS = [
    "ccn",
    "state_or_region",
    "reporting_entity_name_common",
    "machine_readable_url",
    "csv_headers",
    "meets_standard",
    "bed_count",
    "zip_code",
    "medicare_medicaid_eligible",
]

# query to get joined data (only rows whose csv_headers cardinality is NULL)
cur.execute("SELECT * FROM machine_links INNER JOIN features ON machine_links.ccn = features.ccn WHERE cardinality(csv_headers) is NULL")
res = cur.fetchall()  # raw rows; also consumed by the download loop further down

# Slice each row before building the frame (instead of slicing the frame
# afterwards) so the cell still works when the query returns no rows.
df = pd.DataFrame(
    [row[: len(JOINED_COLUMNS)] for row in res],
    columns=JOINED_COLUMNS,
)

df = df.set_index("ccn")  # index by ccn
df

Unnamed: 0_level_0,state_or_region,reporting_entity_name_common,machine_readable_url,csv_headers,meets_standard,bed_count,zip_code,medicare_medicaid_eligible
ccn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
012014,AL,North Alabama Specialty Hospital,https://28g1xh366uy0x9ort41hns72-wpengine.netd...,,False,31,35611,True
040051,AR,Drew Memorial Health System,https://drewmemorial.simplefocus.dev/content/u...,,True,60,71655,True
050077,CA,Scripps Mercy Hospital,https://apps.scripps.org/pricetransparency/951...,,True,700,92103,True
050100,CA,Sharp Memorial Hospital,https://images.sharp.com/com/patient/billing/9...,,True,881,92123,True
050180,CA,John Muir Medical Center - Walnut Creek Campus,https://www.johnmuirhealth.com/content/dam/jmh...,,True,296,94598,True
...,...,...,...,...,...,...,...,...
673074,TX,Everest Rehabilitation Hospital Temple,https://www.everestrehab.com/Everest_Rehabilit...,,True,36,76504,True
673076,TX,Clearsky Rehabilitation Hospital Of Flower Mound,https://clearskyhealth.com/flowermound/wp-cont...,,False,41,75022,True
673078,TX,"Shannon Rehabilitation Hospital, An Affiliate ...",https://encompasshealth.com/-/media/healthsout...,,True,40,76901,True
673079,TX,Encompass Health Rehabilitation Hospital Of Waco,https://encompasshealth.com/-/media/healthsout...,,True,40,76706,True


To gain more insight into each provided URL, we can parse the linked file to see which headers each hospital provides.

In [4]:
def get_headers(f):
    """Return the header row of a CSV file, ignoring its first line.

    f: csv file object (needs readline() and line iteration)
    returns: list of csv headers parsed from the line after the skipped one,
             or None when nothing is left to read
    """
    f.readline()  # discard the first line; headers are read from the next one
    rows = csv.reader(f)
    return next(rows, None)


def parse_mrf(url):
    """Download a machine readable file and return its CSV headers.

    url: link to machine readable file
    returns: whatever get_headers() returns for the downloaded text
    """
    response = requests.get(url)
    # get_headers() calls readline() on its argument, which a Response object
    # does not provide, so expose the decoded body as a text file object.
    return get_headers(io.StringIO(response.text))

In [5]:
# Content types that are not parsed for headers; such responses are recorded
# in csv_headers by their HTTP status code only.
UNPARSEABLE_CONTENT_TYPES = {"application/pdf", "application/zip", "application/json"}
# Seconds to wait on a server before giving up on a download.
REQUEST_TIMEOUT_SECONDS = 30


def _extract_headers(url, content):
    """Return the column names of a downloaded file as a list of strings.

    url: link the file was downloaded from; its suffix selects the parser
    content: raw response body (bytes)
    returns: list of header strings, or None when the suffix is not
             xlsx, xls or csv
    """
    if url.endswith("xlsx"):
        columns = pd.read_excel(io.BytesIO(content), engine="openpyxl").columns
    elif url.endswith("xls"):
        columns = pd.read_excel(io.BytesIO(content), engine="xlrd").columns
    elif url.endswith("csv"):
        text = content.decode("utf-8", errors="ignore")
        try:
            # The sniffer works on text, not bytes; a leading sample is
            # enough to detect the delimiter.
            delimiter = csv.Sniffer().sniff(text[:4096]).delimiter
        except csv.Error:
            delimiter = ","  # detection failed: fall back to a plain comma
        columns = pd.read_csv(io.StringIO(text), delimiter=delimiter).columns
    else:
        return None
    # Column labels read from spreadsheets may be non-strings (numbers, dates);
    # normalise them so the whole list can be stored as one array value.
    return [str(column) for column in columns]


# One session (and one retry policy) is shared by every download instead of
# being rebuilt for each row.
session = requests.Session()
retry = urllib3.util.retry.Retry(connect=3, backoff_factor=.5, raise_on_status=False)
adapter = requests.adapters.HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

for row in res:
    ccn, url = row[0], row[3]
    print(url)
    try:
        r = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as exc:
        # Unreachable or invalid links are skipped, leaving csv_headers untouched.
        print(f"skipped: {exc}")
        continue
    print(r.status_code)

    # Drop parameters such as "; charset=utf-8" before comparing, and tolerate
    # responses that carry no Content-Type header at all.
    content_type = r.headers.get("Content-Type", "").split(";")[0].strip().lower()
    if r.status_code >= 400 or content_type in UNPARSEABLE_CONTENT_TYPES or url.endswith("zip"):
        # Error responses and unparseable formats: store the status code.
        cur.execute(
            "UPDATE machine_links SET csv_headers=array_append(csv_headers, %s) WHERE ccn=%s", (str(r.status_code), str(ccn))
        )
        conn.commit()
        continue

    headers = _extract_headers(url, r.content)
    if headers is None:
        # Unknown suffix: never write another hospital's headers to this row.
        print("skipped: unsupported file type")
        continue

    cur.execute(
        "UPDATE machine_links SET csv_headers=%s WHERE ccn=%s", (headers, ccn, )
    )
    conn.commit()

https://28g1xh366uy0x9ort41hns72-wpengine.netdna-ssl.com/wp-content/uploads/2021/12/North-Alabama-Price-Transparency-01-2022.xlsx
https://drewmemorial.simplefocus.dev/content/uploads/2020/12/352414105-Drew-Memorial-Health-System-Standard-Charges-1.xlsx
https://apps.scripps.org/pricetransparency/951684089_ScrippsMercyHospitalSanDiego_standardcharges.csv
