In [None]:
import ast
import functools
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

Last scraped on Jun 13, 2023.

### Read index

In [None]:
# read index
# One-time download of the index page, kept commented out so the saved copy is reused:
# url='https://www.mdcalc.com/#All'
# req=requests.get(url)
# content=req.text
# with open('../data/index.html', 'w') as f:
#     f.write(content)

# extract out df
# read the saved page inside a context manager so the handle is closed right away
with open('../data/index.html', 'r') as f:
    index = f.read()
soup = BeautifulSoup(index)
scripts = soup.find_all('script')
# the last <script> element of the saved page is parsed as JSON; the calculator
# list sits under props -> pageProps -> allCalcs
d = json.loads(scripts[-1].text)
all_calcs_list = d['props']['pageProps']['allCalcs']
df = pd.DataFrame.from_dict(all_calcs_list)

# clean df
def clean_element(x):
    """Normalize a single cell of the index table.

    - A list holding exactly one item is replaced by that item (returned as-is).
    - A string has every '<span>' and '</span>' marker removed.
    - Anything else (including lists of other lengths) is returned unchanged.
    """
    # singleton list: unwrap and return immediately, without further cleaning
    if isinstance(x, list) and len(x) == 1:
        return x[0]
    if isinstance(x, str):
        for marker in ('<span>', '</span>'):
            x = x.replace(marker, '')
    return x
# Clean every cell of the index table. DataFrame.applymap was renamed to
# DataFrame.map in pandas 2.1 (the old name now emits a FutureWarning), so use
# the new name when this pandas has it and fall back to applymap otherwise.
df = (df.map if hasattr(df, 'map') else df.applymap)(clean_element)
# full page URL for each calculator: <base>/<id>/<slug>
df['url_full'] = 'https://www.mdcalc.com/calc/' + df['id'].astype(str) + '/' + df['slug']

### Read individual pages
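Note: avoid re-running this cell unnecessarily. Pages already saved under `../data/pages` are skipped, but every page that is missing triggers a new request to mdcalc.com.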

In [None]:
# Download the page of every calculator listed in df, one HTML file per slug.
# Files that already exist are skipped, so a re-run only requests missing pages.
os.makedirs('../data/pages', exist_ok=True)  # open(..., 'w') does not create directories
for i in tqdm(range(df.shape[0])):
    slug = df['slug'].iloc[i]
    url_full = df['url_full'].iloc[i]
    output_fname = f'../data/pages/{slug}.html'

    if not os.path.exists(output_fname):
        # a timeout keeps one stalled connection from hanging the whole crawl
        req = requests.get(url_full, timeout=30)
        if not req.ok:
            # The body is still written (unchanged behavior), but flag it: the
            # exists-check above means this file will never be re-downloaded.
            print('warning: HTTP', req.status_code, 'for', url_full)
        content = req.text
        with open(output_fname, 'w') as f:
            f.write(content)
        print(slug, url_full)
# one saved page per index row
assert len(os.listdir('../data/pages')) == df.shape[0], \
    'number of saved pages differs from number of rows in df'

### parse individual pages

In [None]:
# Extract the 'calc' JSON object embedded in each saved page and store it as
# ../data/processed/<slug>.json. Slugs whose page cannot be parsed are collected
# in `errors` and dropped from df afterwards.
errors = []
# Without this directory every write would fail; the old bare `except` hid that
# and reported every calculator as a parse error.
os.makedirs('../data/processed', exist_ok=True)
for i in tqdm(range(df.shape[0])):
    slug = df.iloc[i].slug
    fname = '../data/pages/' + slug + '.html'
    fname_json = '../data/processed/' + slug + '.json'
    if not os.path.exists(fname_json):
        with open(fname, 'r') as f:
            html = f.read()
        soup = BeautifulSoup(html)
        try:
            # the third <script> element is expected to carry the payload; the
            # JSON starts at the first '{' of its text
            data = soup.find_all('script')[2]
            s = json.loads(data.text[data.text.index('{'):])['calc']
        except (IndexError, KeyError, TypeError, ValueError):
            # Only parse failures land here: missing script tag, no '{' or
            # invalid JSON (ValueError), no 'calc' key, or a non-dict payload.
            # I/O errors and Ctrl-C are no longer swallowed.
            errors.append(slug)
            continue
        # written only after a successful parse, so no partial file can be left
        # behind and mistaken for a finished one on the next run
        with open(fname_json, 'w') as f:
            json.dump(s, f)
print('num errors', len(errors))
df = df[~df.slug.isin(errors)]
print(df.shape)

In [None]:
# display the first row of df as a quick look at the available fields
df.iloc[0]

### merge and clean

In [55]:
# load cdis
# read the processed JSON of every calculator, in the same order as the rows of df
cdi_records = []
for slug in tqdm(df["slug"]):
    with open(f"../data/processed/{slug}.json", "r") as f:
        cdi_records.append(json.load(f))
cdis = pd.DataFrame.from_dict(cdi_records)
# DataFrame.join aligns on index labels, not on position. df may carry
# non-contiguous labels (rows are filtered out upstream without a reset) while a
# freshly built frame is labelled 0..n-1, so give cdis the labels of df to keep
# each calculator paired with its own record.
cdis.index = df.index
# cdis['num_rules'] = cdis['input_schema'].apply(len)  # num rules (this includes some messiness that isn't actually a rule)

# merge with df
df_merged = df.join(cdis, rsuffix="_duplicate")  # mark duplicate cols
df_merged = df_merged.drop(
    columns=[k for k in df_merged.columns if k.endswith("_duplicate")]
)  # drop the duplicates

# clean list-valued columns
# list valued cols should be stored as lists, not strings that look like lists
# Names of the df_merged columns that are passed through
# clean_list_valued_string and then asserted to contain only lists.
LIST_VALUED_COLS = [
    "disease_en",
    "system_en",
    "purpose_en",
    "chief_complaint_en",
    "specialty_en",
]


def clean_list_valued_string(s):
    """Coerce a cell of a list-valued column into a real Python list.

    Parameters
    ----------
    s : list, str, None or NaN-like scalar
        - list: returned unchanged.
        - str that looks like a list literal ("[...]"): parsed into a list of
          strings. "[]" yields an empty list.
        - any other str: wrapped as a one-element list.
        - None / NaN: empty list.

    Returns
    -------
    list, or None for an unsupported type (the caller asserts that every value
    is a list, so such a value is reported there).
    """
    if isinstance(s, list):
        return s
    if isinstance(s, str):
        if not (s.startswith("[") and s.endswith("]")):
            return [s]
        # Prefer a real literal parse: it keeps apostrophes inside items
        # ("Crohn's disease") and commas inside quoted items intact.
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
            return parsed
        # Fallback for strings that are not valid literals (e.g. unquoted
        # items): strip quotes and split on ", " as before.
        inner = s[1:-1].replace("'", "").replace('"', "")
        # "".split(", ") would give [''], a list holding one empty string
        return inner.split(", ") if inner else []
    if s is None or pd.isna(s):
        return []
    # unsupported type: made explicit instead of falling off the end
    return None


for col in LIST_VALUED_COLS:
    # turn every cell of this column into a list
    df_merged[col] = df_merged[col].apply(clean_list_valued_string)
    # verify what was stored before moving on to the next column
    is_list = df_merged[col].apply(lambda cell: isinstance(cell, list))
    assert np.all(is_list), "all values are lists"


# add feature_names
def get_feature_names_list(schema):
    """Collect the English label of every entry in an input schema.

    Returns one name per entry of ``schema``: the entry's "label_en" value when
    that key is present, otherwise the placeholder "unknown". Anything that is
    not a list yields an empty list.
    """
    if not isinstance(schema, list):
        return []
    names = []
    for field in schema:
        if "label_en" in field:
            names.append(field["label_en"])
        else:
            names.append("unknown")
    return names


# one list of input labels per row, derived from that row's input_schema
df_merged["feature_names"] = df_merged["input_schema"].apply(get_feature_names_list)


def try_or_none(func):
    """Decorator: call ``func`` and return None instead of raising.

    Any ``Exception`` raised by ``func`` is turned into a ``None`` result.
    ``KeyboardInterrupt`` and ``SystemExit`` are not ``Exception`` subclasses
    and still propagate, so a long ``.apply`` can be interrupted.

    Positional and keyword arguments are forwarded unchanged, and the wrapper
    keeps the name and docstring of ``func``.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            return None
    return wrapper


@try_or_none
def get_refs(row):
    """Return row["content"]["about"]["references_list"].

    Thanks to the decorator, a row where any step of that lookup fails gives
    None instead of raising.
    """
    about_section = row["content"]["about"]
    return about_section["references_list"]

@try_or_none
def get_ref_original(row):
    """Return the 'Original/Primary Reference' entry of ``row``.

    Despite the parameter name, this cell applies the function to the values of
    the "refs" column rather than to whole rows. A value without that key, or
    one that cannot be indexed, gives None via the decorator.
    """
    section_key = 'Original/Primary Reference'
    return row[section_key]

@try_or_none
def get_text(row):
    """Return the 'text' field of a reference entry.

    ``row`` may be a single entry (a mapping with a 'text' key) or a list of
    such entries, which is the shape this notebook shows for 'ref_original'
    values; for a list, the first entry is used. Anything else, including an
    empty list or None, gives None via the decorator.
    """
    if isinstance(row, list):
        # NOTE(review): only the first entry is used; confirm whether lists with
        # several entries should be combined instead.
        row = row[0]
    return row['text']


# derive the reference columns step by step: all references -> original/primary
# reference entry -> its citation text
df_merged["refs"] = df_merged.apply(get_refs, axis=1)
df_merged['ref_original'] = df_merged['refs'].apply(get_ref_original)
# this assignment had no right-hand side, which made the whole cell a syntax error
df_merged['ref_original_text'] = df_merged['ref_original'].apply(get_text)


# persist the merged table; the pickle keeps list/dict cells as Python objects,
# the CSV stores them as their string form
df_merged.to_pickle("../data/cdis_with_schemas.pkl")
df_merged.to_csv("../data/cdis_with_schemas.csv")

100%|██████████| 690/690 [00:00<00:00, 1620.14it/s]


In [53]:
# inspect the original/primary reference stored for the first row
df_merged['ref_original'].iloc[0]

[{'text': 'CPT® Evaluation and Management (E/m) Code and Guideline Changes. American Medical Association, 2022. ',
  'href': 'https://www.ama-assn.org/system/files/2023-e-m-descriptors-guidelines.pdf'}]

In [None]:
# display all columns with no max_width
# (the display options are overridden only inside this `with` block)
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 0): 
    display(df_merged.iloc[1])
    