In [None]:
import re
import numpy as np
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
from tqdm import tqdm
import os

Last scraped on Jun 13, 2023. The final output of this notebook is `../data/cdis_with_schemas_cleaned.pkl` (the intermediate output is `../data/cdis_with_schemas.pkl`). It also writes `../data/main.csv`.

### Read index

In [None]:
# read index
# The index page was fetched once with the snippet below and cached on disk;
# uncomment it to refresh the cached copy.
# url='https://www.mdcalc.com/#All'
# req=requests.get(url)
# content=req.text
# with open('../data/index.html', 'w') as f:
#     f.write(content)

# extract out df
# Read the cached index through a context manager so the file handle is closed.
with open('../data/index.html', 'r') as f:
    index = f.read()
soup = BeautifulSoup(index)
scripts = soup.find_all('script')
# The last <script> tag is parsed as JSON; the list of calculators sits under
# props -> pageProps -> allCalcs.
d = json.loads(scripts[-1].text)
all_calcs_list = d['props']['pageProps']['allCalcs']
df = pd.DataFrame.from_dict(all_calcs_list)
# clean df
def clean_element(x):
    """Normalise a single cell of the index table.

    - A list holding exactly one item is replaced by that item (returned as-is).
    - A string has every literal '<span>' and '</span>' removed.
    - Anything else (including lists of other lengths) is returned unchanged.
    """
    if isinstance(x, list) and len(x) == 1:
        return x[0]
    if not isinstance(x, str):
        return x
    for tag in ('<span>', '</span>'):
        x = x.replace(tag, '')
    return x
# Clean every cell first, then build each calculator's page URL from the
# cleaned id and slug columns.
df = df.applymap(clean_element)
id_as_text = df['id'].astype(str)
df = df.assign(url_full='https://www.mdcalc.com/calc/' + id_as_text + '/' + df['slug'])

### Read individual pages
Note: this cell performs the actual scraping and saves one `.html` file per calculator. Pages already saved on disk are skipped, so avoid re-running it unless the cached files are present.

In [None]:
# Download each calculator page once; a page already on disk counts as cached.
os.makedirs('../data/pages', exist_ok=True)  # make sure the cache directory exists
failed = []  # (slug, url, error) for every download that did not succeed
for slug, url_full in tqdm(zip(df['slug'], df['url_full']), total=df.shape[0]):
    output_fname = f'../data/pages/{slug}.html'
    if os.path.exists(output_fname):
        continue

    try:
        # A timeout stops a stalled connection from hanging the loop, and
        # raise_for_status keeps HTTP error pages out of the cache, where they
        # would otherwise be mistaken for valid pages on later runs.
        req = requests.get(url_full, timeout=30)
        req.raise_for_status()
    except requests.RequestException as e:
        failed.append((slug, url_full, repr(e)))
        continue

    content = req.text
    with open(output_fname, 'w') as f:
        f.write(content)
    print(slug, url_full)

if failed:
    print('failed downloads', failed)
# Sanity check: exactly one cached page per row of the index.
assert len(os.listdir('../data/pages')) == df.shape[0]

### Parse individual pages
`.html` -> `.json`

In [None]:
# Convert each cached .html page into a .json file holding its 'calc' payload.
# Slugs whose page cannot be parsed are collected in `errors` and dropped from df.
os.makedirs('../data/processed', exist_ok=True)  # a missing output dir must not look like parse failures
errors = []
for slug in tqdm(df['slug']):
    fname = '../data/pages/' + slug + '.html'
    fname_json = '../data/processed/' + slug + '.json'
    if os.path.exists(fname_json):
        continue  # already parsed on a previous run

    with open(fname, 'r') as f:
        html = f.read()
    soup = BeautifulSoup(html)
    try:
        # The third <script> tag is expected to contain a JSON object (starting
        # at its first '{') with a top-level 'calc' key.
        data = soup.find_all('script')[2]
        s = json.loads(data.text[data.text.index('{'):])['calc']
    except (IndexError, KeyError, TypeError, ValueError):
        # Only parse failures are recorded; interrupts and I/O errors propagate.
        # print('failed', slug)
        errors.append(slug)
        continue

    # Write outside the try block so a failed write is not mistaken for a parse error.
    with open(fname_json, 'w') as f:
        json.dump(s, f)
print('num errors', len(errors))
df = df[~df['slug'].isin(errors)]
print(df.shape)

### Merge metadata from index and individual pages

In [None]:
# load cdis
# Read the parsed JSON for every remaining calculator, in df's row order.
cdis = []
for slug in tqdm(df["slug"]):
    with open(f"../data/processed/{slug}.json", "r") as f:
        cdis.append(json.load(f))
cdis = pd.DataFrame.from_dict(cdis)
# DataFrame.join aligns on the index. df may have had rows filtered out (its
# index can contain gaps) while cdis starts with a fresh 0..n-1 index, so reuse
# df's index to keep the two frames aligned row-for-row.
cdis.index = df.index
# cdis['num_rules'] = cdis['input_schema'].apply(len)  # num rules (this includes some messiness that isn't actually a rule)

# merge with df
df_merged = df.join(cdis, rsuffix="_duplicate")  # mark duplicate cols
df_merged = df_merged.drop(
    columns=[k for k in df_merged.columns if k.endswith("_duplicate")]
)  # drop the duplicates

df_merged.to_pickle("../data/cdis_with_schemas.pkl")
df_merged.to_csv("../data/cdis_with_schemas.csv")

# Clean the cdis

In [None]:
import re
import numpy as np
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import mdcalc
from mdcalc import try_or_none
# Reload the merged table written earlier in the notebook, ordered by calculator id.
df = pd.read_pickle("../data/cdis_with_schemas.pkl").sort_values(by='id')

In [None]:
@try_or_none
def get_refs(row):
    """Return row["content"]["about"]["references_list"].

    NOTE(review): the try_or_none decorator comes from the project-local mdcalc
    module; presumably it yields None when the lookup raises — confirm there.
    """
    about = row["content"]["about"]
    return about["references_list"]

@try_or_none
def get_ref_original(row):
    """Return the 'Original/Primary Reference' entry of `row`.

    NOTE(review): failure handling is delegated to mdcalc.try_or_none
    (presumably returns None on error — confirm).
    """
    key = 'Original/Primary Reference'
    return row[key]

@try_or_none
def get_text(row):
    """Return the 'text' field of the first item in `row`.

    NOTE(review): failure handling is delegated to mdcalc.try_or_none
    (presumably returns None on error — confirm).
    """
    first = row[0]
    return first['text']

@try_or_none
def get_href(row):
    """Return the 'href' field of the first item in `row`.

    NOTE(review): failure handling is delegated to mdcalc.try_or_none
    (presumably returns None on error — confirm).
    """
    first = row[0]
    return first['href']

@try_or_none
def get_year_from_str(s: str):
    """Return the first year between 1900 and 2023 found in `s`, as an int.

    The year must be a 4-digit run that is not adjacent to another digit.
    Returns None when no such year is present.
    """
    found = re.search(r'(?<!\d)(19\d{2}|20[01]\d|202[0-3])(?!\d)', s)
    return int(found.group()) if found else None

In [None]:
# clean list-valued columns
# NOTE(review): clean_list_valued_strings is defined in the project-local mdcalc
# module; presumably it normalises columns whose cells are lists — confirm there.
df = mdcalc.clean_list_valued_strings(df)

# add feature_names
def get_feature_names_list(schema):
    """Collect the 'label_en' of every entry in an input schema.

    Returns one name per entry when `schema` is a list, using "unknown" for
    entries that have no 'label_en'. Returns an empty list for any non-list input.
    """
    if not isinstance(schema, list):
        return []
    names = []
    for field in schema:
        if "label_en" in field:
            names.append(field["label_en"])
        else:
            names.append("unknown")
    return names


# Feature names come from the input schema; the raw reference list needs the whole row.
df["feature_names"] = df["input_schema"].apply(get_feature_names_list)
df["refs"] = df.apply(get_refs, axis=1)

# Each remaining column is derived from a column computed before it, so order matters.
for target, source, extractor in (
    ('ref_original', 'refs', get_ref_original),
    ('ref_href', 'ref_original', get_href),
    ('ref_text', 'ref_original', get_text),
    ('ref_year', 'ref_text', get_year_from_str),
):
    df[target] = df[source].apply(extractor)

# Persist the cleaned table in both pickle and CSV form.
df.to_pickle("../data/cdis_with_schemas_cleaned.pkl")
df.to_csv("../data/cdis_with_schemas_cleaned.csv")

In [None]:
# Preview the exported columns with pandas' display options overridden, then
# write those same columns to main.csv.
cols = ['id', 'full_title_en', 'short_description_en', 'ref_text', 'ref_href', 'ref_year']
preview_options = ('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 0)
with pd.option_context(*preview_options):
    display(df[cols].head(5))
    # display(df[df['ref_year'].min() == df['ref_year']][['ref_text', 'ref_href']])
df[cols].to_csv('../data/main.csv', index=False)