In [168]:
%load_ext autoreload
%autoreload 2
import re
import numpy as np
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import mdcalc
from mdcalc import try_or_none

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Last scraped on Jun 13, 2023. The final output of this notebook is `../data/cdis_with_schemas_cleaned.pkl` (the intermediate output is `../data/cdis_with_schemas.pkl`). It also writes `../data/main.csv`.

### Read index

In [169]:
# read index
# url='https://www.mdcalc.com/#All'
# req=requests.get(url)
# content=req.text
# with open('../data/index.html', 'w') as f:
#     f.write(content)

# extract out df
# Read the cached index page; the context manager closes the file handle right away.
with open('../data/index.html', 'r') as f:
    index = f.read()
soup = BeautifulSoup(index)
scripts = soup.find_all('script')
# The text of the last <script> tag is parsed as JSON; the calculator list
# lives under props -> pageProps -> allCalcs.
d = json.loads(scripts[-1].text)
all_calcs_list = d['props']['pageProps']['allCalcs']
df = pd.DataFrame.from_dict(all_calcs_list)

# clean df
def clean_element(x):
    """Normalize a single cell value of the index table.

    - A list holding exactly one item is replaced by that item (returned as is).
    - A string has every '<span>' and '</span>' tag removed.
    - Anything else (including lists of any other length) is returned unchanged.
    """
    if isinstance(x, list) and len(x) == 1:
        return x[0]
    if not isinstance(x, str):
        return x
    cleaned = x
    for tag in ('<span>', '</span>'):
        cleaned = cleaned.replace(tag, '')
    return cleaned
# Clean every cell, then build each calculator's page URL from its id and slug.
df = df.applymap(clean_element)
df = df.assign(
    url_full=lambda frame: 'https://www.mdcalc.com/calc/' + frame['id'].astype(str) + '/' + frame['slug']
)

### Read individual pages
Note: this cell performs the actual scraping and saves one `.html` file per calculator. Pages that are already cached on disk are skipped, so avoid re-running it unless the cached files are present.

In [None]:
# Download every calculator page that is not cached yet.
# Make sure the cache directory exists before writing into it.
os.makedirs('../data/pages', exist_ok=True)
for i in tqdm(range(df.shape[0])):
    slug = df['slug'].iloc[i]
    url_full = df['url_full'].iloc[i]
    output_fname = f'../data/pages/{slug}.html'

    # Cached pages are never re-downloaded.
    if os.path.exists(output_fname):
        continue

    # Without a timeout a single stalled connection would hang the whole scrape;
    # on a timeout the cell stops and can simply be re-run (finished pages are cached).
    req = requests.get(url_full, timeout=30)
    content = req.text
    with open(output_fname, 'w') as f:
        f.write(content)
    print(slug, url_full)
# Sanity check: exactly one cached file per row of the index table.
assert len(os.listdir('../data/pages')) == df.shape[0]

### Parse individual pages
`.html` -> `.json`

In [None]:
# Convert each cached .html page into a .json file holding its 'calc' payload.
errors = []  # slugs whose page could not be parsed
os.makedirs('../data/processed', exist_ok=True)
for i in tqdm(range(df.shape[0])):
    slug = df.iloc[i].slug
    fname = '../data/pages/' + slug + '.html'
    fname_json = '../data/processed/' + slug + '.json'
    # Already-processed pages are skipped.
    if os.path.exists(fname_json):
        continue
    with open(fname, 'r') as f:
        html = f.read()
    soup = BeautifulSoup(html)
    try:
        # The third <script> tag is parsed as JSON starting at its first '{'.
        data = soup.find_all('script')[2]
        s = json.loads(data.text[data.text.index('{'):])['calc']
    except Exception:
        # Best-effort parsing: record the failure and move on. Catching Exception
        # (rather than a bare except) lets Ctrl-C still interrupt the loop.
        # print('failed', slug)
        errors.append(slug)
        continue
    # Written outside the try so that I/O problems surface instead of being
    # counted as parse errors.
    with open(fname_json, 'w') as f:
        json.dump(s, f)
print('num errors', len(errors))
# Drop the calculators whose page failed to parse.
df = df[~df.slug.isin(errors)]
print(df.shape)

### Merge metadata from index and individual pages

In [170]:
# load cdis
cdis = []  # parsed json records, one per successfully loaded file
idxs = []  # positions in df for which the json file was present and successfully loaded
for i in tqdm(range(df.shape[0])):
    slug = df.iloc[i].slug
    try:
        with open(f"../data/processed/{slug}.json", "r") as f:
            record = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing/unreadable; ValueError: invalid JSON
        # (json.JSONDecodeError is a ValueError subclass). Report and skip.
        print(i, slug)
        continue
    # Only successful loads are kept, so cdis and idxs always stay aligned.
    cdis.append(record)
    idxs.append(i)
cdis = pd.DataFrame.from_dict(cdis)
# print("shapes", df.shape, cdis.shape, df.iloc[idxs].shape)
# cdis['num_rules'] = cdis['input_schema'].apply(len)  # num rules (this includes some messiness that isn't actually a rule)

  0%|          | 0/692 [00:00<?, ?it/s]

282 history-electrocardiogram-manchester-acute-coronary-syndromes-macs


100%|██████████| 692/692 [00:00<00:00, 2095.97it/s]

392 mulbsta-score-viral-pneumonia-mortality





In [171]:
# merge with df
# Join the successfully loaded index rows with the per-page records by position.
# Columns present on both sides get a "_duplicate" suffix on the cdis side and are
# then dropped; input_schema is finally (re)assigned from cdis.
df_merged = (
    df.iloc[idxs]
    .reset_index()
    .join(cdis, rsuffix="_duplicate")
    .pipe(
        lambda joined: joined.drop(
            columns=[name for name in joined.columns if name.endswith("_duplicate")]
        )
    )
    .assign(input_schema=cdis["input_schema"].values)
)

df_merged.to_pickle("../data/cdis_with_schemas.pkl")
df_merged.to_csv("../data/cdis_with_schemas.csv")

# Clean the cdis

In [173]:
# Load the intermediate pickle and order the rows by the `id` column.
df = pd.read_pickle("../data/cdis_with_schemas.pkl").sort_values(by='id')

In [174]:
@try_or_none
def get_refs(row):
    """Look up row["content"]["about"]["references_list"].

    NOTE(review): the try_or_none decorator comes from the project-local mdcalc
    module; presumably it returns None when the lookup raises -- confirm there.
    """
    about = row["content"]["about"]
    return about["references_list"]

@try_or_none
def get_ref_original(row):
    """Look up the 'Original/Primary Reference' entry of `row`.

    NOTE(review): the try_or_none decorator comes from the project-local mdcalc
    module; presumably it returns None when the lookup raises -- confirm there.
    """
    key = 'Original/Primary Reference'
    return row[key]

@try_or_none
def get_text(row):
    """Return the 'text' field of the first element of `row`.

    NOTE(review): the try_or_none decorator comes from the project-local mdcalc
    module; presumably it returns None when the lookup raises -- confirm there.
    """
    first_entry = row[0]
    return first_entry['text']

@try_or_none
def get_href(row):
    """Return the 'href' field of the first element of `row`.

    NOTE(review): the try_or_none decorator comes from the project-local mdcalc
    module; presumably it returns None when the lookup raises -- confirm there.
    """
    first_entry = row[0]
    return first_entry['href']

@try_or_none
def get_year_from_str(s: str):
    """Return the first standalone 4-digit year between 1900 and 2023 found in `s`.

    "Standalone" means the four digits are neither preceded nor followed by
    another digit. Returns the year as an int, or None when nothing matches.

    NOTE(review): the try_or_none decorator comes from the project-local mdcalc
    module; presumably it returns None when the body raises -- confirm there.
    """
    year_pattern = r'(?<!\d)(19\d{2}|20[01]\d|202[0-3])(?!\d)'
    found = re.search(year_pattern, s)
    return int(found.group()) if found else None

In [175]:
# clean list-valued columns
# NOTE(review): clean_list_valued_strings lives in the project-local mdcalc module and
# is not visible here; presumably it normalizes list-valued string columns -- confirm there.
df = mdcalc.clean_list_valued_strings(df)

# add feature_names
def get_feature_names_list(schema):
    """Collect the "label_en" value of every entry in an input schema.

    Entries that have no "label_en" key contribute the placeholder "unknown".
    Anything that is not a list yields an empty list.
    """
    if not isinstance(schema, list):
        return []
    names = []
    for field in schema:
        if "label_en" in field:
            names.append(field["label_en"])
        else:
            names.append("unknown")
    return names


# Derive the feature-name and reference columns. assign() evaluates its callables in
# order, so each one can read the columns created by the ones before it.
df = df.assign(
    feature_names=lambda frame: frame["input_schema"].apply(get_feature_names_list),
    refs=lambda frame: frame.apply(get_refs, axis=1),
    ref_original=lambda frame: frame['refs'].apply(get_ref_original),
    ref_href=lambda frame: frame['ref_original'].apply(get_href),
    ref_text=lambda frame: frame['ref_original'].apply(get_text),
    ref_year=lambda frame: frame['ref_text'].apply(get_year_from_str),
)

# Persist the cleaned table in both formats.
df.to_pickle("../data/cdis_with_schemas_cleaned.pkl")
df.to_csv("../data/cdis_with_schemas_cleaned.csv")

In [42]:
# display all columns with no max_width
cols = ['id', 'full_title_en', 'short_description_en', 'ref_text', 'ref_href', 'ref_year']
# None is the documented "unlimited" value for each of these display options.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
    display(df[cols].head(5))
    # display(df[df['ref_year'].min() == df['ref_year']][['ref_text', 'ref_href']])
# Export the selected columns without the index.
df[cols].to_csv('../data/main.csv', index=False)

Unnamed: 0,id,full_title_en,short_description_en,ref_text,ref_href,ref_year
46,19,Absolute Neutrophil Count (ANC),Neutropenia (after chemotherapy).,"Al-Gwaiz LA, Babay HH. The diagnostic value of absolute neutrophil count, band count and morphologic changes of neutrophils in predicting bacterial infections. Med Princ Pract. 2007;16(5):344–7. doi:10.1159/000104806.",https://www.ncbi.nlm.nih.gov/pubmed/17709921,2007.0
25,23,APGAR Score,Assesses neonates 1 & 5 mins postpartum.,Apgar V. A proposal for a new method of evaluation of the newborn infant. Curr. Res. Anesth. Analg. 1953;32(4): 260–267. doi:10.1213/00000539-195301000-00041. PMID 13083014.,http://www.ncbi.nlm.nih.gov/pubmed/13083014,1953.0
85,25,Basal Energy Expenditure,Estimates minimum caloric requirements.,"Harris J, Benedict F. A biometric study of basal metabolism in man. Washington D.C. Carnegie Institute of Washington. 1919.",,1919.0
92,27,Bicarbonate Deficit,Calculates total body bicarb deficit.,Kurtz I. Acid-Base Case Studies. 2nd Ed. Trafford Publishing (2004); 68:150.,,2004.0
98,29,Body Mass Index (BMI) and Body Surface Area (BSA),"Categorizes obesity, assists some med dosing.","Gadzik J. 'How much should I weigh?' Quetelet's equation, upper weight limits, and BMI prime.Connecticut Medicine. (2006). 70 (2): 81–8. PMID 16768059.",http://www.ncbi.nlm.nih.gov/pubmed/16768059,2006.0
