In [9]:
import ast
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

Last scraped on June 23, 2022. One CDI ("refractory-epilepsy") could not be scraped, so the resulting dataset covers 644 of the 645 indexed CDIs.

# read index

In [10]:
# Fetch of the MDCalc index page, saved to ../data/index.html (the file the
# next cell reads). Presumably left commented out so that re-running the
# notebook does not request the page again.
# url='https://www.mdcalc.com/#all'
# req=requests.get(url)
# content=req.text
#
# with open('../data/index.html', 'w') as f:
#     f.write(content)

In [25]:
# Read the locally saved index page; the context manager closes the file
# handle instead of leaving it to the garbage collector.
with open('../data/index.html', 'r') as f:
    index = f.read()
# NOTE(review): no parser is named, so bs4 uses whichever parser it finds
# installed -- consider pinning one for reproducibility.
soup = BeautifulSoup(index)
# tab = pd.read_table('../data/index.html')

# scripts = soup.find_all('script')
# with open('../data/preloaded_state.json', 'w') as f:
#     f.write(scripts[0].text)
# manually delete first line with state = ...
# only keep "calcs"
# delete "newest calcs" from bottom

Clean the JSON and convert it to a pandas DataFrame.

In [102]:
# Load the manually trimmed index JSON, then let pandas infer tighter dtypes
# for any columns that were read as generic objects.
index_raw = pd.read_json('../data/preloaded_state.json')
df = index_raw.infer_objects()


# # json.load(open('data/preloaded_state.json', 'r'))
def clean_element(x):
    """Normalize a single cell of the index table.

    - A list holding exactly one item is replaced by that item (returned as-is).
    - A string has every '<span>' and '</span>' tag removed.
    - Anything else (including lists of other lengths) is returned unchanged.
    """
    is_singleton_list = isinstance(x, list) and len(x) == 1
    if is_singleton_list:
        return x[0]
    if not isinstance(x, str):
        return x
    cleaned = x
    # Strip the opening tag first, then the closing tag.
    for tag in ('<span>', '</span>'):
        cleaned = cleaned.replace(tag, '')
    return cleaned


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
# Use whichever the installed pandas provides; the check is done on the class
# so that a column that happens to be called 'map' cannot interfere.
_elementwise = pd.DataFrame.map if hasattr(pd.DataFrame, 'map') else pd.DataFrame.applymap
df = _elementwise(df, clean_element)
# Persist the cleaned index; list-valued cells are written as their string form.
df.to_csv('../data/index.csv', index=False)

In [2]:
# Reload the cleaned index and build the full page URL for every calculator.
df = pd.read_csv('../data/index.csv')
calc_ids = df['favorite_id'].astype(str)
df['url_full'] = 'https://www.mdcalc.com/calc/' + calc_ids + '/' + df['url']
print(df.shape)
# Exclude the entry whose url starts with 'refractory-epilepsy'.
is_refractory_epilepsy = df['url'].str.startswith('refractory-epilepsy')
df = df[~is_refractory_epilepsy]
print('after dropping refractory-epilepsy', df.shape)

(645, 17)
after dropping refractory-epilepsy (644, 17)


In [129]:
# note: this actually hits a bunch of web-pages -- don't run multiple times!
# Pages already present on disk are skipped, so only missing pages are requested.
os.makedirs('../data/pages', exist_ok=True)
for url, url_full in tqdm(zip(df['url'], df['url_full']), total=df.shape[0]):
    output_fname = f'../data/pages/{url}.html'

    if not os.path.exists(output_fname):
        # A timeout keeps one stalled connection from hanging the whole loop.
        req = requests.get(url_full, timeout=30)
        # Fail loudly on an HTTP error: otherwise the error page would be
        # written to disk and, because the file then exists, never re-fetched.
        req.raise_for_status()
        content = req.text
        with open(output_fname, 'w') as f:
            f.write(content)
        print(url, url_full)
# Every row of df should now have exactly one cached page (and nothing else
# should live in the directory).
assert len(os.listdir('../data/pages')) == df.shape[0], \
    'number of cached pages does not match number of rows in df'

100%|██████████| 644/644 [00:00<00:00, 78892.80it/s]


## parse individual pages

In [173]:
# Extract the 'calc' object embedded in each cached page and store it as JSON.
os.makedirs('../data/processed', exist_ok=True)
for url in tqdm(df['url']):
    fname = '../data/pages/' + url + '.html'
    # Context manager so the page's file handle is closed right after reading.
    with open(fname, 'r') as f:
        html = f.read()
    # NOTE(review): no parser is named, so bs4 uses whichever parser it finds installed.
    soup = BeautifulSoup(html)
    # The first <script> tag is taken as the data carrier; the JSON payload is
    # read from the first '{' onward (any leading text before it is skipped).
    data = soup.find_all('script')[0]
    script_text = data.text
    s = json.loads(script_text[script_text.index('{'):])['calc']
    with open(f'../data/processed/{url}.json', 'w') as f:
        json.dump(s, f)

100%|██████████| 644/644 [00:16<00:00, 40.17it/s]


# merge into big df

In [3]:
# Load every processed JSON file, in the row order of df, into one record list.
# Each file is opened in a `with` block so its handle is closed immediately
# (the original comprehension left all of them open).
cdi_records = []
for url in tqdm(df['url']):
    with open(f'../data/processed/{url}.json', 'r') as f:
        cdi_records.append(json.load(f))
# One row per record; the resulting index is 0..n-1, following df's row order.
cdis = pd.DataFrame.from_dict(cdi_records)
cdis['num_rules'] = cdis['input_schema'].apply(len)  # num rules = number of input_schema entries

100%|██████████| 644/644 [00:00<00:00, 3482.89it/s]


In [8]:
# `cdis` was built by walking df positionally, so it is indexed 0..n-1, whereas
# df still carries its original labels (with a gap where a row was filtered out).
# DataFrame.join aligns on index labels, so joining as-is pairs every row after
# the gap with the wrong record and leaves the last row without a match.
# Resetting df's index makes the labels agree with the positions.
assert len(df) == len(cdis), 'df and cdis must have one row per CDI'
df_merged = df.reset_index(drop=True).join(cdis, rsuffix='_duplicate')  # mark duplicate cols
df_merged = df_merged.drop(columns=[k for k in df_merged.columns if k.endswith('_duplicate')])  # drop the duplicates
# df_merged.to_csv('data/cdis_with_schemas.csv')
# df_merged.to_pickle('data/cdis_with_schemas.pkl')

Clean list-valued columns

In [10]:
# Start from the merged table previously saved to disk.
df_merged = pd.read_pickle('../data/cdis_with_schemas.pkl')

# Columns whose values should be real Python lists rather than strings that
# merely look like lists.
LIST_VALUED_COLS = [
    'disease_en',
    'system_en',
    'purpose_en',
    'chief_complaint_en',
    'specialty_en',
    'expert_name',
    'creators',
    'search_abbreviation_en',
]


def clean_list_valued_string(s: str):
    """Turn a stringified list (e.g. "['a', 'b']") into a real list of strings.

    Parameters
    ----------
    s : str (or list / None / NaN)
        The cell value. A value that is already a list is returned unchanged,
        which keeps the function idempotent when it is applied to data that
        was cleaned on a previous run.

    Returns
    -------
    list
        - ``[]`` for empty / missing values, and for the string ``"[]"``.
        - The parsed elements for a bracketed string. Parsing is done with
          ``ast.literal_eval`` so elements containing commas or apostrophes
          survive intact; non-string elements are converted with ``str``.
        - ``[s]`` for any other non-empty string.
    """
    if isinstance(s, list):
        return s
    if not s or pd.isna(s):
        return []
    if s.startswith('[') and s.endswith(']'):
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError):
            # Not a valid Python literal (e.g. unquoted items): fall back below.
            parsed = None
        if isinstance(parsed, list):
            return [item if isinstance(item, str) else str(item) for item in parsed]
        # Fallback: naive split after stripping quote characters.
        return s[1:-1].replace("'", "").replace('"', '').split(', ')
    return [s]


# Convert each list-valued column in turn and verify the result before saving.
for col in LIST_VALUED_COLS:
    cleaned_col = df_merged[col].apply(clean_list_valued_string)
    df_merged[col] = cleaned_col
    every_value_is_list = np.all(df_merged[col].apply(lambda value: isinstance(value, list)))
    assert every_value_is_list, 'all values are lists'
# Write the cleaned table back in both formats.
df_merged.to_pickle('../data/cdis_with_schemas.pkl')
df_merged.to_csv('../data/cdis_with_schemas.csv')

Add feature names

In [12]:
# Reload the merged table from the pickle written by the list-cleaning cell.
df_merged = pd.read_pickle('../data/cdis_with_schemas.pkl')


def get_feature_names_list(schema):
    """Collect the 'label_en' of every entry in an input schema.

    Returns one name per schema entry, using 'unknown' for entries that have
    no 'label_en' key. Anything that is not a list (e.g. a missing value)
    yields an empty list.
    """
    if not isinstance(schema, list):
        return []
    names = []
    for field in schema:
        if 'label_en' in field:
            names.append(field['label_en'])
        else:
            names.append('unknown')
    return names


# Derive the per-CDI list of feature names from each row's input schema.
input_schemas = df_merged['input_schema']
df_merged['feature_names'] = input_schemas.apply(get_feature_names_list)

# df_merged.to_pickle('data/cdis_with_schemas.pkl')
# df_merged.to_csv('data/cdis_with_schemas.csv')

In [13]:
# Preview the extracted feature names (pandas truncates the display).
df_merged['feature_names']

0      [<p>Alertness</p>, <p><calculator id="3995">AM...
1      [Age, Sex, Chronic respiratory disease, Heart ...
2      [Age, years , Sex at birth, Number of comorbid...
3      [Thrombocytopenia, Timing of platelet count fa...
4            [Sex, Height, Age, Weight, Distance walked]
                             ...                        
640                                        [Bicarbonate]
641    [unknown, Any loose or watery stools, Any vomi...
642                              [<p>Classification</p>]
643    [Pregnant patient, unknown, Clinical signs of ...
644                                                   []
Name: feature_names, Length: 644, dtype: object