In [111]:
%load_ext autoreload
%autoreload 2

import genanki
import webscrape as ws
import anki_utils as au
import numpy as np

import constants
import dill as pickle

In [6]:
import requests
import bs4
from bs4 import BeautifulSoup

from tqdm.notebook import tqdm

In [78]:
def exclude_items(items, exclude_items):
    """Return the entries of ``items`` that contain none of ``exclude_items``.

    Containment is tested with the ``in`` operator, so for string entries this
    is a substring match (``"Summaries"`` also rejects ``"Trial Summaries"``).

    Parameters
    ----------
    items : iterable
        Candidate entries, kept in their original order.
    exclude_items : iterable
        Values whose presence inside an entry causes that entry to be dropped.

    Returns
    -------
    list
        The surviving entries.
    """
    kept = []
    for candidate in items:
        for unwanted in exclude_items:
            if unwanted in candidate:
                # One hit is enough to reject this candidate.
                break
        else:
            # Inner loop finished without a hit: nothing unwanted was found.
            kept.append(candidate)
    return kept

In [80]:
# Quick check of exclude_items.
# NOTE(review): `tags` is not assigned in any cell visible above this one, so
# this call depends on leftover kernel state and fails on a fresh kernel.
exclude_items(tags, ["Summaries"])

['Emergency Medicine', 'Intensive Care Medicine']

In [32]:
# URL template for one page of the summaries listing; `{page_number}` is filled
# in with str.format by get_summary_urls.
SUMMARIES_LIST_GENERAL_URL = "https://www.thebottomline.org.uk/category/summaries/page/{page_number}/"

In [346]:
def get_summary_urls(summaries_list_general_url, max_pages=49, timeout=30):
    """Collect the link of every summary found on the paginated listing.

    Pages are requested in order starting at page 1. Crawling stops at the
    first response whose status code is not 200 or once ``max_pages`` pages
    have been requested, whichever comes first.

    Parameters
    ----------
    summaries_list_general_url : str
        URL template containing a ``{page_number}`` placeholder.
    max_pages : int, optional
        Highest page number to request. The default of 49 matches the bound
        that used to be hard-coded.
    timeout : float, optional
        Seconds to wait for each HTTP response, so that a stalled connection
        raises instead of blocking the notebook indefinitely.

    Returns
    -------
    list of str
        The ``href`` of the link inside each ``h2.entry-title.post-title``
        heading, in page order.

    Raises
    ------
    requests.exceptions.RequestException
        If a request fails at the transport level or times out.
    """
    all_summaries = []
    for page_index in tqdm(range(1, max_pages + 1)):
        summary_list_page_url = summaries_list_general_url.format(page_number=page_index)
        response = requests.get(summary_list_page_url, timeout=timeout)
        if response.status_code != 200:
            # Any non-200 answer is treated as the end of the listing
            # (presumably the site answers 404 past the last page).
            break
        page_soup = BeautifulSoup(response.text, 'html.parser')
        for heading in page_soup.find_all("h2", class_="entry-title post-title"):
            link = heading.find("a")
            # Skip headings without a usable link rather than crashing the crawl.
            href = link.get("href") if link is not None else None
            if href:
                all_summaries.append(href)

    return all_summaries

In [347]:
# Crawl the listing pages and gather every summary link (issues HTTP requests).
summary_urls = get_summary_urls(SUMMARIES_LIST_GENERAL_URL)

  0%|          | 0/49 [00:00<?, ?it/s]

In [352]:
class Trial:
    """One summary page, downloaded and parsed when the object is created."""

    def __init__(self, url, timeout=30):
        """Fetch ``url`` and parse the response body as HTML.

        Parameters
        ----------
        url : str
            Address of the summary page.
        timeout : float, optional
            Seconds to wait for the HTTP response. Without a timeout a single
            stalled connection would hang a loop that builds many ``Trial``
            objects.

        Raises
        ------
        requests.exceptions.RequestException
            If the request fails at the transport level or times out.
        """
        # url: the address this object was built from.
        self.url = url
        response = requests.get(url, timeout=timeout)
        # soup: BeautifulSoup tree of the response body. The status code is not
        # checked, so an error page is parsed like any other page.
        self.soup = BeautifulSoup(response.text, 'html.parser')

In [353]:
# Build one Trial per summary URL; each construction performs an HTTP request.
trials = [Trial(summary_url) for summary_url in tqdm(summary_urls)]

  0%|          | 0/451 [00:00<?, ?it/s]

In [355]:
# Keep the first trial as a sample object for the exploratory cells below.
trial = trials[0]

In [358]:
def get_tags(summary_soup):
    """Return the category tags of a summary page, without "Summaries".

    Parameters
    ----------
    summary_soup : bs4.BeautifulSoup
        Parsed summary page.

    Returns
    -------
    list
        First child of every ``<a>`` inside the ``meta-category`` element,
        filtered through ``exclude_items(..., ["Summaries"])``. Empty when the
        page has no ``meta-category`` element.
    """
    category_block = summary_soup.find(class_="meta-category")
    if category_block is None:
        # Page has no category section: report "no tags" instead of raising
        # AttributeError on None.
        return []

    tags = [
        tag_item.contents[0]
        for tag_item in category_block.find_all("a")
        if tag_item.contents  # an empty <a></a> has nothing to read
    ]
    return exclude_items(tags, ["Summaries"])

# Try get_tags on the sample trial.
get_tags(trial.soup)

['Emergency Medicine', 'Intensive Care Medicine']

In [369]:
def get_trial_year(summary_soup):
    """Extract the publication year from the paragraph that cites the DOI.

    Each ``<p>`` is lower-cased and checked for the substring ``"doi"``. In
    the text before the first ``"doi"``, the first standalone four-digit
    number starting with 19 or 20 is taken as the year. The previous version
    concatenated every digit before the DOI, so a citation such as
    ``"2019;321(20):1993-2002"`` yielded one huge integer instead of 2019.

    If a paragraph mentions "doi" but has no year before it, later paragraphs
    are still examined.

    Parameters
    ----------
    summary_soup : bs4.BeautifulSoup
        Parsed summary page; only ``find_all("p")`` and each element's
        ``.text`` are used.

    Returns
    -------
    int or None
        The year, or ``None`` when no paragraph yields one.
    """
    import re  # function-scope import: the notebook's import cells do not load `re`

    # 19xx / 20xx not embedded in a longer run of digits.
    year_pattern = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")

    for part in summary_soup.find_all("p"):
        line = part.text.lower()
        if "doi" not in line:
            continue
        # Only the text before the DOI is searched; the DOI itself is full of digits.
        match = year_pattern.search(line.split("doi")[0])
        if match:
            return int(match.group())

    return None

In [370]:
# Smoke test: run get_trial_year over every trial to check that it does not raise.
# The return values are discarded, and `trial` is left bound to the last
# element of `trials` once the loop finishes.
for trial in tqdm(trials):
    get_trial_year(trial.soup)

  0%|          | 0/451 [00:00<?, ?it/s]

In [None]:
# Inspect the first <h3> element of the current trial page.
trial.soup.find_all("h3")[0]

In [381]:
# Inspect the first <span> nested inside that first <h3>.
trial.soup.find_all("h3")[0].span

<span style="font-size: large; font-weight: bold;">Clinical Question</span>

In [92]:
# Collect the <h1> element of every summary page.
# The pages are already parsed in `trials`, so they are reused here instead of
# being downloaded a second time. The earlier version looped over `summaries`,
# a name that no visible cell defines, and so failed on a fresh kernel.
titles = [trial.soup.find("h1") for trial in tqdm(trials)]

  0%|          | 0/451 [00:00<?, ?it/s]

In [290]:
# Titles that get_trial_name returns unchanged even though no heuristic matches them.
CUSTOM_TRIAL_NAMES = ["CandiSep", "PreVent", "TracMan", "PlasmAr", "Pandemic Triage", "Reconnect"]

# Full page titles mapped to the short name get_trial_name returns for them.
CUSTOM_TRIAL_NAME_MAP = {
    "Acetazolamide in Acute Decompensated Heart Failure with Volume Overload": "ADVOR",
    "Platelet Transfusion before CVC Placement in Patients with Thrombocytopenia": "PACER",
    "Emergency Department Resuscitative Endovascular Balloon Occlusion of the Aorta in Trauma Patients With Exsanguinating Hemorrhage": "UK-REBOA",
    "Aggressive or Moderate Fluid Resuscitation in Acute Pancreatitis": "WATERFALL",
    "Early and Empirical High-Dose Cryoprecipitate for Hemorrhage After Traumatic Injury": "CRYOSTAT-2"
}

def get_trial_name(title):
    """Derive a short trial name from a page title, or ``False`` if none fits.

    Rules are tried in this order; the first match wins:

    1. An empty title yields ``False``.
    2. Curated overrides: a key of ``CUSTOM_TRIAL_NAME_MAP`` returns its mapped
       name; a member of ``CUSTOM_TRIAL_NAMES`` is returned unchanged. They are
       checked first so that a heuristic can never shadow an explicit entry.
    3. At least 30% upper-case characters and more than 3 characters: the
       title itself.
    4. Contains "trial" / "Trial": the text before " trial" / " Trial".
    5. Contains "(": the title itself.
    6. At least 3 upper-case characters and fewer than 60 characters: the
       title itself.
    7. Contains ":" or "–": the text before the first one.
    8. Contains a digit, or is longer than 60 characters: the title itself.

    Names produced by splitting (rules 4 and 7) have surrounding whitespace
    removed, so "DanGer Shock – ..." gives "DanGer Shock" rather than
    "DanGer Shock ".

    Parameters
    ----------
    title : str
        Page title text.

    Returns
    -------
    str or bool
        The derived name, or ``False`` when no rule applies.
    """
    # An empty title would otherwise divide by zero when computing the ratio.
    if not title:
        return False

    if title in CUSTOM_TRIAL_NAME_MAP:
        return CUSTOM_TRIAL_NAME_MAP[title]

    if title in CUSTOM_TRIAL_NAMES:
        return title

    count = sum(c.isupper() for c in title)
    length = len(title)
    mean = count / length  # share of upper-case characters

    if mean >= 0.3 and length > 3:
        return title

    if "trial" in title:
        return title.split(" trial")[0].strip()

    if "Trial" in title:
        return title.split(" Trial")[0].strip()

    if "(" in title:
        return title

    if count >= 3 and length < 60:
        return title

    if ":" in title:
        return title.split(":")[0].strip()

    if "–" in title:
        return title.split("–")[0].strip()

    if any(c.isdigit() for c in title):
        return title

    if length > 60:
        return title

    return False

In [284]:
# Split the scraped titles into those get_trial_name can resolve and those it
# cannot. Failed titles are stored as "<upper-case count> <length> <title>" so
# that sorting them groups titles with similar statistics together.
clean_titles = []
failed_titles = []

for title in titles:
    title_text = title.text
    title_clean = get_trial_name(title_text)
    if title_clean:
        clean_titles.append(title_clean)
        continue

    count = sum(c.isupper() for c in title_text)
    failed_titles.append(f"{count} {len(title_text)} " + title_text)

    # Diagnostic: report any curated title that still ends up unresolved.
    if title_text in CUSTOM_TRIAL_NAME_MAP:
        print(title_text, count, len(title_text))

In [286]:
# Show the unresolved titles in descending string order of their
# "<upper-case count> <length> <title>" labels.
sorted(failed_titles)[::-1]

['2 8 McCardle',
 '2 8 El Adawi',
 '2 8 De Jonge',
 '2 7 Xian Su',
 '2 7 McCourt',
 '2 3 SoM',
 '2 17 Kleine-Brueggeney',
 '2 16 Garrouste-Orgeas',
 '2 14 Kentish-Barnes',
 '2 14 Erythro-Emerge',
 '2 13 Lars Anderson',
 '1 9 Nishikimi',
 '1 9 Hernández',
 '1 9 Determann',
 '1 9 Choudhury',
 '1 8 Weingart',
 '1 8 Kacmarek',
 '1 8 Hagihara',
 '1 8 Franklin',
 '1 8 Bradford',
 '1 8 Boulware',
 '1 8 Bouderka',
 '1 8 Anderson',
 '1 7 Zochios',
 '1 7 Ventura',
 '1 7 Skrobik',
 '1 7 Sjöblom',
 '1 7 Seymour',
 '1 7 Sessler',
 '1 7 Sellick',
 '1 7 Pivetta',
 '1 7 Philpot',
 '1 7 Pascall',
 '1 7 Niemann',
 '1 7 Morelli',
 '1 7 Meacock',
 '1 7 Maltese',
 '1 7 Loubani',
 '1 7 Heyland',
 '1 7 Ferrari',
 '1 7 Demoule',
 '1 7 Bickell',
 '1 7 Bellomo',
 '1 6 Yeatts',
 '1 6 Woolum',
 '1 6 Vollam',
 '1 6 Uranga',
 '1 6 Torres',
 '1 6 Suzuki',
 '1 6 Subira',
 '1 6 Stockl',
 '1 6 Simons',
 '1 6 Sadfar',
 '1 6 Rivers',
 '1 6 Riskin',
 '1 6 Morris',
 '1 6 Monnet',
 '1 6 Meduri',
 '1 6 Meduri',
 '1 6 Mahler'

In [289]:
# Number of titles for which get_trial_name produced a name.
len(clean_titles)

348

In [93]:
# Display the scraped <h1> elements.
# NOTE(review): this prints the entire list; a slice such as titles[:10] would
# keep the saved output small.
titles

[<h1 class="entry-title post-title">DanGer Shock – Microaxial Flow Pump in Infarct-Related Cardiogenic Shock</h1>,
 <h1 class="entry-title post-title">NICO – Noninvasive Airway Management of Comatose Patients with Acute Poisoning</h1>,
 <h1 class="entry-title post-title">Validation of the MIRACLE 2 Score for Prognostication After OOHCA</h1>,
 <h1 class="entry-title post-title">ARiE Trial – Rifaximin for encephalopathy</h1>,
 <h1 class="entry-title post-title">PROPHY-VAP: Ceftriaxone to prevent early ventilator-associated pneumonia</h1>,
 <h1 class="entry-title post-title">EARLYDRAIN – Lumbar Cerebrospinal Fluid Drain Among Patients With aSAH</h1>,
 <h1 class="entry-title post-title">ACORN</h1>,
 <h1 class="entry-title post-title">STRESS-L: Landiolol and Organ Failure in Patients With Septic Shock</h1>,
 <h1 class="entry-title post-title">Emergency Department Resuscitative Endovascular Balloon Occlusion of the Aorta in Trauma Patients With Exsanguinating Hemorrhage</h1>,
 <h1 class="ent

In [1]:
# NOTE(review): this cell looks like the body of a method pasted at top level.
# It uses `self` outside any class, and `Section` is not defined in any visible
# cell. Its recorded output is a NameError, so it cannot run as written —
# consider deleting it or moving it back into its class.
response = requests.get(self.url)
self.page_soup = BeautifulSoup(response.text, 'html.parser')
self.sections = [Section(section) for section in self.page_soup.find_all("span", class_="mw-headline")]
self.section_titles, new_sections = [], []

NameError: name 'requests' is not defined