In [1]:
%load_ext autoreload
%autoreload 2

import genanki
import webscrape as ws
import anki_utils as au

import constants
import dill as pickle

In [2]:
import requests
import bs4
from bs4 import BeautifulSoup

from tqdm.notebook import tqdm

In [3]:
WIKI_JOURNAL_TRIAL_LIST_URL = "https://www.wikijournalclub.org/wiki/WikiJournalClub:Usable_articles"

In [4]:
TAG_EXCLUDE_TOKENS = ["articles_using", "pages_with", "usable"]

def process_section_html_text(html_section):
    """"""
    return "".join(str(c) for c in html_section.contents)

class Section:
    def __init__(self, section_header):
        self.header = section_header
        self.info = section_header.next_element.next_element.next_element
        self.title = section_header.next_element
    
    def __repr__(self):
        return self.title.text.strip() + "\n" + self.info.text


class Trial:
    def __init__(self, link):
        self.url = "https://www.wikijournalclub.org" + link.attrs["href"]
        self.soup_link = link
        self.title, self.short_desc = link.parent.text.split(":")
        self.short_desc = self.short_desc.strip()
        self.get_sections()
        self.get_categories()
        
    def __repr__(self):
        s = self.title + ":\n"
        s += "  -" + self.short_desc + "\n"
        s += "  Bottom Line:\n\t" + self.bottom_line.text
        return s
    
    def get_sections(self):
        response = requests.get(self.url)
        self.page_soup = BeautifulSoup(response.text, 'html.parser')
        self.sections = [Section(section) for section in self.page_soup.find_all("span", class_="mw-headline")]
        self.section_titles, new_sections = [], []
        for section in self.sections:
            if not isinstance(section.info, bs4.element.Tag):
                continue
            section_title = section.title.lower().strip().replace(" ", "_").replace("outcomes", "outcome")
            section.info.html_text = process_section_html_text(section.info)
            vars(self)[section_title] = section.info
            self.section_titles.append(section_title)
            new_sections.append(section)
        self.sections = new_sections

    def get_categories(self):
        self.tags = []
        self.categories = []
        for page_link in self.page_soup.find_all("a", href=True):
            if "title" in page_link.attrs and "Category" in page_link.attrs["title"]:
                tag = page_link.text.strip().lower().replace(" ", "_")
                if any(token in tag for token in TAG_EXCLUDE_TOKENS):
                    continue
                self.categories.append(page_link.text.strip())
                self.tags.append("WJC_clinical_trials::" + tag)
        self.categories = ", ".join(self.categories)

In [5]:
def get_valid_unique_links(links):
    new_links = []
    trial_names = []

    for link in links:
        if "/wiki/" in link.attrs["href"] and list(link.attrs.keys()) == ["href", "title"]:
            if "WikiJournalClub" in link.attrs["href"] or "Usable_articles" in link.attrs["href"]:
                continue

            trial_name = link.attrs["title"]
            if trial_name in trial_names:
                continue

            trial_names.append(trial_name)
            new_links.append(link)
    return new_links

In [6]:
def get_trials():
    response = requests.get(WIKI_JOURNAL_TRIAL_LIST_URL)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = get_valid_unique_links(soup.find_all("a", href=True))
    trials = []
    failed_links = []
    i = 0

    for link in tqdm(links, desc="Scraping clinical trials", unit=" Trials"):
        try:
            if "/wiki/" in link.attrs["href"] and list(link.attrs.keys()) == ["href", "title"]:
                if "WikiJournalClub" in link.attrs["href"] or "Usable_articles" in link.attrs["href"]:
                    continue
                trials.append(Trial(link))
        except:
            failed_links.append(link)
    return trials, failed_links

In [7]:
overwrite = False

In [8]:
if overwrite:
    with open("data/WJC_trials.pkl", "wb") as file:
        pickle.dump(trials, file)
else:
    with open("data/WJC_trials.pkl", "rb") as file:
        trials = pickle.load(file)


EOFError: Ran out of input

In [15]:
trials, failed_links = get_trials()

Scraping clinical trials:   0%|          | 0/491 [00:00<?, ? Trials/s]

# Clean Trial Information

In [18]:
class SectionSwap:
    def __init__(self, html_text = "None listed"):
        if html_text == "":
            html_text = "None listed"
        self.html_text = html_text
        self.contents = [html_text]

In [19]:
SECONDARY_OUTCOME_ALTERNATES = ["additional_outcome", "other_outcome",
                                "secondary_outcome_at_12_months",
                                "secondary_outcome_at_3_months",
                                "secondary_outcome_at_3_years"]

PRIMARY_OUTCOME_ALTERNATES = ["outcome"]
def swap_alternate_section(trial, variable = "primary_outcome", alternates = ["outcome"]):
    """"""
    if variable in vars(trial):
        return trial

    alternate_text = ""
    for alt in alternates:
        if alt in vars(trial):
            alternate_text += process_section_html_text(vars(trial)[alt])
            del vars(trial)[alt]
    
    vars(trial)[variable] = SectionSwap(alternate_text)
    print(trial.title)
    return trial

In [20]:
missing_trials = []
for i, trial in enumerate(tqdm(trials)):
    swap_alternate_section(trial, "primary_outcome", PRIMARY_OUTCOME_ALTERNATES)
    swap_alternate_section(trial, "secondary_outcome", SECONDARY_OUTCOME_ALTERNATES)
    swap_alternate_section(trial, "criticisms", "criticism")

  0%|          | 0/491 [00:00<?, ?it/s]

CAST I (1991)
EPHESUS (2003)
SADHART (2002)
SAVE (1992)
SHOCK (1999)
VA Cooperative Study (1983)
RACE II (2010)
WOSCOPS (1995)
ASCEND (Aspirin) (2018)
COPERNICUS (2002)
DIG (1997)
MERIT-HF (1999)
RALES (1999)
SCD-HeFT (2005)
SCD-HeFT (2005)
SOLVD (1991)
PATHWAY-2 (2015)
TONE (1998)
AIRTRIP (2016)
Yang-Tobin Study (1991)
Yang-Tobin Study (1991)
GI bleeding in ICU patients (1994)
PneumA (2003)
NINDS (1995)
Canadian CT Head Rule (2001)
Canadian CT Head Rule (2001)
DCCT (1993)
DCCT (1993)
STAMPEDE (2012)
STOP-NIDDM (2002)
UKPDS 33 (1998)
Pentoxifylline in Severe Alcoholic Hepatitis (2000)
Pentoxifylline in Severe Alcoholic Hepatitis (2000)
Omeprazole in Peptic Ulcer Bleeding (2000)
EINSTEIN CHOICE (2017)
Lo-Coco 2013 (2013)
Ibrutinib in Waldenstrom macroglobulinemia (2015)
PIOPED II (2006)
PIOPED II (2006)
MSH (1995)
ASPEN (2020)
APPAC (2015)
POET (2019)
IPrEx (2010)
NA-ACCORD (2009)
European Dexamethasone Study (2002)
ALMS (2009)
NASCET (1998)
Veterans Affairs Status Epilepticus Cooperati

# Anki Cards

In [21]:
import genanki
from cached_property import cached_property
import re
import genanki as ga

import os

class ModelX(genanki.Model):
    def __init__(self, model_id=None, name=None, fields=None, templates=None, css='', type=0):
        super().__init__(model_id, name, fields, templates, css)
        self._type = type

    def to_json(self, now_ts, deck_id):
        j = super().to_json(now_ts, deck_id)
        j["type"] = self._type
        return j

class NoteX(genanki.Note):
    def _cloze_cards(self):
        """
        returns a Card with unique ord for each unique cloze reference
        """
        card_ords = set()
        # find cloze replacements in first template's qfmt, e.g "{{cloze::Text}}"
        cloze_replacements = set(re.findall("{{[^}]*?cloze:(?:[^}]?:)*(.+?)}}", self.model.templates[0]['qfmt']) +
                                 re.findall("<%cloze:(.+?)%>", self.model.templates[0]['qfmt']))
        for field_name in cloze_replacements:
          field_index = next((i for i, f in enumerate(self.model.fields) if f['name'] == field_name), -1)
          field_value = self.fields[field_index] if field_index >= 0 else ""
          # update card_ords with each cloze reference N, e.g. "{{cN::...}}"
          card_ords.update([int(m)-1 for m in re.findall("{{c(\d+)::.+?}}", field_value) if int(m) > 0])

        if card_ords == {}:
            card_ords = {0}

        return([genanki.Card(ord) for ord in card_ords])

    @cached_property
    def cards(self):
        if self.model._type == 1:
            return self._cloze_cards()
        else:
            return super().cards


In [52]:
def create_qfmt(content):
    return content + constants.ANKING_QFMT_APPEND

def create_afmt(content):
    return content + constants.ANKING_AFMT_APPEND

In [58]:
AFMT = """
<b>{{TRIAL_NAME}}</b>
<br>
<br>
<i>Bottom Line:</i>
<br>
{{bottom_line}}
<br><br>


<button type="button" class="collapsible"></b>Click for more info:</b></button>
<div class="content">
<br><br>
<i>Wiki Journal Club Link:   </i> <a href="{{URL}}"> {{TRIAL_NAME}} Website </a>
<br><br>
<b> Categories:</b>  {{CATEGORIES}}
<br><hr><br>
<i>Primary Endpoint:</i>
<br>
{{primary_outcome}}
<br><hr><br>
<i>Interventions:</i>
<br>
{{interventions}}
<br><hr><br>
<i>Secondary Outcomes:</i>
<br>
{{secondary_outcomes}}
<br><hr><br>
<i>Criticisms:</i>
<br>
{{criticisms}}
</div>


<style>
.collapsible {
  background-color: #424242;
  color: #C695C6;
  cursor: arrow;
  padding: 1rem;
  width: 20%;
  height: 1rem
  border: none;
  text-align: center;
  outline: none;
  font-size: 2rem;
}

.active, .collapsible:hover {
  background-color: #424242;
}

.content {
  padding: 1rem;
  display: none;
  overflow: hidden;
  background-color: #333B45;
  font-size: .75rem;
}
</style>
</head>

<script>
var coll = document.getElementsByClassName("collapsible");
var i;

for (i = 0; i < coll.length; i++) {
  coll[i].addEventListener("click", function() {
    this.classList.toggle("active");
    var content = this.nextElementSibling;
    if (content.style.display === "block") {
      content.style.display = "none";
    } else {
      content.style.display = "block";
    }
  });
}
</script>


"""

In [59]:
templates = [{'name': 'Card 1',
              'qfmt': create_qfmt("<b>{{TRIAL_NAME}}</b><br><br><u>[ ... ]</u>"),
              'afmt': create_afmt(AFMT),
            }]

fields = [{"name": "TRIAL_NAME"}, {"name": "primary_outcome"},
          {"name": "bottom_line"}, {"name": "URL"}, {"name": "interventions"},
          {"name": "secondary_outcomes"}, {"name": "criticisms"}, {"name": "CATEGORIES"}]

trial_cards_model = ModelX(1371313, "show_teach_trial_model", fields=fields, templates=templates,
                           css=constants.ANKING_CSS)

# Build Deck

In [60]:
from collections import defaultdict
sub_deck_trials = defaultdict(lambda: [])
tags = []
for trial in trials:
    tags += trial.tags
    sub_deck_trials[trial.tags[0]].append(trial)

In [61]:
package_decks = []

sub_deck_counter = 13713131
for sub_deck_name, sub_deck_trial_set in sub_deck_trials.items():
    sub_deck = ga.Deck(sub_deck_counter, f'{sub_deck_name}')
    
    for trial in sub_deck_trial_set:    
        note = NoteX(model=trial_cards_model,
                     fields=[f"{trial.title} Trial:\n",
                             trial.primary_outcome.html_text,
                             trial.bottom_line.html_text,
                             trial.url, trial.interventions.html_text, trial.secondary_outcome.html_text,
                             trial.criticisms.html_text, trial.categories], tags=trial.tags)
        sub_deck.add_note(note)
    package_decks.append(sub_deck)
    sub_deck_counter += 1

In [62]:
package = genanki.Package(package_decks)
package.write_to_file("WJC_clinical_trials.apkg")