In [1]:
%load_ext autoreload
%autoreload 2

from tqdm.notebook import tqdm

import genanki
import webscrape as ws
import anki_utils as au

# Scrape Images

In [None]:
# Index page that is handed to the scraper as its starting point.
url = "https://www.radiologymasterclass.co.uk/gallery/galleries"

# Switch for the scrape below: when True, `deck_cards` is (re)built by calling
# ws.get_deck_cards; when False, this cell leaves `deck_cards` untouched.
overwrite = True
if overwrite:
    deck_cards = ws.get_deck_cards(url)

Scraping #radiology::chest_x-ray::anatomical_variants URLs:   0%|          | 0/10 [00:00<?, ?url/s]

Scraping #radiology::chest_x-ray::airways_and_lung_collapse URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::chest_x-ray::devices_and_artifacts URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::chest_x-ray::pneumothorax_gallery URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::chest_x-ray::lung_cancer URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::chest_x-ray::mediastinum_and_hilum URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::chest_x-ray::pulmonary_disease URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::chest_x-ray::cardiac_disease URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::chest_x-ray::quality URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::ct_brain::gallery_1 URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::ct_brain::gallery_2::scroll_images URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::trauma_x-ray::upper_limb::gallery_1 URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::trauma_x-ray::upper_limb::gallery_2 URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::trauma_x-ray::lower_limb::gallery_1 URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::trauma_x-ray::lower_limb::gallery_2 URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::trauma_x-ray::axial_skeleton::gallery_1 URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::trauma_x-ray::axial_skeleton::gallery_2 URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::abdominal_x-ray::abnormalities URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::abdominal_x-ray::artifacts URLs:   0%|          | 0/12 [00:00<?, ?url/s]

Scraping #radiology::abdominal_x-ray::calcification URLs:   0%|          | 0/12 [00:00<?, ?url/s]

# Card Format

In [None]:
# HTML wrappers for the two card sides. They differ only in text colour
# (red for the question side, magenta for the answer side); "{content}" is the
# placeholder that create_qfmt / create_afmt fill in.
BASE_QFMT = (
    "<div style='font-family: Arial; font-size: 20px; color: red; text-align: center'>"
    "{content}"
    "</div>"
)
BASE_AFMT = (
    "<div style='font-family: Arial; font-size: 20px; color: magenta; text-align: center'>"
    "{content}"
    "</div>"
)


def create_qfmt(content):
    """Return ``content`` wrapped in the question-side ``<div>`` (BASE_QFMT).

    ``content`` is substituted as a value, so any braces it contains
    (e.g. Anki ``{{Field}}`` references) are passed through unchanged.
    """
    return BASE_QFMT.format_map({"content": content})


def create_afmt(content):
    """Return ``content`` wrapped in the answer-side ``<div>`` (BASE_AFMT).

    ``content`` is substituted as a value, so any braces it contains
    are passed through unchanged.
    """
    return BASE_AFMT.format_map({"content": content})

In [None]:
# Single card template. The question side shows a fixed prompt, the
# cloze-filtered Question field and the "show" image; the answer side shows the
# Answer field, both images and the Description field.
# NOTE(review): the question side uses the {{cloze:...}} filter while the model
# below is created without an explicit `type` -- confirm that is intended.
templates = [
    dict(
        name='Card 1',
        qfmt=create_qfmt("Description/Diagnosis?<br>{{cloze:Question}}<br>{{show_image}}"),
        afmt=create_afmt('{{Answer}}<br>{{show_image}}   {{teach_image}}<br>{{Description}}'),
    ),
]

# Field order matters: notes supply their values positionally in this order.
fields = [
    {"name": field_name}
    for field_name in ("Question", "Answer", "Description", "show_image", "teach_image")
]

radiology_model = ModelX(1371312, "show_teach_image_model", fields=fields, templates=templates)

# Build Deck 

In [383]:
# Build the Anki deck: one note per scraped card, with both image halves written
# to disk and registered as media files.
#
# Local import: this cell uses os.makedirs / os.path.basename and the notebook's
# import cell does not import `os`.
import os

# `genanki` is imported at the top of the notebook (no `ga` alias is defined there).
radiology_deck = genanki.Deck(13713121, 'radiology_images')

model = radiology_model
deck = radiology_deck
media_files = []

# Image.save() does not create missing directories, so make sure the target exists.
image_dir = "images"
os.makedirs(image_dir, exist_ok=True)

# Number of times each card name has been seen so far; repeats get a "_<count>" suffix.
names = {}

for i, card in tqdm(enumerate(deck_cards), desc="Creating Cards", unit=" Cards", total=len(deck_cards)):
    if card.name in names:
        names[card.name] += 1
        card.name = card.name + f"_{names[card.name]}"
    else:
        names[card.name] = 1

    # File stem: zero-padded position in the deck plus a filesystem-friendly card name.
    card.save_name = f"{i:03}_" + card.name.lower().replace(" ", "_").replace("/", "_")
    card.show_image_path = f"{image_dir}/{card.save_name}_show_image.jpg"
    card.teach_image_path = f"{image_dir}/{card.save_name}_teach_image.jpg"
    card.image_paths = [card.show_image_path, card.teach_image_path]

    card.show_image.save(card.show_image_path)
    card.teach_image.save(card.teach_image_path)

    # Cloze markup needs literal double braces. Inside an f-string each literal
    # brace must itself be doubled, hence the four/five-brace runs below; the
    # previous "{{c1::{...}}}" produced "{c1::name}", which is not cloze syntax.
    question = f"Question: Description and Diagnosis? {{{{c1::{card.name}}}}}"

    # NOTE(review): NoteX is not imported by the cells of this notebook -- confirm
    # where it is expected to come from.
    note = NoteX(model=model,
                 fields=[question, card.name, card.desc,
                         # Fields reference media by bare file name, not by path.
                         f'<img src="{os.path.basename(card.show_image_path)}">',
                         f'<img src="{os.path.basename(card.teach_image_path)}">'],
                 tags=card.tags)
    deck.add_note(note)
    media_files += card.image_paths

Creating Cards:   0%|          | 0/528 [00:00<?, ? Cards/s]

# Save Deck

In [384]:
# Bundle the deck together with the image files collected while building it,
# then write the package next to the notebook.
radiology_package = genanki.Package(radiology_deck, media_files=media_files)

radiology_package.write_to_file('radiology_images.apkg')



In [1]:
import io
import time
import requests
import re
from PIL import Image
from io import BytesIO

from bs4 import BeautifulSoup as bsoup
from tqdm.notebook import tqdm

In [2]:
# Default HTTP headers for safe_request: a browser-style User-Agent string.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko)'
    ' Chrome/39.0.2171.95 Safari/537.36'
}
# Default pause, in seconds, that safe_request sleeps before every request.
ANTI_DOS_DELAY = 0.1

In [3]:
def safe_request(url: str, headers: dict = REQUEST_HEADERS,
                 anti_dos_delay: float = ANTI_DOS_DELAY):
    """Fetch ``url`` with an HTTP GET after a short politeness delay.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.
        anti_dos_delay: Seconds to sleep before the request is issued.

    Returns:
        The ``requests`` response object. If the first attempt raises
        ``requests.exceptions.ConnectionError``, the request is retried once
        with certificate verification disabled and that response is returned.
    """
    # Imported locally because the fallback below needs `warnings` and the
    # notebook's import cell does not import it (the fallback used to fail
    # with NameError).
    import warnings

    time.sleep(anti_dos_delay)
    try:
        return requests.get(url, headers=headers)
    except requests.exceptions.ConnectionError:
        # NOTE(review): the retry turns off TLS certificate verification
        # (verify=False) and silences the resulting warnings. That is a
        # security trade-off; confirm it is acceptable for the target site.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return requests.get(url, headers=headers, verify=False)
        
def get_html(url: str):
    """Download ``url`` via safe_request and return its body parsed by BeautifulSoup.

    No parser name is passed, so the choice of parser is left to BeautifulSoup.
    """
    response = safe_request(url)
    raw_body = response.content
    return bsoup(raw_body)

In [363]:
def str_remover(string: str, *tokens):
    """Return ``string`` with every occurrence of each token deleted.

    Tokens are removed one after another in the order given, so the removal of
    an earlier token can create (or destroy) matches for a later one.
    """
    cleaned = string
    for unwanted in tokens:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

def get_box_legends(html):
    """Return every element of ``html`` whose ``class`` attribute is ``boxlegend``.

    ``html`` must offer a BeautifulSoup-style ``find_all(attrs=...)`` method;
    its result is returned as-is.
    """
    legend_selector = {"class": "boxlegend"}
    return html.find_all(attrs=legend_selector)

def process_box_legend(boxlegend_html):
    """Return the markup that follows the opening ``boxlegend">`` of an element.

    The element is converted to its string form and split on ``boxlegend">``;
    the piece after the first occurrence (up to a second occurrence, if any) is
    returned with its HTML tags left in place. Raises ``IndexError`` when the
    marker is absent.
    """
    opening_marker = 'boxlegend">'
    pieces = str(boxlegend_html).split(opening_marker)
    # Tags are deliberately kept; they are post-processed where the card is built.
    return pieces[1]

def get_image_urls(html):
    """Return every parenthesised string found in the page's first ``<style>`` element.

    The text of ``html.find('style')`` is scanned for ``(...)`` groups and the
    contents of each group are returned in order of appearance.
    # presumably these are the ``url(...)`` image references of the gallery CSS -- verify.

    Raises:
        AttributeError: if ``html.find('style')`` yields an object without a
            ``text`` attribute (e.g. when no ``<style>`` element is found).
    """
    style_text = html.find('style').text
    # Raw string: "\(" and "\)" are regex escapes, not Python string escapes.
    # The non-raw spelling triggers an invalid-escape-sequence warning.
    return re.findall(r"\((.*?)\)", style_text)

def get_images_from_url(url: str):
    """Download the image at ``url`` and split it vertically into two halves.

    Returns:
        ``(show_image, teach_image)``: the left half and the right half of the
        downloaded image, cut at ``width // 2``.
    """
    raw_bytes = safe_request(url).content
    combined = Image.open(BytesIO(raw_bytes))
    full_width, full_height = combined.size
    midpoint = full_width // 2

    left_half = combined.crop(box=(0, 0, midpoint, full_height))
    right_half = combined.crop(box=(midpoint, 0, full_width, full_height))
    return left_half, right_half

In [364]:
def get_gallery_image_links(gallery_url: str):
    """Return ``gallery_url`` followed by the extra page links in its breadcrumb.

    The page at ``gallery_url`` is fetched, the element with id ``breadcrumb``
    is located, and the ``href`` of every ``<a>`` inside it except the first
    two is appended after ``gallery_url``.
    """
    page = get_html(gallery_url)
    breadcrumb = page.find(attrs={"id": "breadcrumb"})

    links = [gallery_url]
    # The first two anchors are skipped.
    # presumably they are navigation links rather than gallery pages -- verify.
    for anchor in breadcrumb.findAll('a')[2:]:
        links.append(anchor.attrs["href"])
    return links

def get_gallery_info(gallery, tag_prefix):
    """Derive the tag list and the page links for one gallery anchor element.

    The tag is built from the anchor's first child: spaces become ``_``, the
    text is lower-cased, ``_-_`` becomes ``::``, and the result is joined to
    ``tag_prefix`` with ``::``.

    Returns:
        ``(tags, links)``: a one-element list holding that tag, and the links
        returned by get_gallery_image_links for the anchor's ``href``.
    """
    title = gallery.contents[0]
    tag_suffix = title.replace(" ", "_").lower().replace("_-_", "::")
    tags = [tag_prefix + "::" + tag_suffix]

    gallery_url = gallery.attrs["href"]
    return tags, get_gallery_image_links(gallery_url)

In [365]:
class ProtoCardData:
    """Scraped raw material for one flashcard: text, image halves and tags."""

    def __init__(self, desc: str, image_url: str):
        """Store the description and download and split the image.

        Args:
            desc: HTML description. ``<li>`` is turned into a line break plus a
                bullet character entity and ``</li>`` is dropped. The card name
                is whatever precedes the first ``</h4>``, minus ``<h4>``.
            image_url: Address of the image; it is fetched immediately via
                get_images_from_url.
        """
        bulleted = desc.replace("<li>", "<br> &#x2022; ")
        self.desc = bulleted.replace("</li>", "")

        heading = desc.split("</h4>")[0]
        self.name = heading.replace("<h4>", "")

        self.image_url = image_url
        image_halves = get_images_from_url(image_url)
        self.show_image, self.teach_image = image_halves
        self.tags = []

    def add_tags(self, tags):
        """Append ``tags`` to this card's tags (a new list is assigned each time)."""
        self.tags = [*self.tags, *tags]

    def __repr__(self):
        """Return the first line of the description, then the image URL on its own line."""
        first_line = self.desc.split("\n")[0]
        return f"{first_line}:\n{self.image_url}\n"

def scrape_card_info(url: str, pbar_desc: str = ""):
    """Scrape one page, or a sequence of pages, into ProtoCardData objects.

    Args:
        url: A single page address, or a list/tuple of addresses. For a
            sequence each address is scraped in turn under a progress bar and
            the results are concatenated.
        pbar_desc: Label inserted into the progress-bar description; only used
            when ``url`` is a sequence.

    Returns:
        A list of ProtoCardData, one per (description, image URL) pair found on
        the page(s). Descriptions and image URLs are paired positionally with
        ``zip``, so surplus items on either side are dropped.
    """
    if isinstance(url, (tuple, list)):
        progress = tqdm(url, unit="url", desc=f"Scraping {pbar_desc} URLs")
        collected = []
        for single_url in progress:
            collected.extend(scrape_card_info(single_url))
        return collected

    response = safe_request(url)
    page = bsoup(response.content)

    descriptions = [process_box_legend(legend) for legend in get_box_legends(page)]
    image_urls = get_image_urls(page)

    cards = []
    for description, image_url in zip(descriptions, image_urls):
        cards.append(ProtoCardData(description, image_url))
    return cards

In [376]:
def get_deck_cards(url):
    """Scrape every gallery linked from the index page at ``url``.

    Each element with class ``gallinfo`` is treated as a topic; every ``<a>``
    inside it is a gallery. All cards of a gallery receive that gallery's tags,
    which are prefixed with ``#radiology``.

    Returns:
        A flat list with the cards of all galleries, in page order.
    """
    index_page = get_html(url)
    root_tag = "#radiology"

    all_cards = []
    for topic in index_page.find_all(attrs={"class": "gallinfo"}):
        for gallery in topic.findAll('a'):
            tags, links = get_gallery_info(gallery, tag_prefix=root_tag)
            cards = scrape_card_info(links, pbar_desc=tags[0])

            for card in cards:
                card.add_tags(tags)

            all_cards.extend(cards)

    return all_cards

In [385]:
# Index page that lists every gallery to scrape.
url = "https://www.radiologymasterclass.co.uk/gallery/galleries"

# Set to True to force a fresh scrape even if `deck_cards` already exists in the kernel.
overwrite = False
# Scrape when asked to, or when no earlier result is available. Without the
# second condition a fresh kernel would leave `deck_cards` undefined and the
# deck-building cell would fail with NameError.
if overwrite or "deck_cards" not in globals():
    deck_cards = get_deck_cards(url)

# Building the Deck

In [379]:
import genanki
from cached_property import cached_property
import re

class ModelX(genanki.Model):
    """genanki Model that additionally carries a ``type`` value into its JSON."""

    def __init__(self, model_id=None, name=None, fields=None, templates=None, css='', type=0):
        """Initialise the base model and remember ``type``.

        Args:
            model_id, name, fields, templates, css: forwarded positionally to
                ``genanki.Model.__init__``.
            type: value stored as ``_type`` and written to the ``"type"`` key by
                to_json; NoteX treats ``1`` as a cloze model.
        """
        super().__init__(model_id, name, fields, templates, css)
        self._type = type

    def to_json(self, now_ts, deck_id):
        """Return the base class's JSON with its ``"type"`` entry set to ``_type``."""
        payload = super().to_json(now_ts, deck_id)
        payload["type"] = self._type
        return payload

class NoteX(genanki.Note):
    """genanki Note that generates cloze cards when its model's ``_type`` is 1."""

    def _cloze_cards(self):
        """Return one Card per distinct cloze number used in the cloze fields.

        The first template's ``qfmt`` is scanned for cloze references
        (``{{cloze:Field}}`` or ``<%cloze:Field%>``). Each referenced field's
        value is then scanned for ``{{cN::...}}`` markers and every distinct
        positive N yields a card with ord ``N - 1``. If no marker is found, a
        single card with ord 0 is returned.
        """
        first_qfmt = self.model.templates[0]['qfmt']
        # Field names referenced through a cloze filter in the first template.
        cloze_replacements = set(
            re.findall(r"{{[^}]*?cloze:(?:[^}]?:)*(.+?)}}", first_qfmt)
            + re.findall(r"<%cloze:(.+?)%>", first_qfmt)
        )

        card_ords = set()
        for field_name in cloze_replacements:
            field_index = next(
                (i for i, f in enumerate(self.model.fields) if f['name'] == field_name), -1)
            field_value = self.fields[field_index] if field_index >= 0 else ""
            # Collect every cloze number N from "{{cN::...}}" as a zero-based ord.
            card_ords.update(
                int(m) - 1
                for m in re.findall(r"{{c(\d+)::.+?}}", field_value)
                if int(m) > 0
            )

        # `card_ords` is a set, so it must be tested for emptiness directly;
        # comparing it with `{}` (an empty dict) is never true and would skip
        # this fallback.
        if not card_ords:
            card_ords = {0}

        return [genanki.Card(card_ord) for card_ord in sorted(card_ords)]

    @cached_property
    def cards(self):
        """Cards of this note: cloze cards for a type-1 model, otherwise the base class's cards."""
        if self.model._type == 1:
            return self._cloze_cards()
        else:
            return super().cards


In [380]:
import genanki
import genanki as ga

import os

In [381]:
# HTML wrappers for the two card sides. They differ only in text colour
# (red for the question side, magenta for the answer side); "{content}" is the
# placeholder that create_qfmt / create_afmt fill in.
BASE_QFMT = (
    "<div style='font-family: Arial; font-size: 20px; color: red; text-align: center'>"
    "{content}"
    "</div>"
)
BASE_AFMT = (
    "<div style='font-family: Arial; font-size: 20px; color: magenta; text-align: center'>"
    "{content}"
    "</div>"
)


def create_qfmt(content):
    """Return ``content`` wrapped in the question-side ``<div>`` (BASE_QFMT).

    ``content`` is substituted as a value, so any braces it contains
    (e.g. Anki ``{{Field}}`` references) are passed through unchanged.
    """
    return BASE_QFMT.format_map({"content": content})


def create_afmt(content):
    """Return ``content`` wrapped in the answer-side ``<div>`` (BASE_AFMT).

    ``content`` is substituted as a value, so any braces it contains
    are passed through unchanged.
    """
    return BASE_AFMT.format_map({"content": content})

In [382]:
# Single card template. The question side shows a fixed prompt, the
# cloze-filtered Question field and the "show" image; the answer side shows the
# Answer field, both images and the Description field.
# NOTE(review): the question side uses the {{cloze:...}} filter while the model
# below is created without an explicit `type` -- confirm that is intended.
templates = [
    dict(
        name='Card 1',
        qfmt=create_qfmt("Description/Diagnosis?<br>{{cloze:Question}}<br>{{show_image}}"),
        afmt=create_afmt('{{Answer}}<br>{{show_image}}   {{teach_image}}<br>{{Description}}'),
    ),
]

# Field order matters: notes supply their values positionally in this order.
fields = [
    {"name": field_name}
    for field_name in ("Question", "Answer", "Description", "show_image", "teach_image")
]

radiology_model = ModelX(1371312, "show_teach_image_model", fields=fields, templates=templates)

In [383]:
# Build the Anki deck: one note per scraped card, with both image halves written
# to disk and registered as media files.
radiology_deck = ga.Deck(13713121, 'radiology_images')

model = radiology_model
deck = radiology_deck
media_files = []

# Image.save() does not create missing directories, so make sure the target exists.
image_dir = "images"
os.makedirs(image_dir, exist_ok=True)

# Number of times each card name has been seen so far; repeats get a "_<count>" suffix.
names = {}

for i, card in tqdm(enumerate(deck_cards), desc="Creating Cards", unit=" Cards", total=len(deck_cards)):
    if card.name in names:
        names[card.name] += 1
        card.name = card.name + f"_{names[card.name]}"
    else:
        names[card.name] = 1

    # File stem: zero-padded position in the deck plus a filesystem-friendly card name.
    card.save_name = f"{i:03}_" + card.name.lower().replace(" ", "_").replace("/", "_")
    card.show_image_path = f"{image_dir}/{card.save_name}_show_image.jpg"
    card.teach_image_path = f"{image_dir}/{card.save_name}_teach_image.jpg"
    card.image_paths = [card.show_image_path, card.teach_image_path]

    card.show_image.save(card.show_image_path)
    card.teach_image.save(card.teach_image_path)

    # Cloze markup needs literal double braces. Inside an f-string each literal
    # brace must itself be doubled, hence the four/five-brace runs below; the
    # previous "{{c1::{...}}}" produced "{c1::name}", which is not cloze syntax.
    question = f"Question: Description and Diagnosis? {{{{c1::{card.name}}}}}"

    note = NoteX(model=model,
                 fields=[question, card.name, card.desc,
                         # Fields reference media by bare file name, not by path.
                         f'<img src="{os.path.basename(card.show_image_path)}">',
                         f'<img src="{os.path.basename(card.teach_image_path)}">'],
                 tags=card.tags)
    deck.add_note(note)
    media_files += card.image_paths

Creating Cards:   0%|          | 0/528 [00:00<?, ? Cards/s]

In [384]:
# Bundle the deck together with the image files collected while building it,
# then write the package next to the notebook.
radiology_package = genanki.Package(radiology_deck, media_files=media_files)

radiology_package.write_to_file('radiology_images.apkg')

