In [9]:
import os
import requests
from bs4 import BeautifulSoup
import logging
from typing import Iterator

In [3]:
MEDRXIV_URL = "https://www.medrxiv.org"

In [4]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [7]:
class Crawler:
    def __init__(self):
        pass

    @staticmethod
    def _get_last_page(collection_url: str) -> int:
        res = requests.get(collection_url)
        html = res.text
        soup = BeautifulSoup(html, "html.parser")
        uls = soup.find_all("ul", {"class": "pager-items"})
        if uls:
            return int(uls[0].find_all("li")[-1].text)
        else:
            return 0

    @staticmethod
    def _get_pdf_link_from_href(href) -> str:
        article_href = href.get("href")
        return f"{MEDRXIV_URL}{article_href}"

    @staticmethod
    def get_urls_for_collection(collection: str) -> Iterator[str]:
        collection_url = f"{MEDRXIV_URL}/collection/{collection}"
        logger.warn("Crawling %s ...", collection_url)
        last_page = Crawler._get_last_page(collection_url)
        logger.warn("There are %d pages in this collection", last_page)

        for page in range(0, last_page):
            url = collection_url if page == 0 else f"{collection_url}?page={page}"
            logger.warn("Crawling page %d of %d. Url: %s ...", page, last_page, url)
            res = requests.get(url)
            html = res.text
            soup = BeautifulSoup(html, "html.parser")

            links = soup.find_all("a", {"class": "highwire-cite-linked-title"})
            for link in links:
                full_article_url = Crawler._get_pdf_link_from_href(link)
                pdf_url = f"{full_article_url}.full.pdf"
                logger.warn("Found article: %s", pdf_url)
                yield pdf_url

In [11]:
local_folder = "/home/studio-lab-user/sagemaker-studiolab-notebooks/bionodes_docs"
urls = Crawler.get_urls_for_collection("Epidemiology")
for url in urls:
    filename = url.split("/")[-1]
    local_file_path = os.path.join(local_folder, filename)
    response = requests.get(url)
    with open(local_file_path, "wb") as file:
        file.write(response.content)

  logger.warn("Crawling %s ...", collection_url)
Crawling https://www.medrxiv.org/collection/Epidemiology ...
  logger.warn("There are %d pages in this collection", last_page)
There are 869 pages in this collection
  logger.warn("Crawling page %d of %d. Url: %s ...", page, last_page, url)
Crawling page 0 of 869. Url: https://www.medrxiv.org/collection/Epidemiology ...
  logger.warn("Found article: %s", pdf_url)
Found article: https://www.medrxiv.org/content/10.1101/2024.06.04.24308398v1.full.pdf
  logger.warn("Found article: %s", pdf_url)
Found article: https://www.medrxiv.org/content/10.1101/2024.06.04.24308415v1.full.pdf
  logger.warn("Found article: %s", pdf_url)
Found article: https://www.medrxiv.org/content/10.1101/2024.06.04.24308411v1.full.pdf
  logger.warn("Found article: %s", pdf_url)
Found article: https://www.medrxiv.org/content/10.1101/2024.06.03.24308369v1.full.pdf
  logger.warn("Found article: %s", pdf_url)
Found article: https://www.medrxiv.org/content/10.1101/2024.05.31

KeyboardInterrupt: 

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
loader = PyMuPDFLoader("example_data/layout-parser-paper.pdf")
data = loader.load()

data[0]
data[0]["page_content"]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

chunks = []
for page in data:
    page_chunks = text_splitter.split_text(page["page_content"])
    chunks.extend(page_chunks)