In [25]:
import os
import requests
from bs4 import BeautifulSoup
import logging
from typing import Iterator
import boto3
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.graphs.neptune_graph import NeptuneGraph

In [2]:
MEDRXIV_URL = "https://www.medrxiv.org"

In [3]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [4]:
class Crawler:
    def __init__(self):
        pass

    @staticmethod
    def _get_last_page(collection_url: str) -> int:
        res = requests.get(collection_url)
        html = res.text
        soup = BeautifulSoup(html, "html.parser")
        uls = soup.find_all("ul", {"class": "pager-items"})
        if uls:
            return int(uls[0].find_all("li")[-1].text)
        else:
            return 0

    @staticmethod
    def _get_pdf_link_from_href(href) -> str:
        article_href = href.get("href")
        return f"{MEDRXIV_URL}{article_href}"

    @staticmethod
    def get_urls_for_collection(collection: str) -> Iterator[str]:
        collection_url = f"{MEDRXIV_URL}/collection/{collection}"
        logger.warn("Crawling %s ...", collection_url)
        last_page = Crawler._get_last_page(collection_url)
        logger.warn("There are %d pages in this collection", last_page)

        for page in range(0, last_page):
            url = collection_url if page == 0 else f"{collection_url}?page={page}"
            logger.warn("Crawling page %d of %d. Url: %s ...", page, last_page, url)
            res = requests.get(url)
            html = res.text
            soup = BeautifulSoup(html, "html.parser")

            links = soup.find_all("a", {"class": "highwire-cite-linked-title"})
            for link in links:
                full_article_url = Crawler._get_pdf_link_from_href(link)
                pdf_url = f"{full_article_url}.full.pdf"
                logger.warn("Found article: %s", pdf_url)
                yield pdf_url

In [6]:
s3_client = boto3.client('s3')

In [None]:
local_folder = "/home/studio-lab-user/sagemaker-studiolab-notebooks/bionodes_docs"
collection_name = "Epidemiology"
urls = Crawler.get_urls_for_collection(collection_name)
for url in urls:
    filename = url.split("/")[-1]
    local_file_path = os.path.join(local_folder, filename)
    response = requests.get(url)
    with open(local_file_path, "wb") as file:
        file.write(response.content)
        s3_client.upload_file(local_file_path, "bionodes", f"{collection_name}/{filename}")

In [17]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

In [None]:
pathlist = Path(local_folder).glob('**/*.pdf')
chunks = []
for path in pathlist:
    path_in_str = str(path)   
    loader = PyMuPDFLoader(path_in_str)
    data = loader.load()
    
    for page in data:
        page_chunks = text_splitter.split_text(data[0].page_content)
        chunks.extend(page_chunks)

In [29]:
neptune_client = boto3.client('neptune','us-east-1')

In [30]:
graph = NeptuneGraph(
    host='db-neptune-1.cluster-comdbuopgbbk.us-east-1.neptune.amazonaws.com',
    port=8182,
    client=neptune_client
)

NeptuneQueryException: {'message': 'Could not get schema for Neptune database', 'detail': '{\'message\': \'Summary API is not available for this instance of Neptune,ensure the engine version is >=1.2.1.0\', \'details\': "\'Neptune\' object has no attribute \'get_propertygraph_summary\'"}'}

In [24]:
response = client.execute_open_cypher_query(
    openCypherQuery='',
    parameters='string'
)

AttributeError: 'NeptuneData' object has no attribute 'describe_db_clusters'

In [None]:
# Extract keywords for each article
# Create graph, 