In [1]:
from pdfminer import high_level
import requests
from io import BytesIO, SEEK_SET, SEEK_END
import re
from bs4 import BeautifulSoup

In [2]:
class ResponseStream(object):
    """Seekable, read-only file-like view over an iterable of byte chunks.

    Chunks are pulled from the underlying iterator only when a read or seek
    needs them and are accumulated in an in-memory ``BytesIO`` buffer, so
    data that has already been fetched can be revisited with ``seek``.
    """

    def __init__(self, request_iterator):
        """Wrap ``request_iterator``, an iterable yielding ``bytes`` chunks."""
        self._bytes = BytesIO()
        # iter() returns an iterator unchanged and also lets plain iterables
        # (e.g. a list of chunks) be consumed with next().
        self._iterator = iter(request_iterator)

    def _load_all(self):
        """Append every remaining chunk to the end of the buffer."""
        self._bytes.seek(0, SEEK_END)
        for chunk in self._iterator:
            self._bytes.write(chunk)

    def _load_until(self, goal_position):
        """Append chunks until the buffer holds ``goal_position`` bytes.

        Stops early, without raising, when the iterator is exhausted.
        """
        current_position = self._bytes.seek(0, SEEK_END)
        while current_position < goal_position:
            try:
                # write() returns the number of bytes written, not the new
                # offset, so it has to be accumulated rather than assigned.
                current_position += self._bytes.write(next(self._iterator))
            except StopIteration:
                break

    def tell(self):
        """Return the current read offset within the buffered data."""
        return self._bytes.tell()

    def read(self, size=None):
        """Read up to ``size`` bytes from the current offset.

        ``size`` of ``None`` or a negative number reads to the end of the
        stream, loading all remaining chunks first. Returns ``bytes``; fewer
        than ``size`` bytes are returned when the source runs out.
        """
        left_off_at = self._bytes.tell()
        if size is None or size < 0:
            self._load_all()
        else:
            self._load_until(left_off_at + size)

        # Loading moves the buffer cursor to the end; restore it before
        # reading so the caller continues from where it left off.
        self._bytes.seek(left_off_at)
        return self._bytes.read(size)

    def seek(self, position, whence=SEEK_SET):
        """Move the read offset and return the new absolute position.

        Seeking relative to the end requires the full length, so the rest of
        the stream is loaded first and ``position`` is then applied as the
        offset from the end.
        """
        if whence == SEEK_END:
            self._load_all()
        return self._bytes.seek(position, whence)

In [3]:
def extract_raw_text(url, timeout=30):
    """Download the PDF at ``url`` and return its extracted text.

    The response body is streamed through ``ResponseStream`` and handed to
    pdfminer's ``high_level.extract_text``.

    Args:
        url: Address of the PDF document.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The text returned by ``high_level.extract_text``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # The context manager releases the streamed connection once parsing is
    # done (or fails); with stream=True it is otherwise left open.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail on 4xx/5xx instead of feeding an error page to the PDF parser.
        response.raise_for_status()
        stream = ResponseStream(response.iter_content(64))
        return high_level.extract_text(stream, password='', page_numbers=None, maxpages=0, caching=True, codec='utf-8', laparams=None)

In [4]:
# Pull the full text of one sample preprint PDF.
PAPER_PDF_URL = "https://www.medrxiv.org/content/10.1101/2020.04.05.20053884v2.full.pdf"
content = extract_raw_text(PAPER_PDF_URL)

In [5]:
# Split the text following "Keywords:" on its line into individual terms,
# trimming whitespace and dropping empty entries.
# re.search returns None when the text has no such line, so this yields an
# empty list instead of raising IndexError; matching without a trailing-newline
# lookahead also accepts a keywords line at the very end of the text.
keywords_match = re.search(r"Keywords:(.*)", content)
keywords = [
    keyword.strip()
    for keyword in (keywords_match.group(1).split(";") if keywords_match else [])
    if keyword.strip()
]

In [6]:
# Show the parsed keyword list as this cell's output.
keywords

['SARS-CoV-2',
 'COVID-19',
 'Coronavirus',
 'Epidemiological parameters',
 'Interventions']

In [7]:
# Fetch the first page of the epidemiology collection listing.
res = requests.get('https://www.medrxiv.org/collection/epidemiology?page=1', timeout=30)
# Stop here on a 4xx/5xx answer; otherwise an error page would be parsed and
# only surface later as a confusing lookup failure.
res.raise_for_status()
html = res.text
soup = BeautifulSoup(html, 'html.parser')

In [8]:
# Take the first <ul class="pager-items"> on the page and convert the text of
# its final <li> to an integer (presumably the last page number of the
# collection; confirm against the site's pager markup).
ul = soup.find_all("ul", attrs={"class": "pager-items"})[0]
int(ul.find_all("li")[-1].get_text())

79

In [9]:
# Print the text of the first <span> inside every title link on the page.
for link in soup.find_all("a", attrs={"class": "highwire-cite-linked-title"}):
    print(link.find("span").get_text())

Model calibration, nowcasting, and operational prediction of the COVID-19 pandemic
Countries should aim to lower the reproduction number R close to 1.0 for the short-term mitigation of COVID-19 outbreaks
The Easter and Passover Blip in New York City
State-level variation of initial COVID-19 dynamics in the United States: The role of local government interventions
Estimate of COVID-19 case prevalence in India based on surveillance data of patients with severe acute respiratory illness
The Longevity-Frailty Hypothesis: Evidence from COVID-19 Death Rates in Europe
Estimation of Tunisia COVID-19 infected cases based on mortality rate
Using Feedback on Symptomatic Infections to Contain the Coronavirus Epidemic: Insight from a SPIR Model
Nature of transmission of Covid19 in India
Outbreak dynamics of COVID-19 in China and the United States
