In [1]:
import pdfminer
from pdfminer import high_level
import requests
from io import BytesIO, SEEK_SET, SEEK_END
import re
from bs4 import BeautifulSoup

In [2]:
class ResponseStream(object):
    """Seekable, read-only file-like view over an iterator of byte chunks.

    Chunks pulled from the iterator are appended to an in-memory ``BytesIO``
    buffer, so data that has already been consumed can be re-read after a
    ``seek``. Chunks are only pulled when a ``read`` or ``seek`` needs them.

    Args:
        request_iterator: Iterable yielding ``bytes`` chunks (for example the
            result of ``requests.Response.iter_content``).
    """

    def __init__(self, request_iterator):
        self._bytes = BytesIO()
        # iter() is a no-op on iterators and lets plain iterables (lists,
        # tuples) be used as well, since next() is called on it below.
        self._iterator = iter(request_iterator)

    def _load_all(self):
        """Drain the iterator, appending every remaining chunk to the buffer.

        Leaves the buffer cursor at the end of the buffered data.
        """
        self._bytes.seek(0, SEEK_END)
        for chunk in self._iterator:
            self._bytes.write(chunk)

    def _load_until(self, goal_position):
        """Append chunks until the buffer holds at least ``goal_position`` bytes.

        Stops early when the iterator is exhausted. Leaves the buffer cursor
        at the end of the buffered data.
        """
        current_position = self._bytes.seek(0, SEEK_END)
        while current_position < goal_position:
            try:
                # write() returns the number of bytes written, not the new
                # offset, so it has to be accumulated onto the running size.
                current_position += self._bytes.write(next(self._iterator))
            except StopIteration:
                break

    def tell(self):
        """Return the current read position within the stream."""
        return self._bytes.tell()

    def read(self, size=None):
        """Read up to ``size`` bytes from the current position.

        Args:
            size: Maximum number of bytes to return. ``None`` or a negative
                value reads everything up to the end of the stream.

        Returns:
            The bytes read; shorter than ``size`` (possibly empty) when the
            underlying iterator runs out.
        """
        left_off_at = self._bytes.tell()
        if size is None or size < 0:
            self._load_all()
        else:
            self._load_until(left_off_at + size)

        # Loading moved the cursor to the end of the buffer; restore it.
        self._bytes.seek(left_off_at)
        return self._bytes.read(size)

    def seek(self, position, whence=SEEK_SET):
        """Move the read position and return the new absolute position.

        Args:
            position: Offset, interpreted according to ``whence``.
            whence: ``SEEK_SET`` (default), ``SEEK_CUR`` or ``SEEK_END``.
                Seeking relative to the end forces the whole stream to be
                loaded first, because the total length is unknown until then.
        """
        if whence == SEEK_END:
            self._load_all()
        return self._bytes.seek(position, whence)

In [3]:
def extract_raw_text(url, chunk_size=8192, timeout=30):
    """Download the PDF at ``url`` and return its text via pdfminer.

    The response body is streamed through ``ResponseStream`` and handed to
    ``pdfminer.high_level.extract_text``.

    Args:
        url: Address of the PDF document to fetch.
        chunk_size: Number of bytes requested from the response per chunk.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        Whatever ``high_level.extract_text`` returns for the document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # The context manager releases the connection even if parsing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail early instead of feeding an HTML error page to the PDF parser.
        response.raise_for_status()
        stream = ResponseStream(response.iter_content(chunk_size))
        return high_level.extract_text(stream, password='', page_numbers=None, maxpages=0, caching=True, codec='utf-8', laparams=None)

In [4]:
# Download the medRxiv preprint PDF and extract its text (requires network access).
content = extract_raw_text("https://www.medrxiv.org/content/10.1101/2020.04.05.20053884v2.full.pdf")

In [5]:
# Take the remainder of the first line that contains "Keywords:" and split it
# on ";". A document without such a line yields an empty list rather than an
# IndexError.
keywords_match = re.search(r"(?<=Keywords:)(.*)(?=\n)", content)
keywords = (
    [keyword.strip() for keyword in keywords_match.group(1).split(";") if keyword.strip()]
    if keywords_match
    else []
)

In [6]:
# Show the parsed keyword list as the cell's output.
keywords

['SARS-CoV-2',
 'COVID-19',
 'Coronavirus',
 'Epidemiological parameters',
 'Interventions']

In [7]:
# Fetch the first page of the medRxiv epidemiology collection and parse it.
res = requests.get('https://www.medrxiv.org/collection/epidemiology?page=1', timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page.
res.raise_for_status()
html = res.text
soup = BeautifulSoup(html, 'html.parser')

In [12]:
# Print the text of the first <span> inside every anchor that carries the
# "highwire-cite-linked-title" class.
for link in soup.find_all("a", {"class":"highwire-cite-linked-title"}):
    title_span = link.find('span')
    # find() returns None when the anchor has no <span>; skip those anchors
    # instead of failing with AttributeError.
    if title_span is not None:
        print(title_span.text)

The December 2019 New Corona Virus Disease (SARS-CoV-2) Outbreak: A Behavioral Infectious Disease Policy Model
The Effect of Stay-at-Home Orders on COVID-19 Infections in the United States
Using ICU data to improve the real-time estimation of the effective reproductive number of the COVID-19 epidemic in 9 European countries
Estimating the Fraction of Unreported Infections in Epidemics with a Known Epicenter: an Application to COVID-19
SimCOVID: An Open-Source Simulation Program for the COVID-19 Outbreak
Analysis of COVID-19 spread in South Korea using the SIR model with time-dependent parameters and deep learning
RISK ANALYSIS AND PREDICTION FOR COVID19 DEMOGRAPHICS IN LOW RESOURCE SETTINGS USING A PYTHON DESKTOP APP AND EXCEL MODELS.
A Predictive Model for the Evolution of COVID-19
Analysis of the COVID-19 epidemic in french overseas department Mayotte based on a modified deterministic and stochastic SEIR model
MARKOVIAN RANDOM WALK MODELING AND VISUALIZATION OF THE EPIDEMIC SPREAD OF