In [None]:
#default_exp download

# Downloading PDFs for Roll Call votes
> The roll call votes of the plenary of the EU Parliament are freely available online. So let's go and collect them with `requests` and `bs4`.

In [None]:
#hide
# Development convenience: with autoreload mode 2, IPython re-imports edited modules
# before each cell is executed, so library changes are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

TODO:
* enable downloading to a folder with a timestamp (prevents information from being lost if the file behind a given URL changes over time)

In [None]:
#export
from pathlib import Path
import requests
import re
from bs4 import BeautifulSoup
import typing
import time
import tqdm
import sys
from loguru import logger

In [None]:
#hide
# loguru is pre-configured with a default stderr handler. Remove the existing handlers
# first so that our DEBUG sink does not print every message twice, and so that
# re-running this cell does not keep stacking additional handlers.
logger.remove()
logger.add(sys.stderr, level='DEBUG')

Defining the site where everything is stored

In [None]:
#export
# Page that is scraped for the PDF links (passed to `identify_links_for_pdfs`).
URL = 'https://www.europarl.europa.eu/plenary/en/votes.html?tab=votes'
# Default folder the PDFs are written to; a relative path, so it is resolved
# against the current working directory of the running process.
PDF_PATH = Path("../pdfs")

Identifying the links

In [None]:
#export
def identify_links_for_pdfs(url:str, bs_parser:str='lxml', timeout:float=30.):
    '''There are RCV (Roll Call Vote) and VOT PDF files. Links for both are extracted.

    The page behind `url` is fetched and every `<a class="link_pdf">` element that
    carries an `href` is inspected. Links whose `href` matches `RCV\\w*\\.pdf` are
    treated as roll call votes, all remaining ones as VOT files.

    Args:
        url: page listing the PDF files.
        bs_parser: parser name handed to `BeautifulSoup`.
        timeout: seconds to wait for the server before giving up.

    Returns:
        Tuple `(rcv_pdfs, vot_pdfs)` of two lists with absolute links, in page order.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: for connection problems or timeouts.
    '''
    from urllib.parse import urljoin

    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page and silently returning empty lists.
    res.raise_for_status()

    soup = BeautifulSoup(res.text, features=bs_parser)

    elements_with_pdf = soup.find_all(name='a', attrs={'class':'link_pdf'})
    pattern = re.compile(r'RCV\w*\.pdf')

    rcv_pdfs, vot_pdfs = [], []
    for element in elements_with_pdf:
        href = element.get('href')
        if not href:
            # Anchor without a target: nothing that could be downloaded.
            continue
        # Resolve relative hrefs against the page URL so every returned link is fetchable.
        link = urljoin(url, href)
        if pattern.search(href):
            rcv_pdfs.append(link)
        else:
            vot_pdfs.append(link)

    return rcv_pdfs, vot_pdfs

In [None]:
%%time
# Scrape the votes page once; the first list holds the RCV links, the second the remaining (VOT) links.
rcv_pdfs, vot_pdfs = identify_links_for_pdfs(URL)

In [None]:
#hide
# Both link lists are expected to have the same number of entries.
assert len(vot_pdfs) == len(rcv_pdfs)
# At least one roll call vote link must have been found.
assert rcv_pdfs
# Every roll call vote link has to point to a PDF file.
assert all(href.endswith('pdf') for href in rcv_pdfs)

In [None]:
#export
def download_file(link:str, file_dir:Path=PDF_PATH, timeout:float=30.):
    '''Given a valid URL a file is downloaded.

    The file name is the last path segment of `link` (query string and fragment
    are ignored) and the file is written into `file_dir`, which is created if it
    does not exist yet. An existing file with the same name is overwritten.

    Args:
        link: URL of the file to download.
        file_dir: folder the file is stored in.
        timeout: seconds to wait for the server before giving up.

    Raises:
        ValueError: if no file name can be derived from `link`.
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: for connection problems or timeouts.
    '''
    from urllib.parse import urlsplit

    # Use only the URL path so that '?query' or '#fragment' parts do not end up in the file name.
    filename = urlsplit(link).path.split('/')[-1]
    if not filename:
        raise ValueError(f'Cannot derive a file name from link {link!r}')

    r = requests.get(link, allow_redirects=True, timeout=timeout)
    # Do not store an HTML error page under a '.pdf' name.
    r.raise_for_status()

    file_dir = Path(file_dir)
    file_dir.mkdir(parents=True, exist_ok=True)

    logger.debug(f'Writing to {file_dir/filename}')
    with open(file_dir/filename, 'wb') as f:
        f.write(r.content)

In [None]:
%%time
# Smoke test: fetch only the first roll call vote PDF into the default folder.
link = rcv_pdfs[0]
download_file(link)

Collecting all files

In [None]:
#export
def collect_multiple_files(links:typing.List[str], file_dir:Path=PDF_PATH, dt:float=.5):
    'Download every entry of `links` into `file_dir`, sleeping `dt` seconds before each request.'
    progress = tqdm.tqdm(links, total=len(links), desc='File')
    for url in progress:
        # Pause first so that consecutive requests are spaced by at least `dt` seconds.
        time.sleep(dt)
        download_file(url, file_dir)

In [None]:
%%time
# Download all roll call vote PDFs; dt=.01 pauses 10 ms before each request instead of the 0.5 s default.
collect_multiple_files(rcv_pdfs, dt=.01)