In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
import time
import random

def be_nice():
    """Throttle the crawler by pausing for a random duration in [0s, 1s)."""
    # random.random() returns a float in [0.0, 1.0), used directly as seconds.
    pause_seconds = random.random()
    time.sleep(pause_seconds)

In [3]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while showing a notebook progress widget.

    Parameters:
        sequence: iterable to walk through. If it has no ``len()`` and no
            ``size`` is given, the total is treated as unknown.
        every: refresh the widget on the first item and then on every
            ``every``-th item. Defaults to 1 when the size is at most 200,
            otherwise to ``int(size / 200)``. Must be given explicitly when
            the total is unknown.
        size: total number of items; taken from ``len(sequence)`` if omitted.
        name: label prefix shown next to the counter.

    Yields:
        Each element of ``sequence``, unchanged and in order.

    Raises:
        AssertionError: if the total is unknown and ``every`` was not given.
        Any exception raised while iterating is re-raised after the bar has
        been switched to the 'danger' style.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    unknown_total = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            # No len(): plain iterator/generator, total stays unknown.
            unknown_total = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        # Small inputs refresh on every item, large ones roughly every 0.5%.
        every = 1 if size <= 200 else int(size / 200)

    if unknown_total:
        # Without a total, show a full bar in the 'info' style.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            refresh_due = count == 1 or count % every == 0
            if refresh_due and unknown_total:
                caption.value = '{name}: {index} / ?'.format(
                    name=name, index=count)
            elif refresh_due:
                bar.value = count
                caption.value = u'{name}: {index} / {size}'.format(
                    name=name, index=count, size=size)
            yield item
    except:
        # Flag the bar for any interruption or failure, then propagate it.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        # A count of 0 (nothing was iterated) is rendered as '?'.
        caption.value = "{name}: {index}".format(
            name=name, index=str(count or '?'))

In [4]:
start_url = 'https://www.bundestag.de/ajax/filterlist/de/dokumente/-/442146/h_bcc811783a1be39391c6f3248151306d?limit=10000&noFilterSet=true&offset={}'
def crawl_pages(max_pages=None):
    """Crawl the archive listing page by page and save each page to dumps/html.

    Pages are requested from ``start_url`` with an increasing ``offset``
    (step 10) and written verbatim to ``./dumps/html/<offset>.html``.

    The crawl stops when a response contains no ``<a>`` elements at all, or
    after ``max_pages`` pages have been saved, whichever comes first.
    NOTE(review): the stop condition assumes the endpoint answers with a
    link-free page once the offset is past the end of the archive -- confirm
    against the live service.

    Parameters:
        max_pages: optional upper bound on the number of pages to save;
            ``None`` (the default) means no bound.

    Raises:
        requests.HTTPError: if a page request returns an error status.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    import os

    # Make sure the dump directory exists before the first write.
    out_dir = './dumps/html'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    offset = 0
    pages_saved = 0
    while max_pages is None or pages_saved < max_pages:
        current_url = start_url.format(offset)
        print('Requesting {}'.format(current_url))
        # A timeout keeps a stalled connection from hanging the crawl forever.
        r = requests.get(current_url, timeout=60)
        r.raise_for_status()
        # Without this check the loop had no exit and only stopped on an
        # error or a manual interrupt.
        if BeautifulSoup(r.content, 'lxml').find('a') is None:
            print('No links found at offset {}, stopping'.format(offset))
            break
        with open('./dumps/html/{}.html'.format(offset), 'wb') as f:
            f.write(r.content)
        pages_saved += 1
        offset += 10
        be_nice()

In [5]:
# Start the crawl; each fetched page is written to ./dumps/html/<offset>.html.
crawl_pages()

Requesting https://www.bundestag.de/ajax/filterlist/de/dokumente/-/442146/h_bcc811783a1be39391c6f3248151306d?limit=10000&noFilterSet=true&offset=0
Requesting https://www.bundestag.de/ajax/filterlist/de/dokumente/-/442146/h_bcc811783a1be39391c6f3248151306d?limit=10000&noFilterSet=true&offset=10
Requesting https://www.bundestag.de/ajax/filterlist/de/dokumente/-/442146/h_bcc811783a1be39391c6f3248151306d?limit=10000&noFilterSet=true&offset=20
Requesting https://www.bundestag.de/ajax/filterlist/de/dokumente/-/442146/h_bcc811783a1be39391c6f3248151306d?limit=10000&noFilterSet=true&offset=30
Requesting https://www.bundestag.de/ajax/filterlist/de/dokumente/-/442146/h_bcc811783a1be39391c6f3248151306d?limit=10000&noFilterSet=true&offset=40
Requesting https://www.bundestag.de/ajax/filterlist/de/dokumente/-/442146/h_bcc811783a1be39391c6f3248151306d?limit=10000&noFilterSet=true&offset=50
Requesting https://www.bundestag.de/ajax/filterlist/de/dokumente/-/442146/h_bcc811783a1be39391c6f3248151306d?limi

KeyboardInterrupt: 

In [24]:
import glob

def extract_pdf_links():
    """Yield the ``href`` of every PDF link found in the dumped HTML pages.

    Every file matching ``./dumps/html/*`` is parsed with BeautifulSoup
    (lxml parser) and the ``href`` of each anchor whose ``href`` ends in
    ``.pdf`` is yielded as written in the page. Progress over the files is
    reported through ``log_progress``.

    Yields:
        str: the ``href`` attribute value of each matching anchor.
    """
    # Sort so that the processing order is deterministic across runs;
    # glob.glob() returns files in arbitrary order.
    dumped_html = sorted(glob.glob('./dumps/html/*'))
    for html_path in log_progress(dumped_html, every=1):
        # The dumps were written as raw bytes, so read them back as bytes and
        # let BeautifulSoup detect the encoding instead of relying on the
        # platform's default text encoding.
        with open(html_path, 'rb') as f:
            soup = BeautifulSoup(f, 'lxml')
        # The document is fully parsed at this point, so the file is already
        # closed while the generator is suspended between yields.
        for anchor in soup.select('a[href$=".pdf"]'):
            yield anchor['href']

def download_all(url_iterator, path):
    """Download every URL from ``url_iterator`` into the directory ``path``.

    Each file is stored under the last path segment of its URL. URLs that
    answer with an HTTP error status are reported on stdout and skipped.
    Progress is reported through ``log_progress``, and ``be_nice()`` is
    called after each successful download.

    Parameters:
        url_iterator: iterable of URL strings to fetch.
        path: target directory (relative or absolute); it must already exist.

    Returns:
        None. This is a plain function, not a generator.
    """
    import os

    for url in log_progress(url_iterator, every=1):
        # A timeout keeps a stalled connection from blocking the whole run.
        r = requests.get(url, stream=True, timeout=60)
        try:
            try:
                r.raise_for_status()
            except requests.HTTPError:
                # Only HTTP status errors are skipped; anything else
                # (including KeyboardInterrupt) propagates.
                print('URL {}'.format(url))
                print('Status {}'.format(r.status_code))
                continue
            filename = url.split('/')[-1]
            # os.path.join keeps absolute target directories absolute;
            # stripping '/' from both ends of ``path`` turned them relative.
            with open(os.path.join(path, filename), 'wb') as f:
                for chunk in r.iter_content(chunk_size=512 * 1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        finally:
            # Streamed responses hold their connection until closed.
            r.close()
        be_nice()

In [28]:
# download_all() is a plain function that returns None and reports progress
# itself. Iterating over its result via log_progress() raised a TypeError once
# all downloads had finished, so call it directly.
download_all(extract_pdf_links(), './dumps/pdf')

KeyboardInterrupt: 