#### Requirements:

- requests
- beautifulsoup4
- pdfminer.six
- dask
- distributed

In [1]:
import os
import StringIO
from collections import Counter

import requests
from bs4 import BeautifulSoup

import dask.bag as db
from dask.distributed import Client
from distributed.diagnostics import progress
from pdfminer.high_level import extract_text_to_fp as extract

# Download docs

In [2]:
# Fetch the MEK export page and keep its first <table> element, which the
# following cells iterate over.
resp = requests.get('http://www.mek.oszk.hu/export/mek2excel.htm')
# Fail right here with a clear HTTP error instead of parsing an error page
# and tripping over a missing table in a later cell.
resp.raise_for_status()
soup = BeautifulSoup(resp.content, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError('no <table> element found in the export page')

### Get most common formats

In [3]:
# Tally the formats listed in the second-to-last cell of every table row.
formats = []
for tr in table.findAll('tr'):
    tds = tr.findAll('td')
    # Rows with fewer than two cells have no second-to-last cell to read.
    if len(tds) > 1:
        # A single cell may list several formats separated by '|'.
        for f in tds[-2].text.split('|'):
            f = f.strip()
            # Strip *before* the emptiness test so that whitespace-only
            # pieces are not counted as an empty-string format.
            if f:
                formats.append(f)

Counter(formats).most_common(5)

[(u'PDF', 15049),
 (u'HTML', 6288),
 (u'RTF', 5173),
 (u'PVU', 2430),
 (u'WORD 8.0', 2066)]

### Download docs in most common format

In [3]:
def download_pdfs_as_txt(link, timeout=60):
    """Download the PDF behind *link* and store its text under ``./docs/``.

    The document id is the second-to-last ``/``-separated piece of *link*
    (so *link* is expected to end with a slash).  The PDF is fetched from
    ``link + id + '.pdf'`` and the extracted text is written to
    ``./docs/<id>.txt``.  A non-empty text file left by an earlier run is
    reused instead of downloading again.

    Args:
        link: URL of the document directory, ending in ``/``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        1 if a text file is available after the call (reused or freshly
        extracted); 0 if the link is unusable, the download fails or the
        PDF cannot be converted.
    """
    # Local import: ``io.BytesIO`` exists on Python 2 and 3, whereas the
    # ``StringIO`` module is Python 2 only.
    import io

    # Anchors without an href yield None; treat those as a failed download
    # instead of crashing the whole computation.
    if not link:
        return 0
    parts = link.split('/')
    if len(parts) < 2 or not parts[-2]:
        return 0

    filename = parts[-2]
    path = './docs/' + filename + '.txt'
    if os.path.isfile(path) and os.path.getsize(path) > 0:
        return 1

    try:
        response = requests.get(link + filename + '.pdf', timeout=timeout)
    except requests.RequestException:
        return 0
    if response.status_code != 200:
        return 0

    if not os.path.isdir('./docs'):
        try:
            os.makedirs('./docs')
        except OSError:
            # Another worker may have created the directory in the meantime.
            if not os.path.isdir('./docs'):
                raise

    converted = True
    # Binary mode: with a codec given, the extractor emits encoded bytes.
    with open(path, 'wb') as out:
        try:
            extract(io.BytesIO(response.content), out, codec='utf-8')
        except Exception:
            # Conversion is best-effort: any failure counts as a miss, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            converted = False

    if not converted:
        # Remove the partial output so a later run does not mistake it for
        # a successfully converted document.
        try:
            os.remove(path)
        except OSError:
            pass
        return 0
    return 1

In [5]:
# Collect the link targets of the table.  ``href=True`` skips anchors that
# carry no href attribute, which would otherwise show up as None links.
links = db.from_sequence(
    [a.get('href') for a in table.findAll('a', href=True)],
    partition_size=50
)

# Run the downloads on a local distributed client; ``results`` holds one
# 0/1 flag per link, as returned by download_pdfs_as_txt.
with Client() as client:
    docs = links.map(download_pdfs_as_txt)
    future = client.compute(docs)
    progress(future)
    results = future.result()

In [6]:
# Summarise how many links ended up as text files.
succeeded = sum(results)
total = len(results)
# An empty result list would otherwise raise ZeroDivisionError.
percent = succeeded / float(total) * 100 if total else 0.0
# A parenthesised single-argument print behaves the same as the Python 2
# statement and as the Python 3 function.
print('{}/{} - {:.2f}%'.format(succeeded, total, percent))

12713/16675 - 76.24%
