In [10]:
import re

import requests
from lxml import html

from slugify import slugify

import pandas as pd

In [2]:
# Reporting year to scrape; passed to search_year() and embedded in the output file name.
YEAR = 2016

In [5]:
# Every endpoint used by the scraper lives under the same application root.
_CAPBEN_ROOT = 'https://publicapps.agriculture.gov.ie/capben/'

# Landing/search page; BASE_URL and SEARCH_URL point at the same address.
BASE_URL = _CAPBEN_ROOT + 'loadsearch.do'
SEARCH_URL = _CAPBEN_ROOT + 'loadsearch.do'
# Endpoint the captcha form is posted to.
CAPTCHA_URL = _CAPBEN_ROOT + 'captchaservlet'
# Year-selection page.
CHOOSE_URL = _CAPBEN_ROOT + 'loadyear.do'
# Endpoint that executes a search.
DOSEARCH_URL = _CAPBEN_ROOT + 'searchcurrent.do'
# Detail page template; the placeholder takes the record number within the current search.
RECIPIENT_URL = _CAPBEN_ROOT + 'cb_results_current_year.jsp?&RecNo={}'

In [7]:
def get_session():
    """Create a fresh ``requests.Session`` and walk it through the site's entry pages.

    The session requests the base page, posts a fixed captcha input/hash pair
    to the captcha endpoint, and then requests the year-selection page.
    None of the responses are inspected.

    Returns:
        The ``requests.Session`` used for those three requests.
    """
    # Fixed captcha answer and its hash (presumably a known-valid pair -- TODO confirm).
    captcha_form = {'captcha-input': 'CKDPKY', 'captcha-inputHash': '-1449362229'}
    captcha_headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://publicapps.agriculture.gov.ie/capben/loadsearch.do',
    }

    http = requests.Session()
    http.get(BASE_URL)
    http.post(CAPTCHA_URL, data=captcha_form, headers=captcha_headers)
    http.get(CHOOSE_URL)
    return http

In [27]:
def search_year(year, max_retries=5):
    """Scrape every location offered by the search form for ``year``.

    The location list is read from the ``<select name="location">`` element of
    the search page; the ``'%'`` option is skipped (presumably the "all
    locations" wildcard -- TODO confirm).

    Each location's records are fully collected *inside* the ``try`` block
    before being yielded. Yielding the lazy generator itself would let any
    scraping error surface in the consumer, where the session-refresh retry
    below could never see it.

    Args:
        year: Year to search for; sent to the site as a string.
        max_retries: Number of consecutive failed attempts tolerated per
            location before the last exception is re-raised.

    Yields:
        One list of record dicts (as produced by ``get_result``) per location.

    Raises:
        Exception: whatever the last attempt raised, once ``max_retries``
            consecutive attempts for a single location have failed.
    """
    session = get_session()
    response = session.post(
        SEARCH_URL,
        data={
            'Year': str(year),
            'searchYear': str(year),
            'PageFrom': 'Year',
        },
        headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': CHOOSE_URL,
        },
    )
    root = html.fromstring(response.text)
    locations = [
        value
        for value in root.xpath('//select[@name="location"]/option/@value')
        if value != '%'
    ]

    for location in locations:
        failures = 0
        while True:
            print('Searching', year, location)
            try:
                # Materialize here so exceptions are raised in this frame.
                rows = list(search_year_location(session, year, location))
            except Exception as e:
                failures += 1
                if failures > max_retries:
                    raise
                print(e)
                # Start over with a brand-new session before retrying the location.
                session = get_session()
                continue
            break
        yield rows

# Extracts the total hit count from the "Viewing results A to B of N" header.
# Raw string: "\d" in a plain literal is an invalid escape sequence.
RESULT_RE = re.compile(r'Viewing results \d+ to \d+ of (\d+)')

def do_search(session, year, location):
    """Run a search for one year/location and return the reported total hit count.

    Posts the search form with empty name/address filters and fixed
    ``range``/``measure``/``sort`` values, then reads the total from the
    first ``<span class="view_head">`` element of the response.

    Args:
        session: Object whose ``post`` method sends the request.
        year: Year to search; sent as a string.
        location: Location value to filter on.

    Returns:
        The total number of results as an ``int``.

    Raises:
        IndexError: if the response contains no ``view_head`` span.
        AttributeError: if the span text does not match ``RESULT_RE``.
    """
    form = dict(
        searchYear=str(year),
        location=location,
        surname='',
        forename='',
        address='',
        range='6',
        measure='99',
        sort='3',
    )
    page = html.fromstring(session.post(DOSEARCH_URL, data=form).text)
    header = page.xpath('//span[@class="view_head"]')[0]
    found = RESULT_RE.search(header.text_content())
    return int(found.group(1))


def search_year_location(session, year, location, max_retries=5):
    """Yield every payment row for all records of one year/location search.

    Runs the search to learn the total number of records, then fetches the
    records one by one (numbered from 1). Progress is printed as a percentage
    on a single, carriage-return-terminated line.

    A record's rows are collected completely before any of them is yielded.
    If fetching or parsing fails part-way, the retry therefore cannot emit
    rows that were already handed to the consumer.

    Args:
        session: Session used for the requests; replaced locally by a fresh
            one after a failure.
        year: Year being searched.
        location: Location value being searched.
        max_retries: Number of consecutive failures tolerated for a single
            record before the last exception is re-raised.

    Yields:
        Row dicts as produced by ``get_result``.

    Raises:
        Exception: whatever the last attempt raised, once ``max_retries``
            consecutive attempts for one record have failed; also anything
            raised by ``do_search`` or ``get_session``.
    """
    total_results = do_search(session, year, location)
    num = 1
    failures = 0
    while num <= total_results:
        print('%.2f%%\r' % (num / total_results * 100), end='')
        try:
            rows = list(get_result(session, year, num))
        except Exception as e:
            failures += 1
            if failures > max_retries:
                # A persistent error would otherwise loop forever.
                raise
            print(e)
            # New session, then repeat the search so record numbers resolve again.
            session = get_session()
            do_search(session, year, location)
            continue
        failures = 0
        yield from rows
        num += 1
    
def get_result(session, year, result_no):
    """Fetch one recipient detail page and yield a dict per payment row.

    The recipient name and location are read from the first and second rows
    of the ``results-detail-table`` block. When the name parses as an
    integer, the id is built from the year and that number and the name is
    reported as ``None``; otherwise the id is built from the slugified name
    and location.

    Args:
        session: Object whose ``get`` method fetches the page.
        year: Year being scraped; copied into each row and used in numeric ids.
        result_no: Record number substituted into ``RECIPIENT_URL``.

    Yields:
        Dicts with keys ``recipient_id``, ``recipient_name``,
        ``recipient_location``, ``scheme``, ``amount`` (float), ``year``,
        ``currency`` and ``country``.

    Raises:
        IndexError: if the expected detail blocks are missing from the page.
        ValueError: if a non-empty amount cell cannot be parsed as a float.
    """
    response = session.get(RECIPIENT_URL.format(result_no))
    root = html.fromstring(response.text)
    details = root.xpath('.//div[starts-with(@class, "results-detail-table")]')[0]
    recipient_name = details.xpath('./div[1]/div[2]')[0].text_content()
    recipient_location = details.xpath('./div[2]/div[2]')[0].text_content()
    try:
        int(recipient_name)
    except ValueError:
        recipient_id = '%s-%s' % (slugify(recipient_name), slugify(recipient_location))
    else:
        # Purely numeric "name": use it as an id scoped to the year.
        recipient_id = '%s-%s' % (year, recipient_name)
        recipient_name = None
    # position() > 1 skips the first table row (presumably the header -- TODO confirm).
    trs = root.xpath('.//div[@class="results-detail-table-content"]/table//tr[position() > 1]')
    for tr in trs:
        tds = tr.xpath('./td')
        if len(tds) != 3:
            continue
        # Clean first, then test for emptiness: a cell holding only whitespace
        # or a bare currency symbol would otherwise reach float('') and raise.
        amount_text = tds[1].text_content().replace('€', '').replace(',', '').strip()
        if not amount_text:
            continue
        amount = float(amount_text)
        scheme = tds[0].text_content().strip()
        yield {
            'recipient_id': recipient_id,
            'recipient_name': recipient_name,
            'recipient_location': recipient_location,
            'scheme': scheme,
            'amount': amount,
            'year': year,
            'currency': 'EUR',
            'country': 'IE'
        }

In [28]:
# Scrape location by location, rewriting the CSV after each one so that a
# crash part-way through still leaves the data gathered so far on disk.
df = pd.DataFrame()
frames = []
for gen in search_year(YEAR):
    frame = pd.DataFrame(gen)
    if not frame.empty:
        # Keep empty frames out of the concat, and renumber the index so row
        # labels are unique across locations instead of restarting at 0.
        frames.append(frame)
        df = pd.concat(frames, ignore_index=True)
    df.to_csv('ie_%s.csv.gz' % YEAR, compression='gzip', index=False)

Searching 2016 CARLOW
Searching 2016 CAVAN
Searching 2016 CLARE
Searching 2016 CORK
Searching 2016 DONEGAL
Searching 2016 DUBLIN
Searching 2016 GALWAY
Searching 2016 KERRY
Searching 2016 KILDARE
Searching 2016 KILKENNY
Searching 2016 LAOIS
Searching 2016 LEITRIM
Searching 2016 LIMERICK
Searching 2016 LONGFORD
Searching 2016 LOUTH
Searching 2016 MAYO
Searching 2016 MEATH
Searching 2016 MONAGHAN
Searching 2016 NORTHERN IRELAND
Searching 2016 OFFALY
Searching 2016 OUTSIDE IRELAND
Searching 2016 ROSCOMMON
Searching 2016 SLIGO
Searching 2016 TIPPERARY
Searching 2016 WATERFORD
Searching 2016 WESTMEATH
Searching 2016 WEXFORD
Searching 2016 WICKLOW
100.00%