In [1]:
import re

import requests
from lxml import html

from slugify import slugify

import pandas as pd

In [2]:
# Payment year to scrape; passed to search_year() and embedded in the output CSV file name.
YEAR = 2017

In [3]:
# Every endpoint used by this notebook lives under the same application root.
_CAPBEN_ROOT = 'https://publicapps.agriculture.gov.ie/capben/'

# First page requested when a new session is opened.
BASE_URL = _CAPBEN_ROOT + 'loadsearch.do'
# Target of the captcha form post.
CAPTCHA_URL = _CAPBEN_ROOT + 'captchaservlet'
# Fetched right after the captcha post; also sent as Referer of the year form.
CHOOSE_URL = _CAPBEN_ROOT + 'loadyear.do'
# Receives the year form (same address as BASE_URL).
SEARCH_URL = _CAPBEN_ROOT + 'loadsearch.do'
# Receives the per-location search form.
DOSEARCH_URL = _CAPBEN_ROOT + 'searchcurrent.do'
# Detail page of a single search hit; fill the placeholder with the result number.
RECIPIENT_URL = _CAPBEN_ROOT + 'cb_results_current_year.jsp?&RecNo={}'

In [4]:
def get_session():
    """Open a new HTTP session and walk it through the site's entry sequence.

    Three requests are issued in order: a GET of BASE_URL, a POST of a fixed
    captcha answer/hash pair to CAPTCHA_URL, and a GET of CHOOSE_URL. None of
    the responses is inspected, so a rejected captcha is not detected here.

    Returns:
        requests.Session: the session object the three requests were made with.
    """
    # Hard-coded answer/hash pair submitted to the captcha endpoint.
    captcha_form = {
        'captcha-input': 'CKDPKY',
        'captcha-inputHash': '-1449362229',
    }
    captcha_headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://publicapps.agriculture.gov.ie/capben/loadsearch.do',
    }

    http = requests.Session()
    http.get(BASE_URL)
    http.post(CAPTCHA_URL, data=captcha_form, headers=captcha_headers)
    http.get(CHOOSE_URL)
    return http

In [7]:
def search_year(year):
    """Yield one lazy row generator per search location for ``year``.

    Opens a session, posts the year form to SEARCH_URL and reads the option
    values of the ``location`` select box from the response. For each value
    (except ``'%'``) a progress line is printed and the generator returned by
    ``search_year_location`` is yielded.

    Args:
        year: Year to search; sent to the site as ``str(year)``.

    Yields:
        generator: the not-yet-started generator of payment-row dicts for one
        location.

    Note:
        ``search_year_location`` is a generator function, so calling it runs
        none of its body. Any scraping error is raised while the *caller*
        iterates the yielded generator and cannot be intercepted in this
        function; wrapping the ``yield`` in ``try/except`` to retry has no
        effect, which is why no such handler is kept here.
    """
    session = get_session()
    response = session.post(
        SEARCH_URL,
        data={
            'Year': str(year),
            'searchYear': str(year),
            'PageFrom': 'Year',
        },
        headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': CHOOSE_URL,
        },
    )
    root = html.fromstring(response.text)
    # '%' is skipped -- presumably the "all locations" wildcard option; confirm
    # against the live form.
    locations = [
        value
        for value in root.xpath('//select[@name="location"]/option/@value')
        if value != '%'
    ]
    for location in locations:
        print('Searching', year, location)
        yield search_year_location(session, year, location)

# Captures the total hit count from header text such as
# "Viewing results 1 to 10 of 1234". Raw string: "\d" in a normal string
# literal is an invalid escape sequence and triggers a warning on import.
RESULT_RE = re.compile(r'Viewing results \d+ to \d+ of (\d+)')

def do_search(session, year, location):
    """Submit the search form for one location and return the total hit count.

    Posts the form to DOSEARCH_URL with empty name/address filters, then reads
    the count from the first ``<span class="view_head">`` of the response via
    RESULT_RE.

    Args:
        session: Session object used for the POST (as returned by get_session).
        year: Year to search; sent as ``str(year)``.
        location: Location option value to search for.

    Returns:
        int: the total number of results reported by the page header.

    Raises:
        ValueError: if the response has no ``view_head`` span, or its text does
            not contain a "Viewing results X to Y of N" count. (Previously
            these cases surfaced as a bare IndexError / AttributeError.)
    """
    post = {
        'searchYear': str(year),
        'location': location,
        'surname': '',
        'forename': '',
        'address': '',
        # 'range', 'measure' and 'sort' are form codes sent as-is; their exact
        # meaning is defined by the site's search form, not by this notebook.
        'range': '6',
        'measure': 'All Measures (default)',
        'sort': '3',
    }
    response = session.post(DOSEARCH_URL, data=post)
    root = html.fromstring(response.text)

    headers = root.xpath('//span[@class="view_head"]')
    if not headers:
        raise ValueError(
            'No results header in search response for %s %s' % (year, location)
        )
    match = RESULT_RE.search(headers[0].text_content())
    if match is None:
        raise ValueError(
            'No result count in search response header for %s %s' % (year, location)
        )
    return int(match.group(1))


def search_year_location(session, year, location, max_retries=None):
    """Yield every payment row for one location, result page by result page.

    Runs the search once to learn the total number of results, then fetches
    result numbers ``1..total`` with get_result. When fetching a result fails,
    the error is printed, a fresh session is opened, the search is re-run on it
    and the same result number is tried again.

    Args:
        session: Session object to start with; replaced locally after a failure.
        year: Year being searched.
        location: Location option value being searched.
        max_retries: Maximum number of consecutive retries for a single result
            number before the last error is re-raised. ``None`` (the default)
            retries without limit, which is the historical behaviour.

    Yields:
        dict: payment rows as produced by get_result.

    Raises:
        Exception: whatever the initial do_search raises (it is not retried),
            whatever get_session/do_search raise while recovering, or the last
            get_result error once ``max_retries`` is exceeded.
    """
    total_results = do_search(session, year, location)
    num = 1
    failures = 0  # consecutive failures for the current result number
    while num <= total_results:
        print('%.2f%%\r' % (num / total_results * 100), end='')
        try:
            # Collect all rows of this result before yielding any of them, so a
            # failure halfway through cannot emit rows that the retry would
            # then emit a second time.
            rows = list(get_result(session, year, num))
        except Exception as e:
            print(e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Start over with a new session and repeat the search on it before
            # retrying the same result number.
            session = get_session()
            do_search(session, year, location)
            continue
        failures = 0
        yield from rows
        num += 1
    
def get_result(session, year, result_no):
    """Fetch one result's detail page and yield its payment rows.

    Args:
        session: Session object used for the GET request.
        year: Year stored on every row and used in the id of numeric recipients.
        result_no: Result number substituted into RECIPIENT_URL.

    Yields:
        dict: one row per table line with a non-blank amount, with keys
        ``recipient_id``, ``recipient_name``, ``recipient_location``,
        ``scheme``, ``amount`` (float), ``year``, ``currency`` ('EUR') and
        ``country`` ('IE').

    Raises:
        IndexError: if the expected detail ``div`` elements are missing.
        ValueError: if a non-blank amount cell cannot be parsed as a number.
    """
    response = session.get(RECIPIENT_URL.format(result_no))
    root = html.fromstring(response.text)
    details = root.xpath('.//div[starts-with(@class, "results-detail-table")]')[0]
    recipient_name = details.xpath('./div[1]/div[2]')[0].text_content()
    recipient_location = details.xpath('./div[2]/div[2]')[0].text_content()

    try:
        int(recipient_name)
    except ValueError:
        # Ordinary name: build the id from slugified name and location.
        recipient_id = '%s-%s' % (slugify(recipient_name), slugify(recipient_location))
    else:
        # The "name" is purely numeric: use it (prefixed with the year) as the
        # id and record no name.
        recipient_id = '%s-%s' % (year, recipient_name)
        recipient_name = None

    # Skip the first table row (position() > 1), presumably the header row.
    trs = root.xpath('.//div[@class="results-detail-table-content"]/table//tr[position() > 1]')
    for tr in trs:
        tds = tr.xpath('./td')
        if len(tds) != 3:
            continue
        # Strip before the emptiness test so a whitespace-only cell is skipped
        # instead of reaching float('') and raising ValueError.
        amount_text = tds[1].text_content().strip()
        if not amount_text:
            continue
        amount = float(amount_text.replace('€', '').replace(',', '').strip())
        scheme = tds[0].text_content().strip()
        yield {
            'recipient_id': recipient_id,
            'recipient_name': recipient_name,
            'recipient_location': recipient_location,
            'scheme': scheme,
            'amount': amount,
            'year': year,
            'currency': 'EUR',
            'country': 'IE'
        }

In [8]:
# Scrape YEAR location by location. After each location the combined table is
# rewritten to disk, so an interrupted run still leaves the locations finished
# so far in the CSV.
frames = []  # one DataFrame per location that returned rows
df = pd.DataFrame()
for gen in search_year(YEAR):
    frame = pd.DataFrame(gen)
    if not frame.empty:
        # Concatenate only non-empty frames (recent pandas versions warn when
        # empty entries are passed to concat) and renumber the index so row
        # labels are not repeated for every location.
        frames.append(frame)
        df = pd.concat(frames, ignore_index=True)
    df.to_csv('ie_%s.csv.gz' % YEAR, compression='gzip', index=False)

Searching 2017 CARLOW
Searching 2017 CAVAN
Searching 2017 CLARE
Searching 2017 CORK
Searching 2017 DONEGAL
Searching 2017 DUBLIN
Searching 2017 GALWAY
Searching 2017 KERRY
Searching 2017 KILDARE
Searching 2017 KILKENNY
Searching 2017 LAOIS
Searching 2017 LEITRIM
Searching 2017 LIMERICK
Searching 2017 LONGFORD
Searching 2017 LOUTH
Searching 2017 MAYO
Searching 2017 MEATH
Searching 2017 MONAGHAN
Searching 2017 NORTHERN IRELAND
Searching 2017 OFFALY
Searching 2017 OUTSIDE IRELAND
Searching 2017 ROSCOMMON
Searching 2017 SLIGO
Searching 2017 TIPPERARY
Searching 2017 WATERFORD
Searching 2017 WESTMEATH
Searching 2017 WEXFORD
Searching 2017 WICKLOW


In [9]:
# Preview the first rows of the scraped payments table.
df.head()

Unnamed: 0,amount,country,currency,recipient_id,recipient_location,recipient_name,scheme,year
0,167910.74,IE,EUR,roscat-tillage-co-ltd-carlow,CARLOW,ROSCAT TILLAGE CO LTD,Direct Payments,2017
1,31801.3,IE,EUR,roscat-tillage-co-ltd-carlow,CARLOW,ROSCAT TILLAGE CO LTD,Investments in Physical Assets,2017
2,4249.99,IE,EUR,richard-cope-carlow,CARLOW,RICHARD COPE,Agri Environment Climate,2017
3,194746.16,IE,EUR,richard-cope-carlow,CARLOW,RICHARD COPE,Direct Payments,2017
4,4249.95,IE,EUR,george-byrne-carlow,CARLOW,GEORGE BYRNE,Agri Environment Climate,2017
