In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import aiohttp
import asyncio
from tqdm import tqdm # adds progress bar 
import urllib.parse # extract candidate ID



### code to get all 20_000 links in this

Takes about 14 minutes

Although, there appears to be other links that are not sequential. such as Councilor Rory O'Connor: https://www.irelandelection.com/candidate.php?candid=18829

In [13]:
number_of_links = 20
failed = [] # list to store failed attempts
my_timeout = aiohttp.ClientTimeout(
    total=None, # default value is 5 minutes, set to `None` for unlimited timeout
    sock_connect=150, # How long to wait before an open socket allowed to connect
    sock_read=150 # How long to wait with no data being read before timing out
)

client_args = dict(
    trust_env=True,
    timeout=my_timeout
)


async def fetch(session, url, pbar):
    """Fetch a url, using specified ClientSession."""
    async with session.get(url) as response:
        try:
            resp = await response.read()
            pbar.update(1)
            return (url, resp)
        except asyncio.TimeoutError:
            failed.append(url)
            print('timeout')
            pbar.update(1)
            return {"results": f"timeout error on {url}"}
        if response.status != 200:
            failed.append(url)
            print('error')
            pbar.update(1)
            return {"error": f"server returned {response.status}"}

async def get_responses(session, pbar):
    tasks = []
    for i in range(1, number_of_links):
        url = f'https://www.irelandelection.com/candidate.php?candid={i}'
        tasks.append(fetch(session, url, pbar))
    responses = await asyncio.gather(*tasks, return_exceptions=True)
    return responses

connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.ClientSession(connector=connector,**client_args) as s:
    with tqdm(total=number_of_links) as pbar:
        responses = await get_responses(s, pbar)

 95%|█████████████████████████████████████████████████████████████████████████████▉    | 19/20 [00:00<00:00, 26.47it/s]


In [14]:
len(failed)

0

In [16]:
print('Failed on these links:\n',failed)

Failed on these links:
 []


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


### Functions for getting data from the page

In [7]:
def get_candidate_name(soup):
    name = soup.find('title').text.split(' - ')[0].strip()
    return name

def get_election_type(election_string):
    election_string = election_string.lower()
    if 'local election' in election_string:
        return 'LOCAL'
    elif 'general election' in election_string:
        return 'GENERAL'
    elif 'european election' in election_string:
        return 'EUROPEAN'
    elif 'presidential election' in election_string:
        return 'PRESIDENTIAL'
    elif 'by-election' in election_string:
        return 'BI-ELECTION'
    
def get_constituency_name(election_string):
    constituency =  election_string.split(' - ')[1].strip()
    return constituency

def process_row(row):
    d = {}
    count = 0
    column_names = ['election','first_pref_pct','first_pref_count','first_pref_quota_ratio','elected']
    for td in row.find_all('td'):
        txt = td.text
        if txt:
            d[column_names[count]] = txt.strip()
            count+=1
            d['elected'] = True if td.find('img') else False
        elif td.find('img').get('title'):
            d['party'] = td.find('img').get('title')
    return d
    
def extract_table(soup):
    list_of_rows = []
    table_body = soup.find('tbody')
    if not table_body:
        raise Exception('No Table Found')
    rows = table_body.find_all('tr')
    for row in rows:
        d = process_row(row)
        list_of_rows.append(d)
    return list_of_rows

def create_dataframe(name,list_of_rows,candid_id):
    df = pd.DataFrame(list_of_rows)
    df['elected'] = df.elected.fillna(False)
    df['year'] = df.election.apply(lambda election_string: int(election_string[:4]))
    df['candidate'] = name
    df['constituency'] = df.election.apply(get_constituency_name)
    df['election_type'] = df.election.apply(get_election_type)
    df['candid_id'] = 
    return df

def extract_data_from_page(res,candid_id):
    soup = BeautifulSoup(res)
    if len(list(soup.stripped_strings)) == 3:
        raise Exception('Empty Page')
    name = get_candidate_name(soup)
    id_rep = soup.find('id').text.split(' - ')[0].strip()
    rows = extract_table(soup)
    df = create_dataframe(name,rows,candid_id)
    return df

def get_candid_id(url) 
    
    parsed_url = urllib.parse.urlparse(url)
    query_string = parsed_url.query
    query_string_dict = urllib.parse.parse_qs(query_string)
    candid_id = query_string_dict['candid'][0]
    
    return candid_id

#extract_data_from_page()

In [23]:
list_of_dataframes = []
fails = []
for (url,resp) in responses:
    #print(url)
    try:
        candid_id = get_candid_id(url) # getting the id from url 
        dataframe = extract_data_from_page(resp,candid_id)
        list_of_dataframes.append(dataframe)
    except Exception as e: 
        print('Failed:',url)
        print(e,'\n------------------------')
        if str(e) == 'No Table Found':
            fails.append((url,resp))

https://www.irelandelection.com/candidate.php?candid=1
https://www.irelandelection.com/candidate.php?candid=2
https://www.irelandelection.com/candidate.php?candid=3
https://www.irelandelection.com/candidate.php?candid=4
https://www.irelandelection.com/candidate.php?candid=5
https://www.irelandelection.com/candidate.php?candid=6
https://www.irelandelection.com/candidate.php?candid=7
https://www.irelandelection.com/candidate.php?candid=8
https://www.irelandelection.com/candidate.php?candid=9
https://www.irelandelection.com/candidate.php?candid=10
https://www.irelandelection.com/candidate.php?candid=11
https://www.irelandelection.com/candidate.php?candid=12
https://www.irelandelection.com/candidate.php?candid=13
https://www.irelandelection.com/candidate.php?candid=14
https://www.irelandelection.com/candidate.php?candid=15
https://www.irelandelection.com/candidate.php?candid=16
https://www.irelandelection.com/candidate.php?candid=17
https://www.irelandelection.com/candidate.php?candid=18
h

have to run throught the failed ones before concatinating the dataframes. 

Fails happen for 2 reasons:
- The page is empty eg: https://www.irelandelection.com/candidate.php?candid=30
- The page didnt return anything because I sent so many requests that their server couldnt respond.

We send the requests back through again for the failed because of lack of tables

In [9]:
fails2 = []
tasks = []

connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.ClientSession(connector=connector,**client_args) as s:
    for (url,resp) in fails:
        tasks.append(fetch(s,url)) 
    responses2 = await asyncio.gather(*tasks, return_exceptions=True)


for (url,resp) in responses2:
    #print(url)
    try:
        dataframe = extract_data_from_page(resp)
        list_of_dataframes.append(dataframe)
    except Exception as e: 
        print('Failed:',url)
        print(e,'\n------------------------')
        if str(e) == 'No Table Found':
            fails2.append(
                (url,resp)
                )

In [10]:
print(fails2)

[]


In [11]:
DF = pd.concat(list_of_dataframes)
DF.dtypes

election                  object
elected                     bool
party                     object
first_pref_pct            object
first_pref_count          object
first_pref_quota_ratio    object
year                       int64
candidate                 object
constituency              object
election_type             object
dtype: object

In [41]:
DF['first_pref_pct'] = DF['first_pref_pct'].replace('Candidate for forthcoming election',None)
DF['first_pref_pct'] = DF['first_pref_pct'].str.replace('%','').astype(float)/100
DF['first_pref_count'] = DF.first_pref_count.fillna(0).astype(int)
DF['first_pref_quota_ratio'] = DF.first_pref_quota_ratio.astype(float)
DF.dtypes

election                   object
elected                      bool
party                      object
first_pref_pct            float64
first_pref_count            int32
first_pref_quota_ratio    float64
year                        int64
candidate                  object
constituency               object
election_type              object
dtype: object

In [None]:
DF = DF.reset_index().drop(columns=['index'])

In [43]:
DF.to_parquet('ALL_CANDIDATES.parquet')

For every elections we want
- Number of Constituencies
- How many consituency do we have vote data on?
- What was the quota?
- What was the votes/quota in first count?
- What was the lowest votes/quota?
- What was the highest votes/quota?
- Who transfered to who (if you have transfer data)