In [65]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import aiohttp
import asyncio


### Code to fetch all 12_015 candidate pages from ElectionsIreland.org

This takes about 14 minutes to run.

In [66]:
# Shared accumulator: the fetch helpers append the URL of every request they record as failed.
request_failed = []

# total=None     -> no overall deadline (the default is 5 minutes)
# sock_connect   -> seconds to wait for an open socket to be allowed to connect
# sock_read      -> seconds to wait with no data being read before timing out
my_timeout = aiohttp.ClientTimeout(total=None, sock_connect=150, sock_read=150)

# Keyword arguments reused for every aiohttp.ClientSession created in this notebook.
client_args = {
    'trust_env': True,
    'timeout': my_timeout,
}

async def fetch(session,url,list_to_store_failed_requests):
    """Fetch a url, using specified ClientSession.

    Parameters
    ----------
    session : object exposing ``get(url)`` as an async context manager
        (an ``aiohttp.ClientSession`` in this notebook).
    url : str
        Address to request.
    list_to_store_failed_requests : list
        Mutated in place: ``url`` is appended when the request fails.

    Returns
    -------
    tuple
        ``(url, body_bytes)`` when the server answers with status 200,
        ``(url, None)`` when it answers with any other status or the request
        times out. The result is always a 2-tuple so callers can unpack it
        uniformly as ``(url, resp)``.
    """
    try:
        # The whole request lives inside the try so that a timeout while
        # connecting is handled the same way as a timeout while reading.
        async with session.get(url) as response:
            # Check the status *before* reading: an error page must never be
            # handed back as if it were a valid candidate page.
            if response.status != 200:
                list_to_store_failed_requests.append(url)
                print('error')
                return (url, None)
            resp = await response.read()
            return (url,resp)
    except asyncio.TimeoutError:
        list_to_store_failed_requests.append(url)
        print('timeout')
        return (url, None)

async def get_responses(session,urls,list_to_store_failed_requests):
    """Request every address in ``urls`` concurrently through ``session``.

    Returns a dict with two keys:
      * ``'responses'`` - the gathered results, one per url and in the same
        order as ``urls``; because ``return_exceptions=True`` is used, a
        request that raised contributes its exception object instead of
        aborting the whole batch.
      * ``'failed_requests'`` - the very list passed in as
        ``list_to_store_failed_requests``.
    """
    pending = [
        fetch(session, target, list_to_store_failed_requests)
        for target in urls
    ]
    gathered = await asyncio.gather(*pending, return_exceptions=True)
    return {
        'responses': gathered,
        'failed_requests': list_to_store_failed_requests,
    }

urls = []
for i in range(1,12_016):
    url = f'https://www.electionsireland.org/candidate.cfm?ID={i}'
    urls.append(url)

connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.ClientSession(connector=connector,**client_args) as s:
    responses = await get_responses(s,urls,request_failed)

In [67]:
# Show the URLs that the fetch step recorded in `request_failed`.
print('The request failed on these links:\n',request_failed)

The request failed on these links:
 []


### Extracting the data from the responses

In [68]:
def get_candidate_name(soup):
    """Return the whitespace-stripped text of the ``h1`` element found in ``soup``.

    Raises AttributeError when the page has no ``h1`` (``find`` returns None).
    """
    heading = soup.find('h1')
    return heading.text.strip()

In [69]:
def format_relatives_name(name):
    '''
    Remove a dangling opening bracket from the end of a relative's name.

    Sometimes the name has a bracket at the end of it, because the designer of
    the site decided that only the words inside the bracket were going to be
    put in a strong tag.

    Parameters: name (str) - the raw name text; may be empty.
    Returns: the name without the trailing '(' and surrounding whitespace, or
    the name unchanged when it does not end with '('.
    '''
    # endswith() is safe on an empty string, whereas name[-1] raises IndexError.
    if name.endswith('('):
        return name[:-1].strip()
    return name

def get_relatives(soup):
    """Build a DataFrame describing the relatives listed on a candidate page.

    The first ``rhtable`` table whose text contains 'Relatives' is used. Every
    link inside its ``boundary`` cell becomes one row with the columns
    ``family_member``, ``relationship`` and ``family_member_ID``. An empty
    DataFrame is returned when no such table exists.
    """
    matching_tables = [
        candidate
        for candidate in soup.find_all('table',{'class':'rhtable'})
        if 'Relatives' in candidate.text.strip()
    ]
    if not matching_tables:
        return pd.DataFrame()

    anchors = matching_tables[0].find('td',{'class':'boundary'}).find_all('a')
    rows = [
        {
            'family_member': format_relatives_name(anchor.contents[0].strip()),
            'relationship': anchor.em.text.strip(),
            # the ID is whatever follows the last '=' in the link target
            'family_member_ID': anchor.get('href').split('=')[-1],
        }
        for anchor in anchors
    ]
    return pd.DataFrame(rows)

#get_relatives(BeautifulSoup(r.content))

check for relatives.
biographical information

In [70]:
def process_row(row,column_names):
    '''
    Turn one table row into a dict keyed by ``column_names``.

    The position in ``column_names`` is tracked by hand because the table
    contains td tags that are pure spacers; the position only advances when a
    cell really carries (or deliberately lacks) a value:

    * a cell with text fills the next column, unless the text contains
      'Unopposed', in which case ``ran_unopposed`` is set to True instead;
    * a cell without text but with an ``align`` attribute is a genuine empty
      value (stored as None), except when it holds an image - then the
      image's title is stored under ``party`` without advancing the position;
    * a cell with neither text nor ``align`` is a spacer and is skipped.
    '''
    record = {}
    position = 0
    for cell in row.find_all('td'):
        stripped = cell.text.strip()

        if stripped:
            if 'Unopposed' in cell.text:
                record['ran_unopposed'] = True
            else:
                record[column_names[position]] = stripped
                position += 1
            continue

        if not cell.get('align'):
            # spacer cell: no text and no align attribute
            continue

        logo = cell.find('img')
        if logo:
            # party is not one of the listed columns, so the position stays put
            record['party'] = logo.get('title').strip()
        else:
            record[column_names[position]] = None
            position += 1
    return record


def extract_candidate_table(soup):
    """Return one dict per data row of the candidate's results table.

    The results table is taken to be the second-to-last ``rhtable`` table on
    the page (IndexError if there are fewer than two). Its first two ``tr``
    rows are skipped, and reading stops at the first row containing an ``hr``.
    """
    column_names = [
        'date',
        'election_type',
        'status',
        'constituency_name',
        'seat',
        'count_eliminated',
        'first_pref_count',
        'first_pref_pct',
        'pct_of_quota_reached_with_first_pref',
    ]
    main_table = soup.find_all('table',{'class':'rhtable'})[-2]

    records = []
    for row in main_table.find_all('tr')[2:]:
        if row.find('hr'):
            break
        records.append(process_row(row,column_names))
    return records

def create_dataframe(list_of_rows):
    """Wrap a list of row dicts in a pandas DataFrame (one row per dict)."""
    return pd.DataFrame(list_of_rows)

def create_candidate_events_table(soup):
    """Parse the candidate's results table out of ``soup`` and return it as a DataFrame."""
    return create_dataframe(extract_candidate_table(soup))

#create_candidate_events_table(BeautifulSoup(r.content))        

In [71]:
# Download one candidate page synchronously; its content is used below to try process_page on a single example.
# NOTE(review): the trailing '#2048' looks like another candidate ID kept for manual testing - confirm.
r = requests.get('https://www.electionsireland.org/candidate.cfm?ID=15') #2048


def process_page(url,res_content):
    """Parse one candidate page into a DataFrame of that candidate's results.

    ``url`` supplies the candidate ID (the text after the last '=');
    ``res_content`` is the downloaded page body. The returned frame is the
    results table with two extra columns, ``candidate`` and ``candidate_ID``.

    Extraction of relatives (see ``get_relatives``) is currently disabled, so
    only the results frame is returned.
    """
    page = BeautifulSoup(res_content)
    # the name is read before the table so a page without a heading fails first
    candidate_name = get_candidate_name(page)
    events = create_candidate_events_table(page)
    events['candidate'] = candidate_name
    events['candidate_ID'] = url.split('=')[-1]
    return events

# Try process_page on the single sample page downloaded into `r`; the resulting frame is the cell output.
process_page('https://www.electionsireland.org/candidate.cfm?ID=15',r.content)

Unnamed: 0,candidate,ID


Processing every response takes about 20 minutes to run.

In [72]:
dataframes = []
# family_dataframes = [] 
failed = []

# get_responses builds its tasks in the order of `urls` and asyncio.gather
# keeps that order, so every result can be paired with the URL that produced
# it. That pairing is what lets a result that is NOT a (url, body) tuple - an
# exception object (gather is called with return_exceptions=True) or an error
# payload - be recorded in `failed` instead of crashing the loop while
# unpacking.
for requested_url, result in zip(urls, responses['responses']):
    if isinstance(result, tuple):
        url, resp = result
    else:
        url, resp = requested_url, None

    try:
        if resp is None:
            raise ValueError(f'request did not return a page: {result!r}')
        df = process_page(url,resp)
        dataframes.append(df)
        print('Sucess:',url)
        # if not family_df.empty:
        #     family_dataframes.append(family_df)

    except Exception as e:
        # Any failure (bad download or unparseable page) queues the URL for a retry.
        print('Failed:',url)
        print(e,'\n------------------------')
        failed.append(url)


Sucess: https://www.electionsireland.org/candidate.cfm?ID=1
Failed: https://www.electionsireland.org/candidate.cfm?ID=2
list index out of range 
------------------------
Sucess: https://www.electionsireland.org/candidate.cfm?ID=3
Sucess: https://www.electionsireland.org/candidate.cfm?ID=4
Sucess: https://www.electionsireland.org/candidate.cfm?ID=5
Sucess: https://www.electionsireland.org/candidate.cfm?ID=6
Sucess: https://www.electionsireland.org/candidate.cfm?ID=7
Sucess: https://www.electionsireland.org/candidate.cfm?ID=8
Failed: https://www.electionsireland.org/candidate.cfm?ID=9
list index out of range 
------------------------
Failed: https://www.electionsireland.org/candidate.cfm?ID=10
list index out of range 
------------------------
Sucess: https://www.electionsireland.org/candidate.cfm?ID=11
Sucess: https://www.electionsireland.org/candidate.cfm?ID=12
Sucess: https://www.electionsireland.org/candidate.cfm?ID=13
Sucess: https://www.electionsireland.org/candidate.cfm?ID=14
Faile

We keep re-requesting the failed URLs until none are left.
About 72 URLs produce only empty dataframes, so we will always end up with slightly fewer than the 12_015 IDs on the website.

In [75]:
rounds = 0
while failed:
    rounds+=1
    # send requests
    print('Round:',rounds,'\nNumber of failed links:',len(failed))
    connector = aiohttp.TCPConnector(limit=60)
    async with aiohttp.ClientSession(connector=connector,**client_args) as s:
        responses = await get_responses(s,failed,request_failed)
    for (url,resp) in responses['responses']:
        try:
            df = process_page(url,resp)
            dataframes.append(df)
            #print(df)
            #print('\nSuccess:',url)
            failed.remove(url)
        except Exception as e:
            #print('Failed:',url)
            print(e,'\n------------------------')

Round: 1 
Number of failed links: 3577
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index out of range 
------------------------
list index 

Put all the dataframes into 1 dataframe 

In [57]:
# Stack every per-candidate frame into one table and discard the old per-frame index.
DF = pd.concat(dataframes).reset_index(drop=True)

# Strip '%' and ',' from the percentage text, then store it as a fraction.
DF['first_pref_pct'] = (
    DF['first_pref_pct']
    .replace(r'[%,]','',regex=True)
    .astype(float)
    / 100
)
DF['pct_of_quota_reached_with_first_pref'] = (
    DF['pct_of_quota_reached_with_first_pref'].astype(float)
)

# Rows that never had the flag set are treated as "not unopposed".
DF['ran_unopposed'] = DF['ran_unopposed'].fillna(False)

# 'awaiting full update' becomes missing; brackets and commas are removed before the cast.
DF['count_eliminated'] = (
    DF['count_eliminated']
    .replace('awaiting full update',np.nan)
    .replace(r'[(,)]','',regex=True)
    .astype(float)
)

# Rows whose first-preference count holds the text '100.00%' are dropped as bad data.
bad_data = DF.index[DF['first_pref_count'] == '100.00%']
DF = DF.drop(bad_data)

# Placeholder texts become missing; thousands separators are removed before the cast.
DF['first_pref_count'] = (
    DF['first_pref_count']
    .replace('declared ineligible',np.nan)
    .replace('awaiting full update',np.nan)
    .str.replace(',','')
    .astype(float)
)
DF.shape

(30664, 13)

In [60]:
# Remove rows that are duplicated across every column, then display the resulting shape.
DF = DF.drop_duplicates()
DF.shape

(30070, 13)

Save to a parquet file.

In [78]:
# Write the cleaned table to a Parquet file; the path is relative, so it lands in the current working directory.
DF.to_parquet('ElectionsIreland_candidate.parquet')

For every election we want:
- The number of constituencies
- How many constituencies do we have vote data on?
- What was the quota?
- What was the votes/quota in the first count?
- What was the lowest votes/quota?
- What was the highest votes/quota?
- Who transferred to whom (if you have transfer data)