In [5]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
%xmode Context

Exception reporting mode: Context


We want to scrape down all the election data that is on this site:

The urls on the site are in the form of 'https://www.electionsireland.org/result.cfm?election=2016&cons=54'
So we need a list of election years and a list of constituecny codes. 
Realistically finding constituency codes is hard, so we will just find the links to the election data for each constituency from this web page: https://www.electionsireland.org/results/general/07dail.cfm Luckily, these webpages are sequential, so we can do a loop from 1 to 34


### Step 1: We find a link to the election results for each constituency that was in that election

In [6]:
def get_constituency_urls(res):
    constituency_urls = []
    res.raise_for_status()
    soup = BeautifulSoup(res.content)
    for link in soup.find_all('a'):
        if 'cons=' in link.get('href'):
            address = link.get('href')
            constituency_urls.append('https://www.electionsireland.org'+address[5:])

    if constituency_urls:
        return constituency_urls 
    else:
        raise Exception('No Constituency URLS')

### Step 2: We process that constituceny 

In [7]:
def get_constituency_details(soup):
    '''
    this gets key details of the election in the constituency:
     - the date of the election
     - the constitency name in both english and irish
     - the number of seats available
     - number of candidates running
     - Quota: the number of votes needed to be elected
     - Electorate: Number of available votes 

    Arguments:
        constituency_url -- _s_
    '''       
    cons_name = soup.find('span',class_='title1').text.strip()
    election_date = soup.find('span',class_='title3').text.split(':')[1].strip()
    size_of_electorate = soup.find('td',{'colspan':'3'}).text.split(':')[1].strip().replace(',','')
    as_Gaeilge = soup.find('em').text.strip()

    seats_available = [element.text.strip().split(' ')[0].strip() for element in soup.find_all('td',{'colspan':'2'}) if 'Seat' in element.text.strip()][0]
    quota = soup.find('td',{'colspan':'3','align':'right'}).text.strip().split(':')[1].strip().replace(',','')
    number_of_candidates = soup.find('td',{'colspan':'2','align':'center'}).text.strip().split(' ')[0].strip() 
    d = {
        'election_date':election_date,
        'electorate': size_of_electorate,
        'number_of_candidates':number_of_candidates,
        'seats_available':seats_available,
        'constituency_name':cons_name,
        'constituency_name_as_Gaeilge':as_Gaeilge,
        'quota':quota
        }
    
    return d

In [8]:
def extract_candidate_table(soup):
    list_of_rows = []
    column_names = ['candidate','first_pref_count','first_pref_pct','pct_of_quota_reached_with_first_pref','elected_on_count','status','seat']
    break_condition = False
    table_body = soup.find('table',class_='rtable').tbody
    rows = table_body.find_all('tr')

    for row in rows[1:]:
        d = {}
        count = 0
        for td in row.find_all('td'):
            txt = td.text.strip()
            if 'Total'  in txt:
                break_condition = True
                break

            if len(txt)>=1:
                if '*' in txt or '§' in txt:
                    d['outgoing'] = 'TD' if txt == '*' else 'Senator'
                    continue
                
                elif '♀' in txt or'♂' in txt:
                    d['gender'] = 'Male' if '♂' in txt else 'Female'
                    continue
                
                elif 'Unopposed' in txt:
                    break_condition = True
                    
                # print(count,':',column_names[count],':',txt)    
                d[column_names[count]] = txt
                count+=1
            elif td.find('img'):
                d['party'] = td.find('img').get('title').strip()
        
        if d:
            list_of_rows.append(d)           
            # print(d,'\n==========================================')
        
        elif break_condition == True:
            break
    
    return list_of_rows

def extract_data_for_constitunecy(constituency_url_res):
    '''
    gets candidate names and votes 

    Arguments:
        constituency_url -- the url for the constituency

    Returns:
        returns a dataframe with columns:'Candidate','1st_Pref_count','1st_Pref_Share%','Quota','Elected_on_Count','Status','Seat' and Gender
    '''  
    soup = BeautifulSoup(constituency_url_res.content)
    if 'Apologies' in soup.text:
        raise Exception('No data on webpage!')
        #return pd.DataFrame(data ='missing data',index=[1],columns=['Candidate','1st_Pref','Share','Quota','Elected_on_Count','Status','Seat','Gender','Party','Outgoing'])
    
    overall_election_info = get_constituency_details(soup)
    list_of_rows = extract_candidate_table(soup)
    for dic in list_of_rows:
        dic.update(overall_election_info)
    df = pd.DataFrame(list_of_rows)
    # tidy up dataframe
    df.candidate.replace('\d','',regex=True,inplace=True)
    df.election_date = pd.to_datetime(df.election_date)
    df.first_pref_count = df.first_pref_count.str.replace(',','').astype(int)
    df.first_pref_pct = df.first_pref_pct.str.replace('%','').astype(float)/100
    return df

extract_data_for_constitunecy(
    requests.get('https://www.electionsireland.org/result.cfm?election=1973B&cons=182&ref=94')
    )

Unnamed: 0,candidate,party,first_pref_count,first_pref_pct,pct_of_quota_reached_with_first_pref,elected_on_count,status,gender,election_date,electorate,number_of_candidates,seats_available,constituency_name,constituency_name_as_Gaeilge,quota
0,Brendan Toal,Fine Gael,14535,0.4732,0.95,0,1.0,Male,1973-11-27,39647,4,1,Monaghan,(Ulster),15360
1,Rory O'Hanlon,Fianna Fail,13822,0.4499,0.9,0,,Male,1973-11-27,39647,4,1,Monaghan,(Ulster),15360
2,Patrick Mooney,Aontacht Eireann (Irish Unity),2187,0.0712,0.14,0,,Male,1973-11-27,39647,4,1,Monaghan,(Ulster),15360
3,David Vipond,Communist (Marxist-Leninist),175,0.0057,0.01,0,,Male,1973-11-27,39647,4,1,Monaghan,(Ulster),15360


In [9]:
election_dataframes = []
failed_to_extract = []
no_constituency_links = []

for i in range(1,34):# number of dails there have been
    
    if len(str(i))<2:
        dail_number = '0'+str(i)
    else:
        dail_number=str(i)
    res = requests.get(f'https://www.electionsireland.org/results/general/{dail_number}dail.cfm')
    #print('============================================',f'https://www.electionsireland.org/results/general/{dail_number}dail.cfm','============================================')
    try:
        constituency_urls = get_constituency_urls(res)
    except:
        no_constituency_links.append(res.request.url)
        continue
    try:
        for constituency_url in constituency_urls:
            #print('--------------------------------------------',constituency_url,'--------------------------------------------')
            constituency_url_res = requests.get(constituency_url)
            df = extract_data_for_constitunecy(constituency_url_res)
            election_dataframes.append(df)
    except:
            failed_to_extract.append(constituency_url_res.request.url)
            


In [10]:
DF = pd.concat(election_dataframes)
DF.to_parquet('dail_elections_v1.parquet')

In [11]:
no_constituency_links

['https://www.electionsireland.org/results/general/20dail.cfm',
 'https://www.electionsireland.org/results/general/21dail.cfm',
 'https://www.electionsireland.org/results/general/22dail.cfm',
 'https://www.electionsireland.org/results/general/23dail.cfm',
 'https://www.electionsireland.org/results/general/24dail.cfm',
 'https://www.electionsireland.org/results/general/25dail.cfm',
 'https://www.electionsireland.org/results/general/26dail.cfm',
 'https://www.electionsireland.org/results/general/27dail.cfm',
 'https://www.electionsireland.org/results/general/28dail.cfm',
 'https://www.electionsireland.org/results/general/29dail.cfm',
 'https://www.electionsireland.org/results/general/30dail.cfm',
 'https://www.electionsireland.org/results/general/31dail.cfm',
 'https://www.electionsireland.org/results/general/32dail.cfm']

### Step 3: Filling in missing values
It looks like we are missing information from the 20th Dail until the 32nd;
I suppose the reason there is no constituency URLs is because they used the same cons codes for the next 13 elections.

So I am going to iterate 13 steps through the election years strating from the last year that we do have data for: 1969.

We also do have data for 2020, so that step doesnt need to be re-done. Interesting to note that the cons codes change again in 2020. Wonder why. The guys who made this site calls himself a psephologist on his LinkedIn

In [12]:
res = requests.get('https://www.electionsireland.org/results/general/19dail.cfm')
urls = get_constituency_urls(res)
urls[:3]

['https://www.electionsireland.org/result.cfm?election=1969&cons=32',
 'https://www.electionsireland.org/result.cfm?election=1969&cons=36',
 'https://www.electionsireland.org/result.cfm?election=1969&cons=42']

In [13]:
res = requests.get('https://www.electionsireland.org/results/general/index.cfm')
print(res)
tables = pd.read_html(res.content,flavor='bs4')
election_years = tables[3].values.reshape(-1)
election_years = [year for year in election_years if re.search('\d',str(year))] # removes null values and "index"
election_years = [str(int(year)) if type(year) != str else year.replace(' ','').lower() for year in election_years]
election_years

<Response [200]>


['1918',
 '1921',
 '1922',
 '1923',
 '1927jun',
 '1927sep',
 '1932',
 '1933',
 '1937',
 '1938',
 '1943',
 '1944',
 '1948',
 '1951',
 '1954',
 '1957',
 '1961',
 '1965',
 '1969',
 '1973',
 '1977',
 '1981',
 '1982feb',
 '1982nov',
 '1987',
 '1989',
 '1992',
 '1997',
 '2002',
 '2007',
 '2011',
 '2016',
 '2020']

In [14]:
election_dataframes2 = []

for election in [year for year in election_years if int(year[:4]) > 1969][:13]:
    #print(election)
    for url in urls:
        if '1969B' in url: # we arent interested in Bi elections
            continue
        else:
            new_url = url.replace('1969',election)
        try:
            constituency_url_res = requests.get(new_url)
            df = extract_data_for_constitunecy(constituency_url_res)
            election_dataframes2.append(df)
            #print('worked',new_url)
        except:
            #print('failed:',new_url)
            failed_to_extract.append(constituency_url_res.request.url)
        

In [15]:
DF2 = pd.concat(election_dataframes2)
DF2.to_parquet('dail_elections_v2.parquet')

In [20]:
new_DF = pd.concat([DF,DF2])
new_DF.sort_values(by='election_date')
new_DF.elected_on_count = new_DF.elected_on_count.str.replace('(','').str.replace(')','').astype(int)


  new_DF.elected_on_count = new_DF.elected_on_count.str.replace('(','').str.replace(')','').astype(int)


In [21]:
new_DF.to_parquet('DAIL_elections_master.parquet')

In [22]:
new_DF

Unnamed: 0,candidate,party,first_pref_count,first_pref_pct,pct_of_quota_reached_with_first_pref,elected_on_count,status,seat,gender,election_date,electorate,number_of_candidates,seats_available,constituency_name,constituency_name_as_Gaeilge,quota,outgoing
0,Patrick Gaffney,Farmers,10875,0.3483,1.74,1,Made Quota,1,Male,1922-06-16,51012,6,4,Carlow Kilkenny,Ceatharlach Cill Chainnigh,6246,
1,W T Cosgrave,Pro-Treaty Sinn Féin,7071,0.2264,1.13,1,Made Quota,2,Male,1922-06-16,51012,6,4,Carlow Kilkenny,Ceatharlach Cill Chainnigh,6246,TD
2,Denis Gorey,Labour,6122,0.1960,0.98,2,Made Quota,3,Male,1922-06-16,51012,6,4,Carlow Kilkenny,Ceatharlach Cill Chainnigh,6246,
3,General Gerald O'Sullivan,Pro-Treaty Sinn Féin,2681,0.0859,0.43,4,Made Quota,4,Male,1922-06-16,51012,6,4,Carlow Kilkenny,Ceatharlach Cill Chainnigh,6246,TD
4,Edward Aylward,Anti-Treaty Sinn Féin,3365,0.1078,0.54,4,Not Elected,,Male,1922-06-16,51012,6,4,Carlow Kilkenny,Ceatharlach Cill Chainnigh,6246,TD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,Steve Matthews,Green/Comhaontas Glas,1350,0.0196,0.12,4,No expenses,,Male,2016-02-26,97858,16,5,Wicklow,Cill Mhantain,11468,
12,Anna Doyle,People Before Profit Alliance,539,0.0078,0.05,3,No expenses,,Female,2016-02-26,97858,16,5,Wicklow,Cill Mhantain,11468,
13,Charlie Keddy,Non party/Independent,447,0.0065,0.04,3,No expenses,,Male,2016-02-26,97858,16,5,Wicklow,Cill Mhantain,11468,
14,Katrina Hutchinson,Direct Democracy Ireland,168,0.0024,0.01,2,No expenses,,Female,2016-02-26,97858,16,5,Wicklow,Cill Mhantain,11468,


In [23]:
print('We do not have data for the following urls:\n',pd.Series(failed_to_extract).unique())

We do not have data for the following urls:
 ['https://www.electionsireland.org/result.cfm?election=1918&cons=3'
 'https://www.electionsireland.org/result.cfm?election=1921&cons=32'
 'https://www.electionsireland.org/result.cfm?election=1922&cons=42'
 'https://www.electionsireland.org/result.cfm?election=1923&cons=111'
 'https://www.electionsireland.org/missing.cfm?election=1927jun'
 'https://www.electionsireland.org/result.cfm?election=1927sep&cons=111'
 'https://www.electionsireland.org/result.cfm?election=1932&cons=111'
 'https://www.electionsireland.org/result.cfm?election=1933&cons=111'
 'https://www.electionsireland.org/result.cfm?election=1937&cons=122'
 'https://www.electionsireland.org/result.cfm?election=1938&cons=75'
 'https://www.electionsireland.org/result.cfm?election=1943&cons=122'
 'https://www.electionsireland.org/result.cfm?election=1944&cons=75'
 'https://www.electionsireland.org/result.cfm?election=1948&cons=32'
 'https://www.electionsireland.org/result.cfm?election