# 2024 Sri Lankan Presidential Election Dataset

- Data source: https://results.elections.gov.lk/index.php
- Dataset schema:
  - district: district name
  - division: polling division name
  - candidate: candidate name
  - party: party abbreviation
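  - votes_count: number of votes received by the candidate in the division
  - votes_percentage: percentage of votes received by the candidate in the division (e.g. 24.29 means 24.29%)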

# Environment Setup


In [1]:
import requests
import polars as pl
import bs4

# Web Scraping

## Get districts and divisions
- A list of electoral districts and their respective divisions can be found on the sidebar of the results website

### Get district names

In [2]:
# fetch the results landing page and parse its HTML
url = "https://results.elections.gov.lk/index.php"
# a timeout keeps the notebook from hanging indefinitely on an unresponsive server
response = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of parsing an error page, which would
# silently produce empty district/division lists in the following cells
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, "html.parser")

In [3]:
# read the district labels from the sidebar links; the first label
# ('All Island') is not a district, so slicing from index 1 drops it
sidebar_labels = soup.select("#sidebar > ul > li > a > span")
districts = [label.text.strip() for label in sidebar_labels][1:]

In [4]:
# show the scraped district names (with 'All Island' already removed)
print(districts)

['Colombo', 'Gampaha', 'Kalutara', 'Mahanuwara', 'Matale', 'Nuwaraeliya', 'Galle', 'Matara', 'Hambantota', 'Jaffna', 'Vanni', 'Batticaloa', 'Digamadulla', 'Trincomalee', 'Kurunegala', 'Puttalam', 'Anuradhapura', 'Polonnaruwa', 'Badulla', 'Monaragala', 'Ratnapura', 'Kegalle']


### Get divisions of each district

In [5]:
# Map each district name to the division labels listed under it in the sidebar.
# The sidebar element ids are 1-based and zero-padded to two digits
# ('#district01', '#district02', ...), in the same order as `districts`.
district_to_divisions_map: dict[str, list[str]] = {
    name: [
        label.text.strip()
        for label in soup.select(f"#district{number:02d} > ul > li > a > p")
    ]
    for number, name in enumerate(districts, start=1)
}

In [6]:
# show the raw mapping before cleaning (it still contains the 'Preference'
# and '<district> District' entries)
print(district_to_divisions_map)

{'Colombo': ['Preference', 'Colombo District', 'Postal Votes', 'ColomboNorth', 'ColomboCentral', 'Borella', 'ColomboEast', 'ColomboWest', 'Dehiwala', 'Ratmalana', 'Kolonnawa', 'Kotte', 'Kaduwela', 'Avissawella', 'Homagama', 'Maharagama', 'Kesbewa', 'Moratuwa'], 'Gampaha': ['Preference', 'Gampaha District', 'Postal Votes', 'Wattala', 'Negambo', 'Katana', 'Divulapitiya', 'Mirigama', 'Minuwangoda', 'Attanagalla', 'Gampaha', 'Ja-Ela', 'Mahara', 'Dompe', 'Biyagama', 'Kelaniya'], 'Kalutara': ['Preference', 'Kalutara District', 'Postal Votes', 'Panadura', 'Bandaragama', 'Horana', 'Bulathsinhala', 'Matugama', 'Kalutara', 'Beruwala', 'Agalawatta'], 'Mahanuwara': ['Preference', 'Mahanuwara District', 'Postal Votes', 'Galagedara', 'Harispattuwa', 'Patadumbara', 'Udadumbara', 'Teldeniya', 'Kundasale', 'Hewahata', 'Senkadagala', 'Mahanuwara', 'Yatinuwara', 'Udunuwara', 'Gampola', 'Nawalapitiya'], 'Matale': ['Preference', 'Matale District', 'Postal Votes', 'Dambulla', 'Laggala', 'Matale', 'Rattota']

### Cleaning

In [7]:
# Drop the 'Preference' entry and the '<district> District' entries from every
# district: these have a different URL format and belong to their separate
# datasets. All other entries (including 'Postal Votes') are kept.
for district in district_to_divisions_map:
    kept = []
    for name in district_to_divisions_map[district]:
        if name == "Preference" or name.endswith(" District"):
            continue
        kept.append(name)
    district_to_divisions_map[district] = kept

## Get divisional results

**Regarding the choice of schema**
- The dataframe will be a denormalized flat table
- This was chosen over a normalized schema, containing multiple tables linked by foreign keys, because:
  - the number of rows is rather small
  - it is easier to query a flattened table
  - the goal of this project is to construct a dataset that facilitates analysis
- Postal votes are also included in the divisional results

In [8]:
# One row per candidate per division:
# (district, division, candidate, party, votes_count, votes_percentage)
ResultRow = tuple[str, str, str, str, int, float]

# flat accumulator that the scraping loop appends rows to
results: list[ResultRow] = []

In [9]:
# Scrape every polling division's result page and append one row per candidate
# to `results`.

# endpoint for per-division results; the district and polling division are
# passed as query parameters
DIVISION_RESULTS_URL = "https://results.elections.gov.lk/division_results.php"

# seconds to wait for the server before giving up on a single request
REQUEST_TIMEOUT_SECONDS = 30

# rows of the first results table on a division page; every column selector
# used below extends this common prefix
RESULT_ROWS_SELECTOR = (
    "#overview > div > div.col-lg-8 > div > div > div > div"
    " > div.table-responsive.mt-1 > table:nth-child(1) > tbody > tr"
)


def _column_texts(page: "bs4.BeautifulSoup", cell_selector: str) -> list[str]:
    """Return the stripped text of each element matched in one table column.

    `cell_selector` is appended (as a child selector) to RESULT_ROWS_SELECTOR.
    """
    matches = page.select(f"{RESULT_ROWS_SELECTOR} > {cell_selector}")
    return [match.text.strip() for match in matches]


for district, divisions in district_to_divisions_map.items():  # for each district
    # progress output: one '*' is printed per scraped division
    print(f"District: {district}\nDivisions: ", end="")
    for division in divisions:  # get each division's page
        # Dedicated names are used here so that the `soup`/`response` of the
        # index page (needed by the earlier cells) are not overwritten.
        division_response = requests.get(
            DIVISION_RESULTS_URL,
            params={"district": district, "pd_division": division},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        # an HTTP error page would otherwise parse to zero rows and the
        # division would silently be missing from the dataset
        division_response.raise_for_status()
        division_soup = bs4.BeautifulSoup(division_response.text, "html.parser")

        # get bulk data of each field and then transform into a list of tuples
        candidates = _column_texts(division_soup, "td:nth-child(1) > div > div > h6")
        parties = _column_texts(division_soup, "td:nth-child(2) > h6")
        # vote counts use ',' as a thousands separator
        vote_counts = [
            int(text.replace(",", ""))
            for text in _column_texts(division_soup, "td:nth-child(3) > p")
        ]
        # percentages carry a trailing '%' sign
        vote_percentages = [
            float(text.rstrip("%"))
            for text in _column_texts(division_soup, "td:nth-child(4) > p")
        ]

        # zip() stops at the shortest input, so a column that failed to match
        # would silently drop or misalign rows; refuse to continue instead
        column_lengths = {
            "candidates": len(candidates),
            "parties": len(parties),
            "vote_counts": len(vote_counts),
            "vote_percentages": len(vote_percentages),
        }
        if len(set(column_lengths.values())) != 1:
            raise ValueError(
                f"Mismatched column lengths for {district} / {division}: "
                f"{column_lengths}"
            )

        # combine the data into row-wise tuples
        results.extend(
            (district, division, candidate, party, count, percentage)
            for candidate, party, count, percentage in zip(
                candidates, parties, vote_counts, vote_percentages
            )
        )

        print("*", end="")
    print("\n")

District: Colombo
Divisions: ****************

District: Gampaha
Divisions: **************

District: Kalutara
Divisions: *********

District: Mahanuwara
Divisions: **************

District: Matale
Divisions: *****

District: Nuwaraeliya
Divisions: *****

District: Galle
Divisions: ***********

District: Matara
Divisions: ********

District: Hambantota
Divisions: *****

District: Jaffna
Divisions: ************

District: Vanni
Divisions: ****

District: Batticaloa
Divisions: ****

District: Digamadulla
Divisions: *****

District: Trincomalee
Divisions: ****

District: Kurunegala
Divisions: ***************

District: Puttalam
Divisions: ******

District: Anuradhapura
Divisions: ********

District: Polonnaruwa
Divisions: ****

District: Badulla
Divisions: **********

District: Monaragala
Divisions: ****

District: Ratnapura
Divisions: *********

District: Kegalle
Divisions: **********



# Convert to DataFrame


In [10]:
# column name -> Python type, in the same order as the fields of each result tuple
divisional_schema = {
    "district": str,
    "division": str,
    "candidate": str,
    "party": str,
    "votes_count": int,
    "votes_percentage": float,
}

# every tuple in `results` is one row, hence orient="row"
df_divisional_results = pl.DataFrame(
    results, schema=divisional_schema, orient="row"
)

# (number of rows, number of columns)
print(df_divisional_results.shape)

(6916, 6)


In [11]:
# display summary statistics for every column; string columns report null
# for numeric statistics such as mean and std
print(df_divisional_results.describe())

shape: (9, 7)
┌────────────┬──────────────┬─────────────┬───────────────────────┬───────┬─────────────┬──────────────────┐
│ statistic  ┆ district     ┆ division    ┆ candidate             ┆ party ┆ votes_count ┆ votes_percentage │
│ ---        ┆ ---          ┆ ---         ┆ ---                   ┆ ---   ┆ ---         ┆ ---              │
│ str        ┆ str          ┆ str         ┆ str                   ┆ str   ┆ f64         ┆ f64              │
╞════════════╪══════════════╪═════════════╪═══════════════════════╪═══════╪═════════════╪══════════════════╡
│ count      ┆ 6916         ┆ 6916        ┆ 6916                  ┆ 6916  ┆ 6916.0      ┆ 6916.0           │
│ null_count ┆ 0            ┆ 0           ┆ 0                     ┆ 0     ┆ 0.0         ┆ 0.0              │
│ mean       ┆ null         ┆ null        ┆ null                  ┆ null  ┆ 1925.913245 ┆ 2.631599         │
│ std        ┆ null         ┆ null        ┆ null                  ┆ null  ┆ 7588.897948 ┆ 9.22619          │
│ min

In [12]:
# show 10 sampled rows as a quick sanity check of the scraped values
# NOTE(review): no seed is passed, so the rows shown presumably differ between runs
df_divisional_results.sample(10)

district,division,candidate,party,votes_count,votes_percentage
str,str,str,str,i64,f64
"""Matara""","""Devinuwara""","""LALITH DE SILVA""","""UNFF""",11,0.02
"""Mahanuwara""","""Kundasale""","""K.K. PIYADASA""","""IND4""",224,0.27
"""Kalutara""","""Horana""","""SAJITH PREMADASA""","""SJB""",27090,24.29
"""Jaffna""","""Chavakachcheri""","""SARATH MANAMENDRA""","""NSU""",16,0.05
"""Digamadulla""","""Pothuvil""","""SARATH MANAMENDRA""","""NSU""",19,0.01
"""Polonnaruwa""","""Medirigiriya""","""A.S.P. LIYANAGE""","""SLLP""",5,0.01
"""Galle""","""Balapitiya""","""SAJITH PREMADASA""","""SJB""",12339,30.23
"""Kegalle""","""Rambukkana""","""K.K. PIYADASA""","""IND4""",117,0.22
"""Colombo""","""Borella""","""DILITH JAYAWEERA""","""SLCP""",659,1.45
"""Kegalle""","""Postal Votes""","""SARATH FONSEKA""","""IND12""",27,0.08


In [13]:
# CSV formatting options: header row, comma-separated, and double quotes
# around every non-numeric field
csv_options = {
    "include_header": True,
    "separator": ",",
    "quote_style": "non_numeric",
    "quote_char": '"',
}

# persist the dataset next to the notebook
df_divisional_results.write_csv("divisional_results.csv", **csv_options)