# In [2]:
from selenium import webdriver
from selenium.webdriver.opera.options import Options
import json
import time
import re
import random
from bs4 import BeautifulSoup

# List of constituency names, one entry per line of the input file.
# readlines() keeps each line's trailing newline; the context manager makes
# sure the file handle is closed as soon as the lines have been read.
with open("../data/raw/constituencies.txt", "r") as constituency_file:
    constituency_names = constituency_file.readlines()

# Base URL
base_url = "https://election.news.sky.com/elections/general-election-2024/"

# Path to your WebDriver (adjust the path as necessary)
driver_path = r'../operadriver.exe'

# Set up Opera options
options = Options()
options.binary_location = r"C:\Users\chris\AppData\Local\Programs\Opera\opera.exe"  # Path to the Opera browser executable

# Initialize WebDriver
# NOTE(review): webdriver.Opera / executable_path appear to be unavailable in
# newer Selenium 4 releases -- confirm the Selenium version this runs against.
driver = webdriver.Opera(executable_path=driver_path, options=options)

def extract_constituency_name(page_source):
    """Return the raw text between the first ``<h1 ...>`` tag and its ``</h1>``.

    Args:
        page_source: HTML of the page as a string.

    Returns:
        The substring between the end of the first opening ``<h1`` tag and
        the first ``</h1>`` that follows it, unmodified (no stripping, no
        removal of nested markup). Returns ``""`` when the page has no
        ``<h1`` tag, the opening tag is never closed with ``>``, or there is
        no ``</h1>`` after it.
    """
    tag_start = page_source.find("<h1")
    if tag_start == -1:
        return ""

    # Skip over any attributes to the '>' that ends the opening tag.
    tag_close = page_source.find(">", tag_start)
    if tag_close == -1:
        return ""

    content_start = tag_close + 1
    # Search from the content start so an earlier stray "</h1>" cannot
    # produce an inverted (empty or wrong) slice.
    content_end = page_source.find("</h1>", content_start)
    if content_end == -1:
        return ""

    return page_source[content_start:content_end]

# Function to scrape data for a given constituency
def scrape_constituency_data(idx):
    """Load one constituency page and convert its second HTML table to dicts.

    The page at ``f"{base_url}-{idx}/"`` is opened with the module-level
    ``driver``; after a fixed one-second wait the rendered page source is
    read and the second ``<table>`` in it is parsed (presumably the results
    table -- confirm against the live page).

    Args:
        idx: Identifier interpolated into the page URL.

    Returns:
        A ``(rows, constituency_name)`` tuple. ``rows`` is a list with one
        dict per ``<tbody>`` row, mapping each ``<thead>`` header text to the
        cell's visible span texts joined by spaces. ``rows`` is ``[]`` when
        the page has fewer than two tables or the table lacks a header row
        or body. ``constituency_name`` is whatever
        ``extract_constituency_name`` returns for the page.
    """
    url = f"{base_url}-{idx}/"
    driver.get(url)

    # Wait for the JavaScript to load
    time.sleep(1)  # You might need to adjust the sleep time based on your internet speed and page load time

    page_source = driver.page_source
    constituency_name = extract_constituency_name(page_source)

    table_starts = [m.start() for m in re.finditer("<table", page_source)]
    table_ends = [m.end() for m in re.finditer("</table>", page_source)]
    # The data lives in the second table; without it there is nothing to
    # parse, so report "no data" instead of raising IndexError.
    if len(table_starts) < 2 or len(table_ends) < 2:
        return [], constituency_name
    html_table = page_source[table_starts[1]:table_ends[1]]

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html_table, 'html.parser')

    # Find table headers (column names)
    thead = soup.find('thead')
    tbody = soup.find('tbody')
    header_row = thead.find('tr') if thead is not None else None
    if header_row is None or tbody is None:
        return [], constituency_name
    headers = [header.text.strip() for header in header_row.find_all('th')]

    # List to store JSON objects
    json_data = []

    for row in tbody.find_all('tr'):
        data = []
        for cell in row.find_all('td'):
            # Extract text from each span in the cell (skip visually hidden spans)
            cell_data = [
                span.text.strip()
                for span in cell.find_all('span')
                if 'u-hide-visually' not in span.get('class', [])
            ]

            # Drop duplicate texts while keeping first-seen order; a set
            # would make the joined order vary between runs.
            data.append(' '.join(dict.fromkeys(cell_data)))

        # Create a dictionary using headers and row data
        json_data.append(dict(zip(headers, data)))

    return json_data, constituency_name

# Main loop to scrape data for all constituencies
all_data = {}
try:
    for i in range(650):
        # Page identifiers passed to the scraper run from 1 to 650.
        data, constituency_name = scrape_constituency_data(i + 1)
        if data:
            all_data[constituency_name] = data

        # Introduce a random delay to mimic human behavior
        time.sleep(random.uniform(1, 3))

        # Rewrite the JSON file after every page so that everything scraped
        # so far is already on disk if a later page fails.
        with open('election_results.json', 'w') as f:
            json.dump(all_data, f, indent=4)
finally:
    # Close the WebDriver even when scraping stops early with an error,
    # so the browser and driver processes are not left running.
    driver.quit()

print("Scraping completed.")


# Output: Scraping completed.
