This Python Jupyter Notebook is used to import election poll details from Excel spreadsheets into the UK General Elections Model.

It then checks whether any polls listed on the Wikipedia page for UK general election opinion polling have not yet been incorporated into the model, using web-scraping techniques with the Beautiful Soup Python package.

Wikipedia Opinion Polling page:
https://en.wikipedia.org/wiki/Opinion_polling_for_the_2024_United_Kingdom_general_election

In [1]:
#Import required Python packages
import pyodbc
import sqlalchemy
from sqlalchemy import create_engine
import urllib
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# Connect to the 'UK_General_Election' database using SQLAlchemy.
# `urllib.parse` is imported explicitly: a bare `import urllib` does not
# guarantee that the `parse` submodule has been loaded.
import urllib.parse

# The raw ODBC connection string is URL-encoded so it can be passed through
# the `odbc_connect` query parameter of the mssql+pyodbc URL.
connection_str = "DRIVER={SQL SERVER};SERVER=DANZPOOTA;DATABASE=UK_General_Election;TRUSTED_CONNECTION=YES"
params = urllib.parse.quote_plus(connection_str)
engine = create_engine(f'mssql+pyodbc:///?odbc_connect={params}')

# Connection shared by the cells that read from and write to the database
conn = engine.connect()

In [3]:
# Excel workbooks (file names without the .xlsx extension) to check for polls
Excelfiles = [
    "Constituency_Polls_Import",
    "RedBlueWall_Polls_Import",
    "London_Polls_Import",
    "NI_Polls_Import_Actual",
    "Scotland_Polls_Import",
    "Wales_Polls_Import",
    "UK_Polls_Import_2023_December",
    "UK_Polls_Import_2024",
    "MRP_Import",
]

In [4]:
# Import any polls from the Excel workbooks that are not yet in the database.
# For each workbook: read the 'PollMeta' and 'PollDetails' worksheets, discard
# the polls whose PollID already exists in the database, and append the
# remainder to the database tables of the same names.

# Query returning the ID of every poll already stored in the database
DBPollsQuery = """SELECT PollID FROM PollMeta"""

for filename in Excelfiles:
    # Location of the workbook to be read
    xls_path = f"C:\\Users\\danmu\\Documents\\Elections\\2024_Python\\Polls_Imports\\{filename}.xlsx"

    # Read the 'PollMeta' worksheet (columns A-I and K) into a dataframe
    xls_pollmeta_df = pd.read_excel(xls_path, sheet_name='PollMeta', usecols="A:I,K")

    # Discard any rows that still carry the default (placeholder) PollID
    xls_pollmeta_df = xls_pollmeta_df[xls_pollmeta_df['PollID'] != '19000100ERROR']

    # Read the 'PollDetails' worksheet (columns A-E) into a dataframe
    xls_polldetails_df = pd.read_excel(xls_path, sheet_name='PollDetails', usecols="A:E")

    # Re-read the stored PollIDs for every workbook, because the previous
    # iteration may have just inserted new polls
    DB_Polls_df = pd.read_sql(DBPollsQuery, conn)

    # Polls that are present in both the workbook and the database
    pollsindb_df = xls_pollmeta_df.merge(DB_Polls_df['PollID'], how='inner', on='PollID')

    # Keep only the meta rows of polls that are not in the database yet.
    # The 'PollID' column is removed before the insert; per the original
    # notebook comment it is generated automatically in the database.
    is_new_meta = ~xls_pollmeta_df['PollID'].isin(pollsindb_df['PollID'])
    import_pollmeta_df = (
        xls_pollmeta_df[is_new_meta]
        .drop(columns='PollID')
        .reset_index(drop=True)
    )

    # Same filter for the details rows. A mask is used because PollID repeats
    # across detail rows; the 'PollID' column is kept here.
    is_new_details = ~xls_polldetails_df['PollID'].isin(pollsindb_df['PollID'])
    import_polldetails_df = xls_polldetails_df[is_new_details].copy()

    # Insert the PollMeta info into the database
    import_pollmeta_df.to_sql('PollMeta', conn, if_exists='append', index=False)

    # Insert the PollDetails info into the database
    import_polldetails_df.to_sql('PollDetails', conn, if_exists='append', index=False)

In [5]:
# Display the PollMeta rows prepared for import from the last workbook
# processed by the import loop (the variable is overwritten on every iteration)
import_pollmeta_df

Unnamed: 0,Pollster,PollDate,SampleSize,PollType,PollScopeAll,PollScopeRegion,PollScopeConst,PollScope,PollLink


In [6]:
# Display the PollDetails rows prepared for import from the last workbook
# processed by the import loop (the variable is overwritten on every iteration)
import_polldetails_df

Unnamed: 0,PollID,RegionName,Constituency,Party,VoteShare


In [7]:
# Sanity check: list every PollMeta row that has no matching PollDetails row.
# An empty result means all polls in the database have both parts populated.
PollInfoQuery = (
    "SELECT A.PollID from PollMeta A "
    "LEFT JOIN PollDetails B ON A.PollID = B.PollID "
    "WHERE B.PollID IS NULL"
)
PollInfoQuery_df = pd.read_sql(sql=PollInfoQuery, con=conn)
PollInfoQuery_df

Unnamed: 0,PollID


In [8]:
# Get a list of the pollster names stored in the database.
# The query runs on the notebook's open connection: Engine.execute() is
# deprecated in SQLAlchemy 1.4 and removed in 2.0, and raw SQL strings must be
# wrapped in text() to be executable there.
pollsters_query = sqlalchemy.text("select PollsterName from Pollsters")
pollsters_list = [row[0] for row in conn.execute(pollsters_query)]

In [9]:
# Tags for the different types of polls that are assessed against the web page
Tags = [
    'GB',
    'London',
    'Northern Ireland',
    'Scotland',
    'Wales',
    'JLPRedWall',
    'RWRedWall',
    'RWBlueWall',
    'MICBlueWall',
]

In [10]:
# Find all of the table indices to be assessed

# URL for UK general election polling
url = "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2024_United_Kingdom_general_election"

# Download the page. The timeout stops the cell hanging on a stalled
# connection, and raise_for_status() stops an HTTP error page being parsed as
# if it were the polling page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

# Parse using BeautifulSoup to make the data more manageable
soup = BeautifulSoup(data, "html.parser")

# Find all the tables in the page
tables = soup.find_all('table')

# Serialise each table once; every marker below is searched for in this text
tables_html = [str(table) for table in tables]

# Start from None so that a marker which is not found is reported below,
# rather than silently reusing an index left over from an earlier run
uk_table_index = None
london_table_index = None
ni_table_index = None
scotland_table_index = None
wales_table_index = None

for index, table_html in enumerate(tables_html):
    # The UK polling table uses the UK party names in the table headings.
    # Several tables contain this text, so only the first match is kept.
    if uk_table_index is None and "Conservative Party (UK)" in table_html:
        uk_table_index = index

    # For the remaining markers the last matching table wins
    if "London Labour" in table_html:
        london_table_index = index

    if "Democratic Unionist Party" in table_html:
        ni_table_index = index

    if "Scottish Conservatives" in table_html:
        scotland_table_index = index

    if "Welsh Labour" in table_html:
        wales_table_index = index

# Fail with a clear message if the page layout changed and a table is missing
missing_tables = [
    label
    for label, found_index in (
        ("UK", uk_table_index),
        ("London", london_table_index),
        ("Northern Ireland", ni_table_index),
        ("Scotland", scotland_table_index),
        ("Wales", wales_table_index),
    )
    if found_index is None
]
if missing_tables:
    raise LookupError("Could not locate polling table(s) for: " + ", ".join(missing_tables))

print("UK:",uk_table_index,"London:",london_table_index,"Scotland:",scotland_table_index,"Wales:",wales_table_index)

UK: 2 London: 10 Scotland: 14 Wales: 15


In [11]:
# Positions within `tables` of the Red Wall / Blue Wall polling tables.
# These are set by hand because they are not found automatically from the
# scraped web page.
JLPRedWall_index = 36   # JL Partners Red Wall
RWRedWall_index = 37    # R&W Red Wall
RWBlueWall_index = 42   # R&W Blue Wall
MICBlueWall_index = 40  # More In Common Blue Wall

In [12]:
# Table indices to be cycled through, one entry per poll type
TableIndices = [
    uk_table_index,
    london_table_index,
    ni_table_index,
    scotland_table_index,
    wales_table_index,
    JLPRedWall_index,
    RWRedWall_index,
    RWBlueWall_index,
    MICBlueWall_index,
]

In [13]:
# SQL queries for each type of poll. Every query returns the (PollDate,
# Pollster) pairs already stored in PollMeta for that poll type, newest first.

# The poll types are listed with IN (...) so that the PollScope filter applies
# to all of them. Written as a chain of ORs followed by AND, the filter would
# bind only to the last PollType because AND takes precedence over OR.
SelectGBPollsQuery = """SELECT PollDate, Pollster FROM PollMeta
WHERE PollType IN ('GB', 'Nation-NINA', 'ITL1Region-NINA', 'GBFiveLonSouth', 'GBFiveMidWales', 'GBSix')
AND PollScope = 'All'
ORDER BY PollDate DESC"""

SelectLondonPollsQuery = "SELECT PollDate, Pollster FROM PollMeta WHERE PollType Like '%London%' OR PollScope = 'London' ORDER BY PollDate DESC"

SelectNIPollsQuery = "SELECT PollDate, Pollster FROM PollMeta WHERE PollScope = 'Northern Ireland' ORDER BY PollDate DESC"

SelectScotPollsQuery = "SELECT PollDate, Pollster FROM PollMeta WHERE PollType = 'ScotlandRegion' OR PollScope = 'Scotland' ORDER BY PollDate DESC"

SelectWalesPollsQuery = "SELECT PollDate, Pollster FROM PollMeta WHERE PollType = 'WalesRegion' OR PollScope = 'Wales' ORDER BY PollDate DESC"

SelectJLPRWPollsQuery = "SELECT PollDate, Pollster FROM PollMeta WHERE PollType = 'JLPRedWall' ORDER BY PollDate DESC"

SelectRWRWPollsQuery = "SELECT PollDate, Pollster FROM PollMeta WHERE PollType = 'R&WRedWall' ORDER BY PollDate DESC"

SelectRWBWPollsQuery = "SELECT PollDate, Pollster FROM PollMeta WHERE PollType = 'R&WBlueWall' ORDER BY PollDate DESC"

SelectMICBWPollsQuery = "SELECT PollDate, Pollster FROM PollMeta WHERE PollType = 'MICBlueWall' ORDER BY PollDate DESC"

# The order must match the order of the poll-type tags and table indices
Queries = [
    SelectGBPollsQuery,
    SelectLondonPollsQuery,
    SelectNIPollsQuery,
    SelectScotPollsQuery,
    SelectWalesPollsQuery,
    SelectJLPRWPollsQuery,
    SelectRWRWPollsQuery,
    SelectRWBWPollsQuery,
    SelectMICBWPollsQuery,
]

In [14]:
# One row per poll type: its tag, the index of its table on the web page and
# the query that pulls the matching polls from the database
loop_df = pd.DataFrame(
    {
        "Tag": Tags,
        "TableIndex": TableIndices,
        "Query": Queries,
    }
)

In [15]:
def _parse_poll_date(rawdate, default_year='2024'):
    """Convert the text of a scraped date cell into a 'YYYY-MM-DD' string.

    If the text does not end in digits, ``default_year`` is appended. When
    the text contains a dash (en dash or hyphen) only the part after the
    first dash is parsed. Raises ValueError if the remaining text does not
    match the '%d%b%Y' format once spaces are removed.
    """
    # A trailing year is detected by testing whether the last three
    # characters form an integer
    try:
        int(rawdate[-3:])
    except ValueError:
        rawdate = rawdate + ' ' + default_year

    # Keep only the text after the first dash, if there is one
    dash_positions = [pos for pos in (rawdate.find('–'), rawdate.find('-')) if pos != -1]
    calcdate = rawdate[min(dash_positions) + 1:] if dash_positions else rawdate

    # Spaces are removed so that e.g. '2 Jul 2024' is parsed as '2Jul2024'
    return datetime.strptime(calcdate.replace(" ", ""), '%d%b%Y').strftime('%Y-%m-%d')


def _normalise_pollster(pollster, known_pollsters):
    """Replace scraped pollster text by the known pollster name it contains.

    Names are tested in list order and each match replaces the text before
    the next name is tested; text containing no known name is returned
    unchanged.
    """
    for known_name in known_pollsters:
        if known_name in pollster:
            pollster = known_name
    return pollster


# Column layout shared by the scraped data and the comparison result
poll_columns = ["Pollster", "PollDate", "Tag"]

# Differences found for each poll type; combined into NewPolls_df at the end
diff_frames = []

# Main loop that goes through every poll type, scrapes the polls from the
# webpage and then compares them to the polls in the database
for poll_tag, TableIndex, poll_query in zip(loop_df['Tag'], loop_df['TableIndex'], loop_df['Query']):
    table_rows = tables[TableIndex].tbody.find_all("tr")

    # Column count = number of header cells in the first row that has any.
    # Reset for every table so a value from a previous table is never reused.
    no_cols = 0
    for row in table_rows:
        no_cols = len(row.find_all("th"))
        if no_cols > 0:
            break

    # Scraped polls for this table, collected in a list so that the dataframe
    # is built once instead of being re-concatenated for every row
    web_rows = []
    for row in table_rows:
        col = row.find_all("td")

        # Only rows with one data cell per header column are processed
        if not col or len(col) != no_cols:
            continue

        # Scrape the date (first cell) and the pollster (second cell)
        calcdate = _parse_poll_date(col[0].text.strip())
        pollster = col[1].text.strip()

        # Determine whether the poll is an MRP one or not; this must be
        # checked before the pollster text is normalised
        tag = "MRP" if ("MRP" in pollster or "SRP" in pollster) else poll_tag

        web_rows.append(
            {
                "Pollster": _normalise_pollster(pollster, pollsters_list),
                "PollDate": calcdate,
                "Tag": tag,
            }
        )

    WebData_df = pd.DataFrame(web_rows, columns=poll_columns)

    # Read the polls already in the database
    Database_df = pd.read_sql(poll_query, conn)
    Database_df['Tag'] = poll_tag

    # Rows that occur exactly once across both sources are the differences
    Diff_df = pd.concat([WebData_df, Database_df]).drop_duplicates(keep=False)
    diff_frames.append(Diff_df)

# Combine the differences of all poll types under one fresh index
if diff_frames:
    NewPolls_df = pd.concat(diff_frames, axis=0, ignore_index=True)
else:
    NewPolls_df = pd.DataFrame(columns=poll_columns)

In [16]:
# Debug check: display the table index left over from the final iteration of
# the comparison loop
TableIndex

40

In [17]:
# Show the MRP polls not found in the database
SelectMRPsQuery = "SELECT PollDate, Pollster FROM PollMeta WHERE PollType Like '%MRP%' ORDER BY PollDate DESC"
Database_df = pd.read_sql(SelectMRPsQuery, conn).assign(Tag='MRP')

# Relabel 'FindOutNow' as 'Electoral Calculus' before comparing
# (presumably to match the pollster name scraped from the web page - confirm)
Database_df = Database_df.replace(to_replace='FindOutNow', value='Electoral Calculus')

# Rows that occur exactly once across the scraped MRP polls and the database
web_mrp_polls = NewPolls_df[NewPolls_df['Tag'] == 'MRP']
Diff_df = pd.concat([web_mrp_polls, Database_df]).drop_duplicates(keep=False)
Diff_df

Unnamed: 0,Pollster,PollDate,Tag
0,Survation,2024-07-03,MRP


In [18]:
# Show the GB polls dated after 2023-12-31 that were not found in the database
after_2023 = NewPolls_df['PollDate'] > '2023-12-31'
tagged_gb = NewPolls_df['Tag'] == 'GB'
NewPolls_df[after_2023 & tagged_gb]

Unnamed: 0,Pollster,PollDate,Tag
0,2024 general election,2024-07-04,GB


In [19]:
# London polls that appear in the comparison result (not found in the database)
NewPolls_df.loc[NewPolls_df['Tag'].eq('London')]

Unnamed: 0,Pollster,PollDate,Tag


In [20]:
# Northern Ireland polls that appear in the comparison result (not found in the database)
NewPolls_df.loc[NewPolls_df['Tag'].eq('Northern Ireland')]

Unnamed: 0,Pollster,PollDate,Tag


In [21]:
# Show the Scotland polls dated after 2023-12-31 that were not found in the database
after_2023 = NewPolls_df['PollDate'] > '2023-12-31'
tagged_scotland = NewPolls_df['Tag'] == 'Scotland'
NewPolls_df[after_2023 & tagged_scotland]

Unnamed: 0,Pollster,PollDate,Tag


In [22]:
# Wales polls that appear in the comparison result (not found in the database)
NewPolls_df.loc[NewPolls_df['Tag'].eq('Wales')]

Unnamed: 0,Pollster,PollDate,Tag


In [23]:
# JL Partners Red Wall polls that appear in the comparison result (not found in the database)
NewPolls_df.loc[NewPolls_df['Tag'].eq('JLPRedWall')]

Unnamed: 0,Pollster,PollDate,Tag


In [24]:
# R&W Red Wall polls that appear in the comparison result (not found in the database)
NewPolls_df.loc[NewPolls_df['Tag'].eq('RWRedWall')]

Unnamed: 0,Pollster,PollDate,Tag


In [25]:
# R&W Blue Wall polls that appear in the comparison result (not found in the database)
NewPolls_df.loc[NewPolls_df['Tag'].eq('RWBlueWall')]

Unnamed: 0,Pollster,PollDate,Tag


In [26]:
# More In Common Blue Wall polls that appear in the comparison result (not found in the database)
NewPolls_df.loc[NewPolls_df['Tag'].eq('MICBlueWall')]

Unnamed: 0,Pollster,PollDate,Tag


In [27]:
# Close the database connection opened at the start of the notebook
conn.close()