Python Jupyter Notebook that combines two previously separate algorithms for:
1. Taking imported raw polls and converting to vote shares for every candidate in every constituency
2. Analysing election results for a range of specified dates

In [1]:
# Algorithm identification
# (the spelling of the name is kept exactly as-is: the combined identifier built
# below is written to PollAnalysisMeta by the poll analysis loop)
AlgorithmName = "ComboPollElectionAlogrithm"
AlgorithmVersion = "2_8"
AlgorithmDate = "20240624"

# Version history
# 0_1 Initial test combination of version 1_1 of the election analysis algorithm and version 3_1 of the poll analysis algorithm
# 0_2 Incorporates a loop around different analysis dates and the ability to delete analyses if they already exist for a particular date
# 0_3 Includes a check to use as the analysis start date: the next date after the last date analysed in the database OR the date of the earliest poll to be added, whichever is sooner
# 0_4 Check of validity periods now incorporated into the overall loop so it changes for each analysis date
# 1_0 First full version after testing previous incorporations
# 1_1 Extension of detailed poll applicability to 100 days due to issues with NI polls
# 1_2 Updated if statement for earliest date to start an analysis from
# 1_3 All applicable polls for each rank are now queried in one go to streamline the algorithm
# 2_0 Tested and checked streamlined version of 1_3
# 2_1 Change to an 'MRP' poll type to distinguish from the 'Constituency' poll type
# 2_2 Change so that the poll's time delta is less than or EQUAL TO the validity period
# 2_3 Change of rank of all national poll types to 8
# 2_4 Change back of rank to 6 for more detailed national polls
# 2_5 Change of averaging methodology to take account of the number of polls at each rank
# 2_6 First test version of restructuring of the database to include a more distinct MRP poll type
# 2_7 Version after first successful test of the poll import section
# 2_8 Successful test of integrating the new election analysis method for 1 date

# Both analysis stages are tagged with the same "<name>_<version>_<date>" identifier
PollAnalysisAlgorithm = "_".join([AlgorithmName, AlgorithmVersion, AlgorithmDate])
ElectionAnalysisAlgorithm = PollAnalysisAlgorithm

In [2]:
# Import required modules
import datetime
import urllib
import urllib.parse
from datetime import timedelta

import numpy as np
import pandas as pd
import pyodbc
import sqlalchemy
from sqlalchemy import create_engine

In [3]:
# Core settings used by the rest of the algorithm

# Date of the election
ElectionDate = datetime.date(year=2024, month=7, day=4)

# If True, the start date is the next date after the last date analysed in the
# database OR the date of the earliest poll to be added, whichever is sooner
AutoStartDateFlag = True
# If True, the end date is the date of the last poll
AutoEndDateFlag = True

# Start / end dates of the election analyses when the automatic dates are not used
ManualStartDate = datetime.date(year=2024, month=1, day=1)
ManualEndDate = datetime.date(year=2024, month=6, day=21)

# If True, already existing analyses will be deleted
DeleteAnalysesFlag = True

# Rank threshold for identifying, from a poll's rank, whether it is a 'national' or 'detailed' poll
DetailedRankThreshold = 5

In [4]:
# Connect to database 'UK_General_Election' using SQLAlchemy
# The raw ODBC connection string is URL-encoded and handed to the mssql+pyodbc
# dialect through its `odbc_connect` query parameter.
# `urllib.parse` is a submodule and must be imported explicitly (see the import
# cell); `import urllib` on its own does not guarantee it is loaded.
connection_str = "DRIVER={SQL SERVER};SERVER=DANZPOOTA;DATABASE=UK_General_Election;TRUSTED_CONNECTION=YES"
params = urllib.parse.quote_plus(connection_str)
engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")

# Single shared connection used by the pandas read/write calls below
conn = engine.connect()

In [5]:
# Determine the start and end date for the election analysis loop
#
# Start date (automatic mode): the day after the last analysed date OR the date
# of the earliest poll that has not been analysed yet, whichever is sooner.
# End date (automatic mode): the date of the most recent poll.
if not AutoStartDateFlag:
    StartDate = ManualStartDate
else:
    # Most recent date that already has an election prediction in the database
    LastAnalysisDateQuery = "SELECT TOP(1) ElectionPredictionDate FROM ElectionPredictionMeta ORDER BY ElectionPredictionDate DESC"
    LastAnalysisDateList = [i[0] for i in engine.execute(LastAnalysisDateQuery)]
    LastAnalysisDate = LastAnalysisDateList[0] if LastAnalysisDateList else None

    # Earliest poll with no row in PollAnalysisMeta (LEFT JOIN ... IS NULL anti-join)
    EarliestNewPollQuery = "SELECT TOP(1) pm.PollDate from PollMeta AS pm LEFT JOIN PollAnalysisMeta AS pam ON pm.PollID = pam.PollID WHERE pam.PollID IS NULL ORDER BY pm.PollDate ASC"
    EarliestNewPollList = [i[0] for i in engine.execute(EarliestNewPollQuery)]
    # "" remains the marker for 'no unanalysed polls'
    EarliestNewPollDate = EarliestNewPollList[0] if EarliestNewPollList else ""

    # Normalise both candidates to plain dates BEFORE comparing them, and only
    # compare values that actually exist. Comparing a date with the "" marker
    # (or a date with a datetime) raises TypeError.
    CandidateStartDates = []
    if LastAnalysisDate is not None:
        CandidateStartDates.append(pd.to_datetime(LastAnalysisDate).date() + timedelta(days=1))
    if EarliestNewPollDate != "":
        CandidateStartDates.append(pd.to_datetime(EarliestNewPollDate).date())

    if not CandidateStartDates:
        raise ValueError(
            "Cannot determine an automatic start date: no previous election analysis "
            "and no unanalysed polls were found. Set AutoStartDateFlag = False to use ManualStartDate."
        )

    # 'Whichever is sooner'
    StartDate = min(CandidateStartDates)

if not AutoEndDateFlag:
    EndDate = ManualEndDate
else:
    LastPollDateQuery = """SELECT TOP(1) PollDate from PollMeta Order by PollDate DESC"""
    LastPollDateList = [i[0] for i in engine.execute(LastPollDateQuery)]
    if not LastPollDateList:
        raise ValueError(
            "Cannot determine an automatic end date: PollMeta contains no polls. "
            "Set AutoEndDateFlag = False to use ManualEndDate."
        )
    LastPollDate = LastPollDateList[0]
    EndDate = pd.to_datetime(LastPollDate).date()

print(StartDate)
print(EndDate)

2024-06-21
2024-06-26


In [22]:
# List of polls for the main loop to iterate over
# NOTE: the list is hard-coded to a single PollID here; it is not queried from the database

PollsList =['20240624FindOutNowMRP632All']

In [23]:
%%time

# Main algorithm loop to convert poll vote shares into candidate vote shares
#
# For every PollID in PollsList:
#   1. read the poll's meta data and its published vote shares,
#   2. work out the swing against the previous election (per constituency for
#      'Constituency' and MRP polls, per region for every other poll type),
#   3. apply the swing to each candidate's previous share and renormalise so the
#      shares in every constituency sum to 1,
#   4. write PollAnalysisMeta, PollAnalysisRegions and PollAnalysisConstituencies.


def _EscapeSqlLiteral(Value):
    """Return Value as text safe to place inside a single-quoted T-SQL literal.

    Single quotes are doubled so that names containing an apostrophe do not
    terminate the literal early.
    """
    return str(Value).replace("'", "''")


for PollID in PollsList:

    # Get the Poll Meta data from the database
    PollMetaQuery = "SELECT * From PollMeta WHERE PollID = '<PollID>'"
    PollMetaQuery = PollMetaQuery.replace("<PollID>", _EscapeSqlLiteral(PollID))
    PollMeta_df = pd.read_sql(PollMetaQuery, conn)

    # Get key variables for polls
    PollType = PollMeta_df.at[0, 'PollType']
    PollScope = PollMeta_df.at[0, 'PollScope']

    # Query for generating list of applicable poll regions
    AllRegionsQuery = "SELECT RegionName FROM RegionRegionTypes WHERE RegionType = '<PollType>'"

    # Get from database all of the region vote shares relating to the PollID
    PollSharesQuery = "SELECT PollDetailsID, RegionName, Constituency, Party, VoteShare AS PollShare FROM PollDetails WHERE PollID = '<PollID>' ORDER BY RegionName, Party"
    PollSharesQuery = PollSharesQuery.replace("<PollID>", _EscapeSqlLiteral(PollID))
    PollShares_df = pd.read_sql(PollSharesQuery, conn)

    # Check to see if it is a constituency only poll as this needs to be handled very differently
    # The algorithm cannot be allowed to look for region shares as it will not find any for a constituency poll
    if PollType == "Constituency":
        ConstituencyPollFlag = True
        MRPPollFlag = False

        # For a constituency poll the PollScope holds the constituency name
        PreviousRegionSharesQuery = "SELECT CandidateID, Constituency, Party, PreviousShare FROM Candidates WHERE Constituency = '<Constituency>'"
        PreviousRegionSharesQuery = PreviousRegionSharesQuery.replace("<Constituency>", _EscapeSqlLiteral(PollScope))
        PreviousRegionShares_df = pd.read_sql(PreviousRegionSharesQuery, conn)

        ConstituencyPollRegion = PollShares_df.at[0, 'RegionName']

        # Join the dataframes on 'Party'
        PreviousRegionShares_df = PreviousRegionShares_df.merge(PollShares_df[['Party', 'PollDetailsID', 'PollShare']], how='left', on='Party')

        # Determine swing
        PreviousRegionShares_df['Swing'] = PreviousRegionShares_df['PollShare'] - PreviousRegionShares_df['PreviousShare']

        ConstituencyShares_df = PreviousRegionShares_df.copy()
        ConstituencyShares_df['RegionName'] = ConstituencyShares_df['Constituency']

    elif PollType == "MRP632" or PollType == "MRP631":
        MRPPollFlag = True
        ConstituencyPollFlag = False

        # The MRP poll type doubles as the name of the region listing the constituencies it covers
        PreviousRegionSharesQuery = """SELECT can.CandidateID, can.Constituency, can.Party, can.PreviousShare FROM Candidates AS can
        INNER JOIN Constituencies AS con ON con.ConstituencyName = can.Constituency
        INNER JOIN RegionConstituencies AS rc ON rc.ConstituencyName = con.ConstituencyName
        WHERE rc.RegionName = '<PollType>'"""
        PreviousRegionSharesQuery = PreviousRegionSharesQuery.replace("<PollType>", _EscapeSqlLiteral(PollType))
        PreviousRegionShares_df = pd.read_sql(PreviousRegionSharesQuery, conn)

        # Build a join key from constituency + party to match against Candidates.CandidateID
        PollShares_df['CandidateID'] = PollShares_df['Constituency'] + PollShares_df['Party']

        PreviousRegionShares_df = PreviousRegionShares_df.merge(PollShares_df[['CandidateID', 'PollDetailsID', 'PollShare']], how='left', on='CandidateID')

        PreviousRegionShares_df['Swing'] = PreviousRegionShares_df['PollShare'] - PreviousRegionShares_df['PreviousShare']

        ConstituencyShares_df = PreviousRegionShares_df.copy()
        ConstituencyShares_df['RegionName'] = ConstituencyShares_df['Constituency']

    else:
        ConstituencyPollFlag = False
        MRPPollFlag = False

        # Generate a list of all of the regions applicable to a poll when it is not a constituency one
        if PollScope == "All":
            AllRegionsQuery = AllRegionsQuery.replace("<PollType>", _EscapeSqlLiteral(PollType))
            RegionsList = [i[0] for i in engine.execute(AllRegionsQuery)]
        else:
            RegionsList = [PollScope]

        # Query for the region vote shares from the previous election
        PreviousRegionSharesQuery = """SELECT r.RegionName, can.Party, SUM(can.PreviousVotes) AS 'TotalVotes',
        CAST(SUM(can.PreviousVotes) AS FLOAT) / SUM(SUM(can.PreviousVotes)) OVER() AS 'RawPreviousShare',
        SUM(can.PreviousStanding) AS 'PreviousCandidates',
        SUM(can.CurrentStanding) AS 'CurrentCandidates'
        FROM Candidates AS can
        INNER JOIN Constituencies as con ON con.ConstituencyName = can.Constituency
        INNER JOIN RegionConstituencies AS rc ON  rc.ConstituencyName = con.ConstituencyName
        INNER JOIN Regions AS r ON r.RegionName = rc.RegionName
        INNER JOIN RegionRegionTypes AS rrt ON rrt.RegionName = r.RegionName
        WHERE r.RegionName = '<RegionName>' AND rrt.RegionType = '<RegionType>'
        GROUP BY r.RegionName, can.Party
        ORDER BY r.RegionName, can.Party"""

        PreviousRegionSharesQuery = PreviousRegionSharesQuery.replace("<RegionType>", _EscapeSqlLiteral(PollType))

        # Get from database all of the vote shares for the poll regions from the previous election.
        # The empty template keeps the expected columns even when RegionsList is empty;
        # the per-region frames are collected and concatenated once after the loop.
        RegionShareFrames = [pd.DataFrame(columns=["RegionName", "Party", "TotalVotes", "RawPreviousShare", "PreviousCandidates", "CurrentCandidates"])]

        # Loop through all regions applicable to this particular poll
        for Region in RegionsList:
            # Run query for the raw previous region shares
            ModRegionSharesQuery = PreviousRegionSharesQuery.replace("<RegionName>", _EscapeSqlLiteral(Region))
            IndRegionShares_df = pd.read_sql(ModRegionSharesQuery, conn)
            RegionShareFrames.append(IndRegionShares_df)

        PreviousRegionShares_df = pd.concat(RegionShareFrames, axis=0, ignore_index=True)

        # Calculate the adjusted previous share based on the number of candidates actually standing.
        # A party with no previous candidates would divide by zero, so 0 is swapped for 1000
        # in the denominator only; the 'PreviousCandidates' column itself is left untouched.
        # (A chained `df[col].replace(..., inplace=True)` is unreliable here: with pandas
        # Copy-on-Write it does not modify the frame at all.)
        SafePreviousCandidates = PreviousRegionShares_df['PreviousCandidates'].replace(0, 1000)
        PreviousRegionShares_df['PreviousShare'] = PreviousRegionShares_df['RawPreviousShare'] * PreviousRegionShares_df['CurrentCandidates'] / SafePreviousCandidates

        # Create a column to join the two dataframes
        PollShares_df['RegionParty'] = PollShares_df['RegionName'] + PollShares_df['Party']
        PreviousRegionShares_df['RegionParty'] = PreviousRegionShares_df['RegionName'] + PreviousRegionShares_df['Party']

        # Join the dataframes on 'RegionParty'
        PreviousRegionShares_df = PreviousRegionShares_df.merge(PollShares_df[['PollDetailsID', 'RegionParty', 'PollShare']], how='left', on='RegionParty')

        # Determine swing
        PreviousRegionShares_df['Swing'] = PreviousRegionShares_df['PollShare'] - PreviousRegionShares_df['PreviousShare']

        # Query for pulling out previous election shares for applicable constituencies
        CandidatesQuery = """SELECT can.CandidateID, r.RegionName, can.Constituency, can.Party, can.PreviousShare
        FROM Candidates AS can
        INNER JOIN Constituencies as con ON con.ConstituencyName = can.Constituency
        INNER JOIN RegionConstituencies AS rc ON  rc.ConstituencyName = con.ConstituencyName
        INNER JOIN Regions AS r ON r.RegionName = rc.RegionName
        INNER JOIN RegionRegionTypes AS rrt ON rrt.RegionName = r.RegionName
        WHERE r.RegionName = '<RegionName>' AND rrt.RegionType = '<RegionType>'
        AND can.CurrentStanding = 1
        ORDER BY can.Constituency, can.Party"""

        CandidatesQuery = CandidatesQuery.replace("<RegionType>", _EscapeSqlLiteral(PollType))

        # Calculate the swings for every candidate in every constituency
        IndConstituencyShares_df = pd.DataFrame(columns=["CandidateID", "RegionName", "Constituency", "Party", "PreviousShare", "NewShareRaw", "VoteShare"])
        ConstituencyFrames = [IndConstituencyShares_df.copy()]

        # Need to cycle through every region to pull out the candidates for each in turn, then get the swing for each
        for Region in RegionsList:
            ModCandidatesQuery = CandidatesQuery.replace("<RegionName>", _EscapeSqlLiteral(Region))
            IndConstituencyShares_df = pd.read_sql(ModCandidatesQuery, conn)
            ConstituencyFrames.append(IndConstituencyShares_df)

        ConstituencyShares_df = pd.concat(ConstituencyFrames, axis=0, ignore_index=True)

        # Create 'RegionParty' to allow merging with the previous region shares dataframe
        ConstituencyShares_df['RegionParty'] = ConstituencyShares_df['RegionName'] + ConstituencyShares_df['Party']

        # Join the dataframes on 'RegionParty'
        ConstituencyShares_df = ConstituencyShares_df.merge(PreviousRegionShares_df[['PollDetailsID', 'RegionParty', 'Swing']], how='left', on='RegionParty')

    # Exit of Constituency poll IF statement
    # NOTE(review): every merge above is a LEFT join, so a candidate/party with no
    # matching poll row carries NaN PollDetailsID and NaN Swing into the frames
    # written below. Confirm the target tables accept NULLs, or filter these rows.

    # Ensure the datatypes are numeric of the columns to be used in the calculation
    ConstituencyShares_df["PreviousShare"] = pd.to_numeric(ConstituencyShares_df["PreviousShare"])
    ConstituencyShares_df["Swing"] = pd.to_numeric(ConstituencyShares_df["Swing"])

    # Apply the swing; a share cannot be negative so it is floored at 0 (NaN stays NaN)
    ConstituencyShares_df["NewShareRaw"] = (ConstituencyShares_df["PreviousShare"] + ConstituencyShares_df["Swing"]).clip(lower=0)

    # Determine the factor needed to ensure vote shares for each constituency sum to 1
    ConstituencyShares_df['ConstRawShareTotals'] = ConstituencyShares_df.groupby('Constituency')['NewShareRaw'].transform('sum')

    # Modify the raw vote shares to ensure they sum to 1
    ConstituencyShares_df['VoteShare'] = ConstituencyShares_df['NewShareRaw'] / ConstituencyShares_df['ConstRawShareTotals']

    # Per-constituency total of the normalised shares, kept as a sanity-check column
    ConstituencyShares_df['VoteShareCheck'] = ConstituencyShares_df.groupby('Constituency')['VoteShare'].transform('sum')

    # Create PollAnalysisMeta details for inserting into database
    # The date of the analysis is always today's date
    PollAnalysisMeta_df = pd.DataFrame(
        [{
            "PollID": PollID,
            "PollAnalysisDate": datetime.date.today(),
            "PollAnalysisAlgorithm": PollAnalysisAlgorithm,
        }],
        columns=["PollID", "PollAnalysisDate", "PollAnalysisAlgorithm"],
    )

    PollAnalysisMeta_df.to_sql('PollAnalysisMeta', conn, if_exists='append', index=False)

    # Initial poll analysis values are now inserted into the database to allow these to be queried for the constituency shares
    PollAnalysisRegions_df = PreviousRegionShares_df[['PollDetailsID', 'Swing']].copy()

    # Get the recently inserted PollAnalysisID from the database.
    # The lookup is restricted to this algorithm and takes the newest analysis date,
    # so an older analysis of the same poll cannot be picked up by accident.
    PollAnalysisIDQuery = (
        "SELECT TOP(1) PollAnalysisID FROM PollAnalysisMeta "
        "WHERE PollID = '<PollID>' AND PollAnalysisAlgorithm = '<PollAnalysisAlgorithm>' "
        "ORDER BY PollAnalysisDate DESC"
    )
    PollAnalysisIDQuery = PollAnalysisIDQuery.replace("<PollID>", _EscapeSqlLiteral(PollID))
    PollAnalysisIDQuery = PollAnalysisIDQuery.replace("<PollAnalysisAlgorithm>", _EscapeSqlLiteral(PollAnalysisAlgorithm))

    PollAnalysisID = [i[0] for i in engine.execute(PollAnalysisIDQuery)][0]
    PollAnalysisRegions_df['PollAnalysisID'] = PollAnalysisID

    PollAnalysisRegions_df.to_sql('PollAnalysisRegions', conn, if_exists='append', index=False)

    # Create the dataframe for insertion into the database and insert
    # PollAnalysisRegionID is formed by combining PollDetailsID with PollAnalysisID
    ConstituencyShares_df['PollAnalysisRegionID'] = ConstituencyShares_df['PollDetailsID'] + PollAnalysisID
    PollAnalysisConstituencies_df = ConstituencyShares_df[['PollAnalysisRegionID', 'CandidateID', 'VoteShare']].copy()
    PollAnalysisConstituencies_df.to_sql('PollAnalysisConstituencies', conn, if_exists='append', index=False)

CPU times: total: 15.6 ms
Wall time: 1.15 s


Separation between the previously segregated poll analysis and election analysis algorithms

In [12]:
# Close the connection with the database
# (`conn` is the connection opened with engine.connect() in the connection cell)
conn.close()

In [8]:
# Inspect the frame (PollDetailsID, Swing, PollAnalysisID) that the main loop
# appends to the PollAnalysisRegions table
PollAnalysisRegions_df

Unnamed: 0,PollDetailsID,Swing,PollAnalysisID
0,20240624FindOutNowMRP632AllWalesAberafan Maest...,-0.132564,2024062720240624FindOutNowMRP632AllComboPollEl...
1,20240624FindOutNowMRP632AllWalesAberafan Maest...,0.028244,2024062720240624FindOutNowMRP632AllComboPollEl...
2,20240624FindOutNowMRP632AllWalesAberafan Maest...,0.034119,2024062720240624FindOutNowMRP632AllComboPollEl...
3,20240624FindOutNowMRP632AllWalesAberafan Maest...,-0.014002,2024062720240624FindOutNowMRP632AllComboPollEl...
4,20240624FindOutNowMRP632AllWalesAberafan Maest...,0.005195,2024062720240624FindOutNowMRP632AllComboPollEl...
...,...,...,...
3799,20240624FindOutNowMRP632AllYorkshire and The H...,0.041922,2024062720240624FindOutNowMRP632AllComboPollEl...
3800,20240624FindOutNowMRP632AllYorkshire and The H...,0.185414,2024062720240624FindOutNowMRP632AllComboPollEl...
3801,20240624FindOutNowMRP632AllYorkshire and The H...,-0.096872,2024062720240624FindOutNowMRP632AllComboPollEl...
3802,20240624FindOutNowMRP632AllYorkshire and The H...,-0.001775,2024062720240624FindOutNowMRP632AllComboPollEl...


In [12]:
# Save the poll analysis regions dataframe to a csv for testing
# NOTE(review): the output path is an absolute, machine-specific location
PollAnalysisRegions_df.to_csv('C:/Users/danmu/Documents/Elections/2024_Python/PollAnalysisRegions.csv',encoding='utf-8')

In [9]:
# Inspect the frame that the main loop appends to the PollAnalysisConstituencies table
# NOTE(review): the recorded output of this cell is a NameError, i.e. the frame had
# not been created in the kernel when the cell was run
PollAnalysisConstituencies_df

NameError: name 'PollAnalysisConstituencies_df' is not defined

In [21]:
# Debugging aid: show the row with index label 494 of the previous-share frame.
# In the recorded output this row has a PreviousShare but empty PollDetailsID,
# PollShare and Swing.
PreviousRegionShares_df.filter(items=[494], axis=0)

Unnamed: 0,CandidateID,Constituency,Party,PreviousShare,PollDetailsID,PollShare,Swing
494,Bridlington and The WoldsCon,Bridlington and The Wolds,Con,0.665719,,,
