In [None]:
# Identify the algorithm by name, version and date stamp
# NOTE(review): the spelling "Alogrithm" inside AlgorithmName is kept exactly as it was;
# confirm whether anything depends on the current value before correcting it.
AlgorithmName = "ElectionAnalysisAlogrithm"
AlgorithmVersion = "0_1"
AlgorithmDate = "20230602"

# Build the ElectionAnalysisAlgorithm identifier in the form <name>_<version>_<date>
ElectionAnalysisAlgorithm = "_".join((AlgorithmName, AlgorithmVersion, AlgorithmDate))

In [None]:
# Import required modules
import datetime
import urllib
import urllib.parse

import ipywidgets as widgets
import numpy as np
import pandas as pd

# Modules required for database access
import pyodbc
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import text

In [None]:
# Boolean to set whether an election has been called yet or not.
# Edited by hand; the following cell uses it to choose between the two sets of
# poll validity periods.
ElectionCalled = False

In [None]:
# Poll validity periods. DetailedValidPeriod is later compared against a poll's
# age as a number of days; NationalValidPeriod is presumably in days too (it is
# not used in the cells visible here - TODO confirm).
# Shorter periods apply once an election has been called.
if ElectionCalled:
    NationalValidPeriod = 7
    DetailedValidPeriod = 30
else:
    NationalValidPeriod = 30
    DetailedValidPeriod = 90

In [None]:
# Date picker for choosing the day the prediction applies to; pre-filled with today's date
ElectionPredictionDate_widget = widgets.DatePicker(
    value=datetime.date.today(),
    description="Date to be analysed:",
)
display(ElectionPredictionDate_widget)

In [None]:
# Read the date currently selected in the picker
ElectionPredictionDate = ElectionPredictionDate_widget.value

# The picker's value can be None (e.g. when the field has been cleared). Stop here
# with a clear message rather than letting None become NaT in the date arithmetic
# of later cells, where every poll comparison would silently evaluate to False.
if ElectionPredictionDate is None:
    raise ValueError("No prediction date selected - choose a date in the picker and re-run this cell")

In [None]:
# Connect to the 'UK_General_Election' database using SQLAlchemy over pyodbc
connection_str = ";".join([
    "DRIVER={SQL SERVER}",
    "SERVER=DANZPOOTA",
    "DATABASE=UK_General_Election",
    "TRUSTED_CONNECTION=YES",
])

# URL-encode the raw ODBC string so it can be passed in the odbc_connect query parameter
params = urllib.parse.quote_plus(connection_str)
engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")

# Single connection shared by the queries and inserts in the rest of the notebook
conn = engine.connect()

In [None]:
# Get the list of polls that have actually been analysed and incorporated into the database
AnalysedPollsQuery = """SELECT pam.PollID, pm.PollType, pm.PollScope, pm.PollDate, rt.RegionTypeRank  FROM PollAnalysisMeta AS pam
INNER JOIN PollMeta AS pm ON pm.PollID = pam.PollID
INNER JOIN RegionTypes AS rt ON rt.RegionType = pm.PollType"""

AnalysedPolls_df = pd.read_sql(AnalysedPollsQuery, conn)

# Convert the date column to datetime type
AnalysedPolls_df['PollDate'] = pd.to_datetime(AnalysedPolls_df['PollDate'])

# Add the prediction date as a timestamp so that it can be used in a calculation
AnalysedPolls_df['PredictionDate'] = pd.Timestamp(ElectionPredictionDate)

# Determine how many days before the prediction date a poll was taken
# (negative when the poll is dated after the prediction date)
AnalysedPolls_df['DateDelta'] = AnalysedPolls_df['PredictionDate'] - AnalysedPolls_df['PollDate']

# Determine the applicability of each poll: 1 when the poll was taken on or before
# the prediction date and is no older than the valid period, otherwise 0.
# The previous condition (DateDelta > valid period) flagged only the expired polls.
# NOTE(review): NationalValidPeriod is not applied here; confirm whether national
# polls should use it instead of DetailedValidPeriod.
PollAge = AnalysedPolls_df['DateDelta']
WithinValidPeriod = (PollAge >= pd.Timedelta(0)) & (PollAge <= pd.Timedelta(DetailedValidPeriod, unit="d"))
AnalysedPolls_df['PollApplicability'] = np.where(WithinValidPeriod, 1, 0)

# Assign a rank to each poll: polls covering all regions keep the rank of their
# region type, polls scoped to a single region get that rank minus one
AnalysedPolls_df['PollRank'] = np.where(
    AnalysedPolls_df['PollScope'] == 'All',
    AnalysedPolls_df['RegionTypeRank'],
    AnalysedPolls_df['RegionTypeRank'] - 1,
)

AnalysedPolls_df

In [None]:
# Scratch check: show how a 90-day Timedelta is rendered
delta = pd.Timedelta(days=90)
print(delta)

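Division between the new workbook (F, the cells above) and the old workbook (E, the cells below). The cells below still reference names such as `PollWidget` and `PollAnalysisAlgorithm` that are not defined in the cells above.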

In [None]:
# Get the Poll Meta data from the database
PollID = PollWidget.value

# The poll ID is passed as a bound parameter (:PollID) rather than spliced into
# the SQL text, so quoting inside the value cannot alter or break the statement
PollMetaQuery = "SELECT * From PollMeta WHERE PollID = :PollID"
PollMeta_df = pd.read_sql(text(PollMetaQuery), conn, params={"PollID": PollID})

In [None]:
# Display the metadata returned for the selected poll
PollMeta_df

In [None]:
# Read the poll's type and scope from the row labelled 0 of the metadata frame
PollType, PollScope = (PollMeta_df.at[0, column] for column in ('PollType', 'PollScope'))
print(PollType, PollScope)

In [None]:
# Query template for generating the list of applicable poll regions.
# '<PollType>' is a placeholder that is substituted with the poll's type before
# the query is run.
AllQuery = "SELECT RegionName FROM RegionRegionTypes WHERE RegionType = '<PollType>'"

In [None]:
# Generate a list of all of the regions applicable to a poll
if PollScope == "All":
    # Substitute into a separate name so the AllQuery template keeps its placeholder;
    # overwriting the template meant re-running this cell for a different poll
    # silently reused the previous poll type.
    RegionsQuery = AllQuery.replace("<PollType>", PollType)

    # The query selects a single column (RegionName); take it as a plain list.
    # Run on the notebook's shared connection like the other queries.
    RegionsList = pd.read_sql(RegionsQuery, conn).iloc[:, 0].tolist()
else:
    # A poll scoped to one region applies to that region only
    RegionsList = [PollScope]

print(RegionsList)

In [None]:
# Get from database all of the region vote shares relating to the PollID.
# The poll ID is passed as a bound parameter (:PollID) rather than spliced into
# the SQL text; rows are ordered by region then party.
PollRegionSharesQuery = "SELECT PollDetailsID, RegionName, Party, VoteShare AS PollShare FROM PollDetails WHERE PollID = :PollID ORDER BY RegionName, Party"
PollRegionShares_df = pd.read_sql(text(PollRegionSharesQuery), conn, params={"PollID": PollID})
PollRegionShares_df.tail()

In [None]:
# Query template for the region vote shares from the previous election.
# For one region ('<RegionName>') of one region type ('<RegionType>') it returns a
# row per party with:
#   TotalVotes         - sum of the party's previous votes
#   RawPreviousShare   - the party's previous votes divided by the previous votes
#                        summed over every row the query returns
#   PreviousCandidates - sum of PreviousStanding
#   CurrentCandidates  - sum of CurrentStanding
# Rows are ordered by region then party.
PreviousRegionSharesQuery = """SELECT r.RegionName, can.Party, SUM(can.PreviousVotes) AS 'TotalVotes',
CAST(SUM(can.PreviousVotes) AS FLOAT) / SUM(SUM(can.PreviousVotes)) OVER() AS 'RawPreviousShare',
SUM(can.PreviousStanding) AS 'PreviousCandidates',
SUM(can.CurrentStanding) AS 'CurrentCandidates'
FROM Candidates AS can
INNER JOIN Constituencies as con ON con.ConstituencyName = can.Constituency
INNER JOIN RegionConstituencies AS rc ON  rc.ConstituencyName = con.ConstituencyName
INNER JOIN Regions AS r ON r.RegionName = rc.RegionName
INNER JOIN RegionRegionTypes AS rrt ON rrt.RegionName = r.RegionName
WHERE r.RegionName = '<RegionName>' AND rrt.RegionType = '<RegionType>'
GROUP BY r.RegionName, can.Party
ORDER BY r.RegionName, can.Party"""

# Fix the region type now; '<RegionName>' is left in place to be substituted per region later
PreviousRegionSharesQuery = PreviousRegionSharesQuery.replace("<RegionType>",PollType)

In [None]:
# Get from database all of the vote shares for the poll regions from the previous election
PreviousRegionSharesColumns = ["RegionName","Party","TotalVotes","RawPreviousShare","PreviousCandidates","CurrentCandidates"]

# Loop through all regions applicable to this particular poll, collecting one
# frame per region so that they can be concatenated in a single step
RegionShareFrames = []
for Region in RegionsList:
    # Run query for the raw previous region shares
    ModRegionSharesQuery = PreviousRegionSharesQuery.replace("<RegionName>", Region)
    IndRegionShares_df = pd.read_sql(ModRegionSharesQuery, conn)
    RegionShareFrames.append(IndRegionShares_df)

if RegionShareFrames:
    # ignore_index gives a fresh 0..n-1 index across all regions
    PreviousRegionShares_df = pd.concat(RegionShareFrames, axis=0, ignore_index=True)
else:
    # No applicable regions: keep an empty frame with the expected columns
    PreviousRegionShares_df = pd.DataFrame(columns=PreviousRegionSharesColumns)

# Use 1000 in place of 0 as the divisor so the adjusted share can be calculated
# without dividing by zero. The substitution is made on a temporary Series, so
# the PreviousCandidates column itself is never modified.
SafePreviousCandidates = PreviousRegionShares_df['PreviousCandidates'].replace(0, 1000)

# Calculate the adjusted previous share based on the number of candidates actually standing
PreviousRegionShares_df['PreviousShare'] = (
    PreviousRegionShares_df['RawPreviousShare']
    * PreviousRegionShares_df['CurrentCandidates']
    / SafePreviousCandidates
)

PreviousRegionShares_df.tail()

In [None]:
# Compare previous election vote shares to poll vote shares to determine swings.
# The previous shares are copied across by position, so the two extracted frames
# must list exactly the same (RegionName, Party) pairs in the same order.
MatchColumns = ['RegionName', 'Party']
if not PollRegionShares_df[MatchColumns].equals(PreviousRegionShares_df[MatchColumns]):
    # Stop here: continuing without a Swing column only fails later and less clearly
    raise ValueError("NO MATCH! Poll regions/parties differ from those of the previous election")

PollRegionShares_df['PreviousShare'] = PreviousRegionShares_df['PreviousShare'].values
PollRegionShares_df['Swing'] = PollRegionShares_df['PollShare'] - PollRegionShares_df['PreviousShare']

In [None]:
# Show the last few rows of the poll region shares
PollRegionShares_df.tail()

In [None]:
# Build the single PollAnalysisMeta row for this poll and append it to the database table.
# The date of the analysis is always today's date.
# NOTE(review): PollAnalysisAlgorithm is not assigned in the cells visible in this
# notebook (the algorithm cell defines ElectionAnalysisAlgorithm) - confirm which
# name is intended.
PollAnalysisMeta_df = pd.DataFrame(
    [
        {
            "PollID": PollID,
            "PollAnalysisDate": datetime.date.today(),
            "PollAnalysisAlgorithm": PollAnalysisAlgorithm,
        }
    ],
    columns=["PollID", "PollAnalysisDate", "PollAnalysisAlgorithm"],
)

PollAnalysisMeta_df.to_sql(name='PollAnalysisMeta', con=conn, if_exists='append', index=False)

In [None]:
# Initial poll analysis values are now inserted into the database to allow these to be queried for the constituency shares.
# Take an explicit copy so that adding a column below does not write to a slice
# of PollRegionShares_df.
PollAnalysisRegions_df = PollRegionShares_df[['PollDetailsID', 'Swing']].copy()

# Get the most recent PollAnalysisID for this poll from the database.
# Without an ORDER BY the row returned is arbitrary when the poll has been
# analysed more than once.
# NOTE(review): the PollAnalysisID tie-break assumes IDs increase with insertion - confirm.
PollAnalysisIDQuery = """SELECT TOP 1 PollAnalysisID FROM PollAnalysisMeta
WHERE PollID = :PollID
ORDER BY PollAnalysisDate DESC, PollAnalysisID DESC"""

# Executed on the notebook's shared connection with the poll ID as a bound parameter
PollAnalysisID = conn.execute(text(PollAnalysisIDQuery), {"PollID": PollID}).scalar()
if PollAnalysisID is None:
    raise LookupError("No PollAnalysisMeta row found for PollID %r" % (PollID,))

PollAnalysisRegions_df['PollAnalysisID'] = PollAnalysisID

PollAnalysisRegions_df.to_sql('PollAnalysisRegions', conn, if_exists='append', index=False)

In [None]:
# Query template for pulling out previous election shares for applicable constituencies.
# For one region ('<RegionName>') of one region type ('<RegionType>') it returns a
# row per candidate with CurrentStanding = 1, ordered by constituency then party.
CandidatesQuery = """SELECT can.CandidateID, r.RegionName, can.Constituency, can.Party, can.PreviousShare
FROM Candidates AS can
INNER JOIN Constituencies as con ON con.ConstituencyName = can.Constituency
INNER JOIN RegionConstituencies AS rc ON  rc.ConstituencyName = con.ConstituencyName
INNER JOIN Regions AS r ON r.RegionName = rc.RegionName
INNER JOIN RegionRegionTypes AS rrt ON rrt.RegionName = r.RegionName
WHERE r.RegionName = '<RegionName>' AND rrt.RegionType = '<RegionType>'
AND can.CurrentStanding = 1
ORDER BY can.Constituency, can.Party"""

# Fix the region type now; '<RegionName>' is left in place to be substituted per region later
CandidatesQuery = CandidatesQuery.replace("<RegionType>",PollType)

In [None]:
# Query template for getting the applicable swing for a party and region.
# '<Party>' and '<RegionName>' are left as placeholders to be substituted later;
# it returns the PollAnalysisRegionID and Swing for the given poll analysis.
SwingQuery = """SELECT par.PollAnalysisRegionID, par.Swing FROM PollAnalysisRegions AS par
INNER JOIN PollAnalysisMeta AS pam ON pam.PollAnalysisID = par.PollAnalysisID
INNER JOIN PollDetails AS pd ON pd.PollDetailsID = par.PollDetailsID
WHERE pd.Party = '<Party>' AND pd.RegionName = '<RegionName>'
AND pam.PollAnalysisID = '<PollAnalysisID>'"""

# PollAnalysisID comes straight from a database query and may not be a string;
# str.replace raises TypeError for a non-string replacement, so convert explicitly
SwingQuery = SwingQuery.replace('<PollAnalysisID>', str(PollAnalysisID))

In [None]:
# Calculate the swings for every candidate in every constituency
ConstituencySharesColumns = ["CandidateID","RegionName","Constituency","Party","PreviousShare","Swing","NewShareRaw","VoteShare","PollAnalysisRegionID"]

# Need to cycle through every region to pull out the candidates for each in turn,
# then get the swing for each. One frame is collected per region and they are
# concatenated in a single step afterwards.
RegionFrames = []
for Region in RegionsList:

    ModCandidatesQuery = CandidatesQuery.replace("<RegionName>", Region)
    IndConstituencyShares_df = pd.read_sql(ModCandidatesQuery, conn)

    RegionSwingQuery = SwingQuery.replace("<RegionName>", Region)

    # The swing depends only on (region, party), so query once per distinct party
    # instead of twice per candidate row
    SwingByParty = {}
    RegionIDByParty = {}
    for Party in IndConstituencyShares_df["Party"].unique():
        PartySwingQuery = RegionSwingQuery.replace("<Party>", Party)
        PartySwing_df = pd.read_sql(PartySwingQuery, conn)
        if PartySwing_df.empty:
            raise LookupError("No swing found for party %r in region %r" % (Party, Region))
        # Column order follows the SELECT list: PollAnalysisRegionID, then Swing
        RegionIDByParty[Party] = PartySwing_df.iloc[0, 0]
        SwingByParty[Party] = PartySwing_df.iloc[0, 1]

    IndConstituencyShares_df["Swing"] = IndConstituencyShares_df["Party"].map(SwingByParty)
    IndConstituencyShares_df["PollAnalysisRegionID"] = IndConstituencyShares_df["Party"].map(RegionIDByParty)

    RegionFrames.append(IndConstituencyShares_df)

# Combine all regions with a fresh 0..n-1 index; reindex adds the columns that are
# calculated below and fixes the column layout (also when there are no regions)
ConstituencyShares_df = (
    pd.concat(RegionFrames, axis=0, ignore_index=True) if RegionFrames else pd.DataFrame()
).reindex(columns=ConstituencySharesColumns)

# Ensure the datatypes are numeric of the columns to be used in the calculation
ConstituencyShares_df["PreviousShare"] = pd.to_numeric(ConstituencyShares_df["PreviousShare"])
ConstituencyShares_df["Swing"] = pd.to_numeric(ConstituencyShares_df["Swing"])

# Apply the swing to the previous share; a share cannot go below zero
ConstituencyShares_df["NewShareRaw"] = (
    ConstituencyShares_df["PreviousShare"] + ConstituencyShares_df["Swing"]
).clip(lower=0)

# Determine the factor needed to ensure vote shares for each constituency sum to 1
ConstituencyShares_df['ConstRawShareTotals'] = ConstituencyShares_df.groupby('Constituency')['NewShareRaw'].transform('sum')

# Modify the raw vote shares to ensure they sum to 1
# (a constituency whose raw shares total 0 yields NaN here)
ConstituencyShares_df['VoteShare'] = ConstituencyShares_df['NewShareRaw'] / ConstituencyShares_df['ConstRawShareTotals']

# Per-constituency total of the normalised shares, kept as a visual check
ConstituencyShares_df['VoteShareCheck'] = ConstituencyShares_df.groupby('Constituency')['VoteShare'].transform('sum')

ConstituencyShares_df.tail()

In [None]:
# Select the three columns stored per candidate and append them to the
# PollAnalysisConstituencies table
PollAnalysisConstituencies_df = ConstituencyShares_df.loc[
    :, ['PollAnalysisRegionID', 'CandidateID', 'VoteShare']
]
PollAnalysisConstituencies_df.to_sql(
    name='PollAnalysisConstituencies',
    con=conn,
    if_exists='append',
    index=False,
)

In [None]:
# Close the connection with the database now that all queries and inserts are done
conn.close()