In [1]:
# Algorithm details
AlgorithmName = "PollAnalysisAlogrithm"
AlgorithmVersion = "1_0"
AlgorithmDate = "20230602"

# Construct PollAnalysisAlogrithm string
PollAnalysisAlgorithm = AlgorithmName + "_" + AlgorithmVersion + "_" + AlgorithmDate

In [2]:
# Import required modules
import pandas as pd
import datetime
import ipywidgets as widgets
import numpy as np

#Modules required for database access
import pyodbc
import sqlalchemy
from sqlalchemy import create_engine
import urllib

In [3]:
#Connect to database 'UK_General_Election' using SQlAlchemy
connection_str = "DRIVER={SQL SERVER};SERVER=DANZPOOTA;DATABASE=UK_General_Election;TRUSTED_CONNECTION=YES"
params = urllib.parse.quote_plus(connection_str)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
conn = engine.connect()

In [4]:
# Request PollID from user - pull list of unanalysed polls from database
PollsListQuery = "SELECT A.PollID from PollMeta A LEFT JOIN PollAnalysisMeta B ON A.PollID = B.PollID WHERE B.PollID IS NULL"
PollsList = [i[0] for i in engine.execute(PollsListQuery)]

PollWidget = widgets.Dropdown(
    options = PollsList,
    rows = len(PollsList),
    description = 'Select poll to be analysed:'
    )
display(PollWidget)

Dropdown(description='Select poll to be analysed:', options=('20200126YouGovNationWales', '20200306YouGovLondo…

In [5]:
# Get the Poll Meta data from the database
PollID = PollWidget.value
PollMetaQuery = "SELECT * From PollMeta WHERE PollID = '<PollID>'"
PollMetaQuery = PollMetaQuery.replace("<PollID>",PollID)
PollMeta_df = pd.read_sql(PollMetaQuery,conn)

In [6]:
PollMeta_df

Unnamed: 0,PollID,Pollster,PollType,PollDate,PollScope,PollScopeAll,PollScopeRegion,PollScopeConst,SampleSize,PollLink
0,20200407YouGovWalesRegionAll,YouGov,WalesRegion,2020-04-07,All,True,,,1008,https://docs.cdn.yougov.com/kg4inoaeii/Results...


In [7]:
# Get key variables for polls
PollType = PollMeta_df.at[0,'PollType']
PollScope = PollMeta_df.at[0,'PollScope']
print(PollType,PollScope)

WalesRegion All


In [8]:
#Queries for generating list of applicable poll regions
AllQuery = "SELECT RegionName FROM RegionRegionTypes WHERE RegionType = '<PollType>'"

In [9]:
# Generate a list of all of the regions applicable to a poll
if PollScope == "All":
    AllQuery = AllQuery.replace("<PollType>",PollType)
    RegionsList = [i[0] for i in engine.execute(AllQuery)]
else:
    RegionsList = [PollScope]

print(RegionsList)

['Cardiff and South Central Wales', 'Mid and West Wales', 'North Wales', 'South East Wales', 'South West Wales']


In [10]:
# Get from database all of the region vote shares relating to the PollID
PollRegionSharesQuery = "SELECT PollDetailsID, RegionName, Party, VoteShare AS PollShare FROM PollDetails WHERE PollID = '<PollID>' ORDER BY RegionName, Party"
PollRegionSharesQuery = PollRegionSharesQuery.replace("<PollID>",PollID)
PollRegionShares_df = pd.read_sql(PollRegionSharesQuery,conn)
PollRegionShares_df.tail()

Unnamed: 0,PollDetailsID,RegionName,Party,PollShare
30,20200407YouGovWalesRegionAllSouth West WalesLab,South West Wales,Lab,0.38
31,20200407YouGovWalesRegionAllSouth West WalesLD,South West Wales,LD,0.04
32,20200407YouGovWalesRegionAllSouth West WalesOther,South West Wales,Other,0.0
33,20200407YouGovWalesRegionAllSouth West WalesPC,South West Wales,PC,0.15
34,20200407YouGovWalesRegionAllSouth West WalesRe...,South West Wales,Reform,0.01


In [11]:
# Query for the region vote shares from the previous election
PreviousRegionSharesQuery = """SELECT r.RegionName, can.Party, SUM(can.PreviousVotes) AS 'TotalVotes',
CAST(SUM(can.PreviousVotes) AS FLOAT) / SUM(SUM(can.PreviousVotes)) OVER() AS 'RawPreviousShare',
SUM(can.PreviousStanding) AS 'PreviousCandidates',
SUM(can.CurrentStanding) AS 'CurrentCandidates'
FROM Candidates AS can
INNER JOIN Constituencies as con ON con.ConstituencyName = can.Constituency
INNER JOIN RegionConstituencies AS rc ON  rc.ConstituencyName = con.ConstituencyName
INNER JOIN Regions AS r ON r.RegionName = rc.RegionName
INNER JOIN RegionRegionTypes AS rrt ON rrt.RegionName = r.RegionName
WHERE r.RegionName = '<RegionName>' AND rrt.RegionType = '<RegionType>'
GROUP BY r.RegionName, can.Party
ORDER BY r.RegionName, can.Party"""

PreviousRegionSharesQuery = PreviousRegionSharesQuery.replace("<RegionType>",PollType)

In [12]:
# Get from database all of the vote shares for the poll regions from the previous election
PreviousRegionShares_df = pd.DataFrame(columns=["RegionName","Party","TotalVotes","RawPreviousShare","PreviousCandidates","CurrentCandidates"])

# Loop through all regions applicable to this particular poll
for Region in RegionsList:
    # Run query for the raw previous region shares
    ModRegionSharesQuery = PreviousRegionSharesQuery.replace("<RegionName>",Region)
    IndRegionShares_df = pd.read_sql(ModRegionSharesQuery,conn)  
    
    # Join this particular region's numbers to the overall dataframe for this poll
    PreviousRegionShares_df = pd.concat([PreviousRegionShares_df,IndRegionShares_df],axis=0)

# Reset the index column
PreviousRegionShares_df.reset_index(drop=True,inplace=True)

# Replace all 0 values with 1000 to ensure the new vote share can be calculated without dividing by zero
PreviousRegionShares_df['PreviousCandidates'].replace(0,1000,inplace=True)

#Calculated the adjusted previous share based on the number of candidates actually standing
PreviousRegionShares_df['PreviousShare'] = PreviousRegionShares_df['RawPreviousShare'] * PreviousRegionShares_df['CurrentCandidates']/PreviousRegionShares_df['PreviousCandidates']

# Revert previous candidates back to zero
PreviousRegionShares_df['PreviousCandidates'].replace(1000,0,inplace=True)

PreviousRegionShares_df.tail()

Unnamed: 0,RegionName,Party,TotalVotes,RawPreviousShare,PreviousCandidates,CurrentCandidates,PreviousShare
30,South West Wales,Lab,139028,0.472529,8,8,0.472529
31,South West Wales,LD,14336,0.048725,8,8,0.048725
32,South West Wales,Other,1392,0.004731,2,8,0.018925
33,South West Wales,PC,20601,0.070019,8,8,0.070019
34,South West Wales,Reform,22132,0.075222,8,8,0.075222


In [13]:
# Compare previous election votes shares to poll votes shares to determine swings
# Check the 2 extracted dataframes match
if PollRegionShares_df[['RegionName','Party']].equals(PreviousRegionShares_df[['RegionName','Party']]) == True:
    PollRegionShares_df['PreviousShare'] = PreviousRegionShares_df['PreviousShare'].values
    PollRegionShares_df['Swing'] = PollRegionShares_df['PollShare'] - PollRegionShares_df['PreviousShare']
else:
    print("NO MATCH!")

In [14]:
PollRegionShares_df.tail()

Unnamed: 0,PollDetailsID,RegionName,Party,PollShare,PreviousShare,Swing
30,20200407YouGovWalesRegionAllSouth West WalesLab,South West Wales,Lab,0.38,0.472529,-0.092529
31,20200407YouGovWalesRegionAllSouth West WalesLD,South West Wales,LD,0.04,0.048725,-0.008725
32,20200407YouGovWalesRegionAllSouth West WalesOther,South West Wales,Other,0.0,0.018925,-0.018925
33,20200407YouGovWalesRegionAllSouth West WalesPC,South West Wales,PC,0.15,0.070019,0.079981
34,20200407YouGovWalesRegionAllSouth West WalesRe...,South West Wales,Reform,0.01,0.075222,-0.065222


In [15]:
# Create PollAnalysisMeta details for inserting into database
PollAnalysisMeta_df = pd.DataFrame(columns=["PollID","PollAnalysisDate","PollAnalysisAlgorithm"])

PollAnalysisMeta_df.at[0,"PollID"] = PollID
PollAnalysisMeta_df.at[0,"PollAnalysisAlgorithm"] = PollAnalysisAlgorithm

# The date of the analysis is always today's date
PollAnalysisMeta_df.at[0,"PollAnalysisDate"] = datetime.date.today()

PollAnalysisMeta_df.to_sql('PollAnalysisMeta', conn, if_exists='append', index=False)

1

In [16]:
# Initial poll analysis values are now inserted into the database to allow these to be queried for the constituency shares
PollAnalysisRegions_df = PollRegionShares_df[['PollDetailsID','Swing']]

# Get the recently inserted PollAnalysis ID from the database
PollAnalysisIDQuery = "SELECT PollAnalysisID FROM PollAnalysisMeta WHERE PollID = '<PollID>'"
PollAnalysisIDQuery = PollAnalysisIDQuery.replace("<PollID>",PollID)

PollAnalysisID = [i[0] for i in engine.execute(PollAnalysisIDQuery)][0]
PollAnalysisRegions_df['PollAnalysisID'] = PollAnalysisID

PollAnalysisRegions_df.to_sql('PollAnalysisRegions', conn, if_exists='append', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PollAnalysisRegions_df['PollAnalysisID'] = PollAnalysisID


-1

In [17]:
# Query for pulling out previous election shares for applicable constituencies
CandidatesQuery = """SELECT can.CandidateID, r.RegionName, can.Constituency, can.Party, can.PreviousShare
FROM Candidates AS can
INNER JOIN Constituencies as con ON con.ConstituencyName = can.Constituency
INNER JOIN RegionConstituencies AS rc ON  rc.ConstituencyName = con.ConstituencyName
INNER JOIN Regions AS r ON r.RegionName = rc.RegionName
INNER JOIN RegionRegionTypes AS rrt ON rrt.RegionName = r.RegionName
WHERE r.RegionName = '<RegionName>' AND rrt.RegionType = '<RegionType>'
AND can.CurrentStanding = 1
ORDER BY can.Constituency, can.Party"""

CandidatesQuery = CandidatesQuery.replace("<RegionType>",PollType)

In [18]:
# Query for getting applicable swings for current party and region
SwingQuery = """SELECT par.PollAnalysisRegionID, par.Swing FROM PollAnalysisRegions AS par
INNER JOIN PollAnalysisMeta AS pam ON pam.PollAnalysisID = par.PollAnalysisID
INNER JOIN PollDetails AS pd ON pd.PollDetailsID = par.PollDetailsID
WHERE pd.Party = '<Party>' AND pd.RegionName = '<RegionName>'
AND pam.PollAnalysisID = '<PollAnalysisID>'"""

SwingQuery = SwingQuery.replace('<PollAnalysisID>',PollAnalysisID)

In [19]:
# Calculate the swings for every candidate in every constituency
ConstituencyShares_df = pd.DataFrame(columns=["CandidateID","RegionName","Constituency","Party","PreviousShare","Swing","NewShareRaw","VoteShare"])
IndConstituencyShares_df = pd.DataFrame(columns=["CandidateID","RegionName","Constituency","Party","PreviousShare","Swing","NewShareRaw","VoteShare"])

# Need to cycle through every region to pull out the candidates for each in turn, then get the swing for each
for Region in RegionsList:
    
    ModCandidatesQuery = CandidatesQuery.replace("<RegionName>",Region)
    IndConstituencyShares_df = pd.read_sql(ModCandidatesQuery,conn)
    
    RegionSwingQuery = SwingQuery.replace("<RegionName>",Region)
    
    # Now cycle through each row of the IndConstituencyShares_df to get the relevant swings
    for i in range(0,len(IndConstituencyShares_df)):
        Party = IndConstituencyShares_df.iloc[i]["Party"]
        PartySwingQuery = RegionSwingQuery.replace("<Party>",Party)
        Swing = [i[1] for i in engine.execute(PartySwingQuery)][0]
        PollAnalysisRegionID = [i[0] for i in engine.execute(PartySwingQuery)][0]
        IndConstituencyShares_df.at[i,"Swing"] = Swing
        IndConstituencyShares_df.at[i,"PollAnalysisRegionID"] = PollAnalysisRegionID
    

    ConstituencyShares_df = pd.concat([ConstituencyShares_df,IndConstituencyShares_df],axis=0)
        
ConstituencyShares_df.reset_index(drop=True,inplace=True)

# Ensure the datatypes are numeric of the columns to be used in the calculation
ConstituencyShares_df["PreviousShare"] = pd.to_numeric(ConstituencyShares_df["PreviousShare"])
ConstituencyShares_df["Swing"] = pd.to_numeric(ConstituencyShares_df["Swing"])

ConstituencyShares_df["NewShareRaw"] = ConstituencyShares_df["PreviousShare"] + ConstituencyShares_df["Swing"] 
ConstituencyShares_df["NewShareRaw"] = np.where(ConstituencyShares_df["NewShareRaw"] < 0, 0,ConstituencyShares_df["NewShareRaw"])

# Determine the factor needed to ensure vote shares for each constituency sum to 1
ConstituencyShares_df['ConstRawShareTotals'] = ConstituencyShares_df['NewShareRaw'].groupby(ConstituencyShares_df['Constituency']).transform('sum')

# Modify the raw vote shares to ensure they sum to 1
ConstituencyShares_df['VoteShare'] = ConstituencyShares_df['NewShareRaw']/ConstituencyShares_df['ConstRawShareTotals']

ConstituencyShares_df['VoteShareCheck'] = ConstituencyShares_df['VoteShare'].groupby(ConstituencyShares_df['Constituency']).transform('sum')

ConstituencyShares_df.tail()

Unnamed: 0,CandidateID,RegionName,Constituency,Party,PreviousShare,Swing,NewShareRaw,VoteShare,PollAnalysisRegionID,ConstRawShareTotals,VoteShareCheck
271,Swansea WestLab,South West Wales,Swansea West,Lab,0.516132,-0.092529,0.423603,0.419313,20200407YouGovWalesRegionAllSouth West WalesLa...,1.010229,1.0
272,Swansea WestLD,South West Wales,Swansea West,LD,0.083533,-0.008725,0.074808,0.074051,20200407YouGovWalesRegionAllSouth West WalesLD...,1.010229,1.0
273,Swansea WestOther,South West Wales,Swansea West,Other,0.0,-0.018925,0.0,0.0,20200407YouGovWalesRegionAllSouth West WalesOt...,1.010229,1.0
274,Swansea WestPC,South West Wales,Swansea West,PC,0.055373,0.079981,0.135354,0.133983,20200407YouGovWalesRegionAllSouth West WalesPC...,1.010229,1.0
275,Swansea WestReform,South West Wales,Swansea West,Reform,0.055345,-0.065222,0.0,0.0,20200407YouGovWalesRegionAllSouth West WalesRe...,1.010229,1.0


In [20]:
# Create the dataframe for insertion into the database and insert
PollAnalysisConstituencies_df = ConstituencyShares_df[['PollAnalysisRegionID','CandidateID','VoteShare']]
PollAnalysisConstituencies_df.to_sql('PollAnalysisConstituencies', conn, if_exists='append', index=False)

-1

In [21]:
#Close the connection with the database
conn.close()