Jupyter notebook that looks at the polls currently in the model and compares them to the polls identified on the Wikipedia Opinion Polling Page

In [1]:
#Import required packages
import urllib
import urllib.parse
from datetime import datetime

import pandas as pd
import pyodbc
import requests
import sqlalchemy
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

In [2]:
# URL for UK general election polling
url = "https://en.wikipedia.org/wiki/Opinion_polling_for_the_next_United_Kingdom_general_election"

# Get the contents of the webpage in text format and store in a variable called data.
# The timeout stops the cell hanging indefinitely on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page be parsed as if it were the polling page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

# Parse using BeautifulSoup to make the data more manageable
soup = BeautifulSoup(data, "html.parser")

# Find all the tables in the page
tables = soup.find_all('table')

# Start every index as None so that a table which cannot be found is reported
# with a clear message below rather than as a NameError on the print statement.
uk_table_index = None
london_table_index = None
scotland_table_index = None
wales_table_index = None

for index, table in enumerate(tables):
    table_html = str(table)

    # The UK polling table uses the UK party names in the table headings.
    # Multiple tables contain this text, so only the FIRST match is kept.
    if uk_table_index is None and "Conservative Party (UK)" in table_html:
        uk_table_index = index

    # For the regional tables the LAST table containing the marker text wins
    if "London Labour" in table_html:
        london_table_index = index

    if "Scottish Conservatives" in table_html:
        scotland_table_index = index

    if "Welsh Labour" in table_html:
        wales_table_index = index

missing_tables = [
    region
    for region, table_index in (
        ("UK", uk_table_index),
        ("London", london_table_index),
        ("Scotland", scotland_table_index),
        ("Wales", wales_table_index),
    )
    if table_index is None
]
if missing_tables:
    raise LookupError("Could not find a polling table for: " + ", ".join(missing_tables))

print("UK:",uk_table_index,"London:",london_table_index,"Scotland:",scotland_table_index,"Wales:",wales_table_index)

UK: 1 London: 7 Scotland: 8 Wales: 9


In [3]:
# Create a dataframe with the content from the indexed table
# Only want Date, Pollster and SampleSize

# The UK table's date cells are parsed without a year, so one is appended before
# parsing. NOTE(review): this must be updated by hand when the table covers a
# different year.
uk_poll_year = '2023'

# Collect one record per poll and build the dataframe once at the end; growing a
# dataframe with pd.concat inside the loop re-copies every earlier row each time.
uk_rows = []

for row in tables[uk_table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if len(col) >= 4:
        rawdate = col[0].text.strip()

        # Modify the raw date into a usable date form by keeping only the
        # trailing 'D Mon' (5 characters) or 'DD Mon' (6 characters) portion.
        if len(rawdate) <= 6:
            calcdate = rawdate
        elif rawdate[-6] in (' ', '–'):
            # A separator six characters from the end means a single-digit day
            calcdate = rawdate[-5:]
        else:
            calcdate = rawdate[-6:]

        calcdate = calcdate + uk_poll_year
        calcdate = datetime.strptime(calcdate.replace(" ", ""), '%d%b%Y')
        calcdate = datetime.strftime(calcdate, '%Y%m%d')

        pollster = col[1].text.strip()

        # Fall back to 0 when the sample-size cell is absent (IndexError) or does
        # not start with a number (ValueError). Other exceptions are not hidden.
        try:
            samplesize = int(col[4].text.strip()[0:5].replace(",", ""))
        except (IndexError, ValueError):
            samplesize = 0

        uk_rows.append({"Pollster": pollster, "SampleSize": samplesize, "Date": calcdate})

# Passing columns= keeps the expected columns even when no rows were found
uk_web_data = pd.DataFrame(uk_rows, columns=["Pollster", "SampleSize", "Date"])
uk_web_data.tail()

Unnamed: 0,Pollster,SampleSize,Date
134,Omnisis,1285,20230106
135,YouGov,1709,20230105
136,Techne,1625,20230105
137,PeoplePolling,1209,20230104
138,Redfield & Wilton,2000,20230103


In [4]:
# Create a dataframe with the content from the indexed table
# Only want Date, Pollster and SampleSize

# Collect one record per poll and build the dataframe once at the end; growing a
# dataframe with pd.concat inside the loop re-copies every earlier row each time.
lon_rows = []

for row in tables[london_table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if len(col) >= 10:
        rawdate = col[0].text.strip()

        # Modify the raw date into a usable date form by keeping only the
        # trailing 'D Mon YYYY' (10 characters) or 'DD Mon YYYY' (11 characters).
        if len(rawdate) < 11:
            # Nothing precedes the date. Without this guard the index
            # len(rawdate)-11 goes negative and wraps round to the wrong
            # character, so a value such as '6 Mar 2020' was cut down to '0'.
            calcdate = rawdate
        elif rawdate[-11] in (' ', '–'):
            # A separator eleven characters from the end means a single-digit day
            calcdate = rawdate[-10:]
        else:
            calcdate = rawdate[-11:]

        calcdate = datetime.strptime(calcdate.replace(" ", ""), '%d%b%Y')
        calcdate = datetime.strftime(calcdate, '%Y%m%d')

        pollster = col[1].text.strip()

        # Fall back to 0 when the sample-size cell is absent (IndexError) or does
        # not start with a number (ValueError). Other exceptions are not hidden.
        try:
            samplesize = int(col[3].text.strip()[0:5].replace(",", ""))
        except (IndexError, ValueError):
            samplesize = 0

        lon_rows.append({"Pollster": pollster, "SampleSize": samplesize, "Date": calcdate})

# Passing columns= keeps the expected columns even when no rows were found
lon_web_data = pd.DataFrame(lon_rows, columns=["Pollster", "SampleSize", "Date"])
lon_web_data.tail()

Unnamed: 0,Pollster,SampleSize,Date
12,YouGov,1192,20201119
13,Redfield & Wilton,2000,20201017
14,Redfield & Wilton,2000,20200908
15,Redfield & Wilton,2500,20200807
16,YouGov,1002,20200306


In [5]:
# Create a dataframe with the content from the indexed table
# Only want Date, Pollster and SampleSize

# Collect one record per poll and build the dataframe once at the end; growing a
# dataframe with pd.concat inside the loop re-copies every earlier row each time.
scot_rows = []

for row in tables[scotland_table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if len(col) >= 11:
        rawdate = col[0].text.strip()

        # Modify the raw date into a usable date form by keeping only the
        # trailing 'D Mon YYYY' (10 characters) or 'DD Mon YYYY' (11 characters).
        if len(rawdate) < 11:
            # Nothing precedes the date. Without this guard the index
            # len(rawdate)-11 goes negative and wraps round to the wrong
            # character, so a value such as '6 Mar 2020' was cut down to '0'.
            calcdate = rawdate
        elif rawdate[-11] in (' ', '–'):
            # A separator eleven characters from the end means a single-digit day
            calcdate = rawdate[-10:]
        else:
            calcdate = rawdate[-11:]

        calcdate = datetime.strptime(calcdate.replace(" ", ""), '%d%b%Y')
        calcdate = datetime.strftime(calcdate, '%Y%m%d')

        pollster = col[1].text.strip()

        # Fall back to 0 when the sample-size cell is absent (IndexError) or does
        # not start with a number (ValueError). Other exceptions are not hidden.
        try:
            samplesize = int(col[3].text.strip()[0:5].replace(",", ""))
        except (IndexError, ValueError):
            samplesize = 0

        scot_rows.append({"Pollster": pollster, "SampleSize": samplesize, "Date": calcdate})

# Passing columns= keeps the expected columns even when no rows were found
scot_web_data = pd.DataFrame(scot_rows, columns=["Pollster", "SampleSize", "Date"])
scot_web_data.tail()

Unnamed: 0,Pollster,SampleSize,Date
66,Panelbase,1026,20200703
67,Panelbase,1022,20200605
68,Panelbase,1086,20200505
69,YouGov,1095,20200427
70,Panelbase,1023,20200326


In [6]:
# Create a dataframe with the content from the indexed table
# Only want Date, Pollster and SampleSize

# Collect one record per poll and build the dataframe once at the end; growing a
# dataframe with pd.concat inside the loop re-copies every earlier row each time.
wales_rows = []

for row in tables[wales_table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if len(col) >= 12:
        rawdate = col[0].text.strip()

        # Modify the raw date into a usable date form by keeping only the
        # trailing 'D Mon YYYY' (10 characters) or 'DD Mon YYYY' (11 characters).
        if len(rawdate) < 11:
            # Nothing precedes the date. Without this guard the index
            # len(rawdate)-11 goes negative and wraps round to the wrong
            # character, so a value such as '6 Mar 2020' was cut down to '0'.
            calcdate = rawdate
        elif rawdate[-11] in (' ', '–'):
            # A separator eleven characters from the end means a single-digit day
            calcdate = rawdate[-10:]
        else:
            calcdate = rawdate[-11:]

        calcdate = datetime.strptime(calcdate.replace(" ", ""), '%d%b%Y')
        calcdate = datetime.strftime(calcdate, '%Y%m%d')

        pollster = col[1].text.strip()

        # Fall back to 0 when the sample-size cell is absent (IndexError) or does
        # not start with a number (ValueError). Other exceptions are not hidden.
        try:
            samplesize = int(col[3].text.strip()[0:5].replace(",", ""))
        except (IndexError, ValueError):
            samplesize = 0

        wales_rows.append({"Pollster": pollster, "SampleSize": samplesize, "Date": calcdate})

# Passing columns= keeps the expected columns even when no rows were found
wales_web_data = pd.DataFrame(wales_rows, columns=["Pollster", "SampleSize", "Date"])
wales_web_data.tail()

Unnamed: 0,Pollster,SampleSize,Date
16,YouGov Archived 3 November 2020 at the Wayback...,1013,20201030
17,YouGov,1110,20200904
18,YouGov,1021,20200601
19,YouGov,1008,20200407
20,YouGov,1037,20200126


In [7]:
# Connect to database 'UK_General_Election' using SQLAlchemy
# NOTE(review): the server name is specific to one machine; change it here when
# running the notebook elsewhere.
connection_str = "DRIVER={SQL SERVER};SERVER=DANZPOOTA;DATABASE=UK_General_Election;TRUSTED_CONNECTION=YES"

# The raw ODBC string contains characters such as ';', '=' and '{' that must be
# URL-encoded before it is embedded in the SQLAlchemy URL. quote_plus lives in
# the urllib.parse submodule, which the imports cell should load explicitly.
params = urllib.parse.quote_plus(connection_str)
engine_url = 'mssql+pyodbc:///?odbc_connect=' + params

engine = create_engine(engine_url)
conn = engine.connect()

In [8]:
# Close the database connection held in `conn` now that it is no longer needed
conn.close()