Example jupyter notebook that demonstrates:
- Connecting to a Microsoft SQL Server
- Creating a sample data table in the database
- Web scraping data from a Wikipedia table
- Comparing scraped data with test data to identify new items

In [1]:
# Load SQL Magic
%load_ext sql

In [2]:
# Import required modules
import pandas as pd
import pyodbc
import sqlalchemy
import urllib
import requests
from bs4 import BeautifulSoup
from datetime import datetime

In [3]:
# Establish database connection
connection_str = "DRIVER={SQL SERVER};SERVER=LAPTOP-BJU2VQE0\SIMPLESERVER;DATABASE=UK_General_Election;TRUSTED_CONNECTION=YES"
connection_str_quoted = urllib.parse.quote_plus(connection_str)
connection_uri = 'mssql+pyodbc:///?odbc_connect={}'.format(connection_str_quoted)

%sql {connection_uri}

In [4]:
# Test Connection
%sql select * from PollTypes

 * mssql+pyodbc:///?odbc_connect=DRIVER%3D%7BSQL+SERVER%7D%3BSERVER%3DLAPTOP-BJU2VQE0%5CSIMPLESERVER%3BDATABASE%3DUK_General_Election%3BTRUSTED_CONNECTION%3DYES
Done.


PollType
Constituency
GB
ITL1Region
Ugipsos
UK


In [5]:
# Delete test data table in case it already exists
%sql DROP TABLE PollMetaData_WalesTest

 * mssql+pyodbc:///?odbc_connect=DRIVER%3D%7BSQL+SERVER%7D%3BSERVER%3DLAPTOP-BJU2VQE0%5CSIMPLESERVER%3BDATABASE%3DUK_General_Election%3BTRUSTED_CONNECTION%3DYES
Done.


[]

In [6]:
%%sql

CREATE TABLE PollMetaData_WalesTest
    (
    PollID AS CONVERT(CHAR(8),PollDate,112) + '_' + Pollster + '_' + PollType + '_' + PollScope PERSISTED PRIMARY KEY,
    PollDate Date NOT NULL,
    Pollster VARCHAR(50) FOREIGN KEY REFERENCES Pollsters(Pollster) NOT NULL,
    PollType VARCHAR(50) FOREIGN KEY REFERENCES PollTypes(PollType) NOT NULL,
    PollScope VARCHAR(50) FOREIGN KEY REFERENCES PollScopes(PollScope) NOT NULL,
    SampleSize INT
    )

 * mssql+pyodbc:///?odbc_connect=DRIVER%3D%7BSQL+SERVER%7D%3BSERVER%3DLAPTOP-BJU2VQE0%5CSIMPLESERVER%3BDATABASE%3DUK_General_Election%3BTRUSTED_CONNECTION%3DYES
Done.


[]

Create test data in the newly created table. This data includes non-Welsh polls - to be filtered out - and missing polls for the code to identify differences.

In [7]:
%%sql
INSERT INTO PollMetaData_WalesTest (Pollster, PollDate, PollType, PollScope, SampleSize) VALUES
    ('YouGov', '2021-12-16', 'ITL1Region', 'Wales', '1009'),
    ('YouGov', '2021-09-16', 'ITL1Region', 'Wales', '1071'),
    ('YouGov', '2021-05-04', 'ITL1Region', 'Wales', '1071'),
    ('YouGov', '2021-04-21', 'ITL1Region', 'Wales', '1142'),
    ('YouGov', '2021-03-19', 'ITL1Region', 'Wales', '1174'),
    ('YouGov', '2021-01-14', 'ITL1Region', 'Wales', '1018'),
    ('YouGov', '2020-10-30', 'ITL1Region', 'Wales', '1013'),
    ('YouGov', '2020-09-04', 'ITL1Region', 'Wales', '1110'),
    ('YouGov', '2020-06-01', 'ITL1Region', 'Wales', '1021'),
    ('YouGov', '2020-04-07', 'ITL1Region', 'Wales', '1008'),
    ('YouGov', '2020-01-26', 'ITL1Region', 'Wales', '1037'),
    ('Retfield and Wilton', '2022-05-22', 'GB', 'All', '2000'),
    ('Techne UK', '2022-05-19', 'GB', 'All', '1635')

 * mssql+pyodbc:///?odbc_connect=DRIVER%3D%7BSQL+SERVER%7D%3BSERVER%3DLAPTOP-BJU2VQE0%5CSIMPLESERVER%3BDATABASE%3DUK_General_Election%3BTRUSTED_CONNECTION%3DYES
13 rows affected.


[]

In [8]:
%sql select * from PollMetaData_WalesTest

 * mssql+pyodbc:///?odbc_connect=DRIVER%3D%7BSQL+SERVER%7D%3BSERVER%3DLAPTOP-BJU2VQE0%5CSIMPLESERVER%3BDATABASE%3DUK_General_Election%3BTRUSTED_CONNECTION%3DYES
Done.


PollID,PollDate,Pollster,PollType,PollScope,SampleSize
20200126_YouGov_ITL1Region_Wales,2020-01-26,YouGov,ITL1Region,Wales,1037
20200407_YouGov_ITL1Region_Wales,2020-04-07,YouGov,ITL1Region,Wales,1008
20200601_YouGov_ITL1Region_Wales,2020-06-01,YouGov,ITL1Region,Wales,1021
20200904_YouGov_ITL1Region_Wales,2020-09-04,YouGov,ITL1Region,Wales,1110
20201030_YouGov_ITL1Region_Wales,2020-10-30,YouGov,ITL1Region,Wales,1013
20210114_YouGov_ITL1Region_Wales,2021-01-14,YouGov,ITL1Region,Wales,1018
20210319_YouGov_ITL1Region_Wales,2021-03-19,YouGov,ITL1Region,Wales,1174
20210421_YouGov_ITL1Region_Wales,2021-04-21,YouGov,ITL1Region,Wales,1142
20210504_YouGov_ITL1Region_Wales,2021-05-04,YouGov,ITL1Region,Wales,1071
20210916_YouGov_ITL1Region_Wales,2021-09-16,YouGov,ITL1Region,Wales,1071


In [9]:
%%sql db_data <<
select * from PollMetaData_WalesTest where PollScope = 'Wales'

 * mssql+pyodbc:///?odbc_connect=DRIVER%3D%7BSQL+SERVER%7D%3BSERVER%3DLAPTOP-BJU2VQE0%5CSIMPLESERVER%3BDATABASE%3DUK_General_Election%3BTRUSTED_CONNECTION%3DYES
Done.
Returning data to local variable db_data


In [10]:
db_data = db_data.DataFrame()
db_data

Unnamed: 0,PollID,PollDate,Pollster,PollType,PollScope,SampleSize
0,20200126_YouGov_ITL1Region_Wales,2020-01-26,YouGov,ITL1Region,Wales,1037
1,20200407_YouGov_ITL1Region_Wales,2020-04-07,YouGov,ITL1Region,Wales,1008
2,20200601_YouGov_ITL1Region_Wales,2020-06-01,YouGov,ITL1Region,Wales,1021
3,20200904_YouGov_ITL1Region_Wales,2020-09-04,YouGov,ITL1Region,Wales,1110
4,20201030_YouGov_ITL1Region_Wales,2020-10-30,YouGov,ITL1Region,Wales,1013
5,20210114_YouGov_ITL1Region_Wales,2021-01-14,YouGov,ITL1Region,Wales,1018
6,20210319_YouGov_ITL1Region_Wales,2021-03-19,YouGov,ITL1Region,Wales,1174
7,20210421_YouGov_ITL1Region_Wales,2021-04-21,YouGov,ITL1Region,Wales,1142
8,20210504_YouGov_ITL1Region_Wales,2021-05-04,YouGov,ITL1Region,Wales,1071
9,20210916_YouGov_ITL1Region_Wales,2021-09-16,YouGov,ITL1Region,Wales,1071


In [11]:
url = "https://en.wikipedia.org/wiki/Opinion_polling_for_the_next_United_Kingdom_general_election"

# Get the contents of the webpage in text format and store in a variable called data
data  = requests.get(url).text

# Parse using BeautifulSoup to make the data more manageable
soup = BeautifulSoup(data,"html.parser")

# Find all the tables in the page
tables = soup.find_all('table')

# The Welsh polling table uses the Welsh party names in the table headings, unlike the other tables on the page
for index,thead in enumerate(tables):
    if ("Welsh Labour" in str(thead)):
        table_index = index
        
print(table_index)

8


In [12]:
# Create a dataframe with the content from the indexed table
# Only want Date, Pollster and SampleSize

raw_web_data = pd.DataFrame(columns=["Pollster", "SampleSize", "Date"])

ignore_rows = ['2019 general election','Election to the Senedd[10]','Andrew RT Davies becomes leader of the Welsh Conservatives[22]']

for row in tables[table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != [] and col[1].text.strip() not in ignore_rows):
        rawdate = col[0].text.strip()     

        # Modify the raw date into a usuable date form
        if rawdate[len(rawdate)-11] == ' ' or rawdate[len(rawdate)-11] == '–':
            calcdate = rawdate[len(rawdate)-10:len(rawdate)]
        else:
            calcdate = rawdate[len(rawdate)-11:len(rawdate)]
        
        calcdate = datetime.strptime(calcdate.replace(" ",""), '%d%b%Y')        
        calcdate = datetime.strftime(calcdate, '%Y%m%d')        

        pollster = col[1].text.strip()        
        samplesize = int((col[3].text.strip())[0:5].replace(",",""))
        
        raw_web_data = raw_web_data.append({"Pollster":pollster, "SampleSize":samplesize, "Date":calcdate}, ignore_index=True)

raw_web_data

  raw_web_data = raw_web_data.append({"Pollster":pollster, "SampleSize":samplesize, "Date":calcdate}, ignore_index=True)
  raw_web_data = raw_web_data.append({"Pollster":pollster, "SampleSize":samplesize, "Date":calcdate}, ignore_index=True)
  raw_web_data = raw_web_data.append({"Pollster":pollster, "SampleSize":samplesize, "Date":calcdate}, ignore_index=True)
  raw_web_data = raw_web_data.append({"Pollster":pollster, "SampleSize":samplesize, "Date":calcdate}, ignore_index=True)
  raw_web_data = raw_web_data.append({"Pollster":pollster, "SampleSize":samplesize, "Date":calcdate}, ignore_index=True)
  raw_web_data = raw_web_data.append({"Pollster":pollster, "SampleSize":samplesize, "Date":calcdate}, ignore_index=True)
  raw_web_data = raw_web_data.append({"Pollster":pollster, "SampleSize":samplesize, "Date":calcdate}, ignore_index=True)
  raw_web_data = raw_web_data.append({"Pollster":pollster, "SampleSize":samplesize, "Date":calcdate}, ignore_index=True)
  raw_web_data = raw_web_data.ap

Unnamed: 0,Pollster,SampleSize,Date
0,YouGov,1086,20220301
1,YouGov,1009,20211216
2,YouGov,1071,20210916
3,YouGov,1071,20210504
4,YouGov,1142,20210421
5,Opinium,2005,20210419
6,YouGov,1174,20210319
7,YouGov,1018,20210114
8,YouGov,1013,20201030
9,YouGov,1110,20200904


In [13]:
# Create the PollID field to enable comparison with the database test data
raw_web_data['PollID'] = raw_web_data['Date'] + '_' + raw_web_data['Pollster'] + '_' + 'ITL1Region' + '_' + 'Wales'
raw_web_data

Unnamed: 0,Pollster,SampleSize,Date,PollID
0,YouGov,1086,20220301,20220301_YouGov_ITL1Region_Wales
1,YouGov,1009,20211216,20211216_YouGov_ITL1Region_Wales
2,YouGov,1071,20210916,20210916_YouGov_ITL1Region_Wales
3,YouGov,1071,20210504,20210504_YouGov_ITL1Region_Wales
4,YouGov,1142,20210421,20210421_YouGov_ITL1Region_Wales
5,Opinium,2005,20210419,20210419_Opinium_ITL1Region_Wales
6,YouGov,1174,20210319,20210319_YouGov_ITL1Region_Wales
7,YouGov,1018,20210114,20210114_YouGov_ITL1Region_Wales
8,YouGov,1013,20201030,20201030_YouGov_ITL1Region_Wales
9,YouGov,1110,20200904,20200904_YouGov_ITL1Region_Wales


In [14]:
# Compare the two dataframes for any differences
# PollID is compared as this is the unique reference that identifies a poll
# The deliberate differences between the Wiki data and the test data are a YouGov poll from 1/3/22 and an Opinium poll 19/4/21
if db_data['PollID'].equals(raw_web_data['PollID']) == True:
    print('No new polls found!')
else:
    df_diff = pd.concat([db_data['PollID'],raw_web_data['PollID']]).drop_duplicates(keep=False)
    print('The following polls that are not currently in the database have been found:')
    print(df_diff)

The following polls that are not currently in the database have been found:
0     20220301_YouGov_ITL1Region_Wales
5    20210419_Opinium_ITL1Region_Wales
Name: PollID, dtype: object
