In [1]:
# import the libraries
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# assign a path for data files
# NOTE: this stays a plain string WITH a trailing slash because the export
# cells build file names by direct concatenation (f'{RESULT_DIR}<name>.csv').
RESULT_DIR = "../../data/processed/"

In [2]:
# scrape for the polling data
POLLS_URL = 'https://en.wikipedia.org/wiki/Statewide_opinion_polling_for_the_2020_United_States_presidential_election'

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page be parsed as if it were the polling article.
polls_res = requests.get(POLLS_URL, timeout=30)
polls_res.raise_for_status()
polls_soup = BeautifulSoup(polls_res.content, 'lxml')

In [3]:
def _read_tables(soup, css_class):
    """Parse every <table> found by soup.find_all('table', {'class': css_class}).

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page to search.
    css_class : str
        Value handed to BeautifulSoup as the ``class`` filter.

    Returns
    -------
    list of pandas.DataFrame
        One frame per matching table, in document order (the first frame that
        ``pd.read_html`` produces for each table).
    """
    # find_all is called once per class; the table markup is wrapped in
    # StringIO because newer pandas deprecates passing literal HTML strings.
    return [
        pd.read_html(StringIO(str(table)))[0]
        for table in soup.find_all('table', {'class': css_class})
    ]


# create a list of dataframes for the non-collapsible tables
dfs_notcollapsed = _read_tables(polls_soup, 'wikitable sortable')

# create a list of dataframes for the collapsible tables
dfs_collapsed = _read_tables(polls_soup, 'wikitable sortable mw-collapsible mw-collapsed')

In [4]:
# add the years for the states with missing years
DATE_COL = 'Date(s)administered'


def _append_year(frame, year):
    """Append ', <year>' to each date string in ``frame[DATE_COL]``, in place.

    Dates that already end with that exact suffix are left alone, so running
    this cell a second time does not produce 'Oct 1, 2020, 2020'.
    Missing values stay missing.
    """
    suffix = f', {year}'
    dates = frame[DATE_COL]
    already_tagged = dates.str.endswith(suffix, na=False)
    frame[DATE_COL] = dates.where(already_tagged, dates + suffix)


# (table, year to append); the list positions are tied to the page layout at
# scrape time.
_tables_missing_year = [
    (dfs_notcollapsed[5], 2020),    # AZ
    (dfs_collapsed[0], 2019),       # AZ
    (dfs_notcollapsed[43], 2020),   # MI
    (dfs_collapsed[7], 2020),       # OH
    (dfs_collapsed[8], 2020),       # OH
    (dfs_notcollapsed[75], 2020),   # PA
    (dfs_notcollapsed[96], 2020),   # WI
    (dfs_collapsed[12], 2019),      # WI
]

for _frame, _year in _tables_missing_year:
    _append_year(_frame, _year)

In [5]:
# assign each dataframe(s) to each state
# The numeric positions below index into the scraped table lists, so they are
# only valid for the page layout that was scraped.
def _stack(*frames):
    """Concatenate the given poll tables row-wise under a fresh 0..n-1 index."""
    return pd.concat(list(frames), ignore_index = True)


_open = dfs_notcollapsed
_folded = dfs_collapsed

AL = _open[1]
AK = _open[3]
AZ = _stack(_open[5], _folded[0])
AR = _open[7]
CA = _open[9]
CO = _open[11]
CT = _open[13]
DE = _open[15]
DC = _open[17]
FL = _folded[1]
GA = _folded[2]
HI = _open[21]
ID = _open[23]
IL = _open[25]
IN = _open[27]
IA = _open[29]
KS = _open[31]
KY = _open[33]
LA = _open[35]
ME = _open[37]
MD = _open[39]
MA = _open[41]
MI = _stack(_open[43], _folded[3])
MN = _open[45]
MS = _open[47]
MO = _folded[4]
MT = _open[50]
NE = _open[52]
NV = _open[56]
NH = _open[58]
NJ = _open[60]
NM = _open[62]
NY = _open[64]
NC = _stack(_open[66], _folded[5], _folded[6])
ND = _open[68]
OH = _stack(_folded[7], _folded[8], _folded[9])
OK = _open[71]
OR = _open[73]
PA = _stack(_open[75], _folded[10])
RI = _open[77]
SC = _open[79]
SD = _open[81]
TN = _open[83]
TX = _folded[11]
UT = _open[86]
VT = _open[88]
VA = _open[90]
WA = _open[92]
WV = _open[94]
WI = _stack(_open[96], _folded[12])
WY = _open[98]

In [6]:
# collect the per-state dataframes into one list, kept in state order
dfs = [
    AL, AK, AZ, AR, CA, CO, CT, DE, DC, FL,
    GA, HI, ID, IL, IN, IA, KS, KY, LA, ME,
    MD, MA, MI, MN, MS, MO, MT, NE, NV, NH,
    NJ, NM, NY, NC, ND, OH, OK, OR, PA, RI,
    SC, SD, TN, TX, UT, VT, VA, WA, WV, WI,
    WY,
]

In [7]:
# find the state abbreviations
STATES_URL = 'https://www.ssa.gov/international/coc-docs/states.html'
STATES_HEADING = 'Two-Letter State Abbreviations'
# rows of the abbreviation table that are dropped before `state` is built
EXCLUDED_ROWS = ['AMERICAN SAMOA', 'GUAM', 'NORTHERN MARIANA IS', 'PUERTO RICO', 'VIRGIN ISLANDS']

# Fail fast on a stalled connection or an HTTP error page.
state_res = requests.get(STATES_URL, timeout=30)
state_res.raise_for_status()
state_soup = BeautifulSoup(state_res.content, 'lxml')

# The table is located relative to its heading text; report a clear error if
# the page no longer contains that heading instead of failing on None.
state_heading = state_soup.find(string = STATES_HEADING)
if state_heading is None:
    raise ValueError(f'heading {STATES_HEADING!r} not found at {STATES_URL}')
state_html = state_heading.find_next('table')

# Column 0 holds the names used for the exclusion filter, column 1 the
# abbreviations that are kept. The result is a Series with a fresh 0..n-1 index.
state_table = pd.read_html(StringIO(str(state_html)))[0]
state = state_table.loc[~state_table[0].isin(EXCLUDED_ROWS), 1].reset_index(drop = True)
state

0     AL
1     AK
2     AZ
3     AR
4     CA
5     CO
6     CT
7     DE
8     DC
9     FL
10    GA
11    HI
12    ID
13    IL
14    IN
15    IA
16    KS
17    KY
18    LA
19    ME
20    MD
21    MA
22    MI
23    MN
24    MS
25    MO
26    MT
27    NE
28    NV
29    NH
30    NJ
31    NM
32    NY
33    NC
34    ND
35    OH
36    OK
37    OR
38    PA
39    RI
40    SC
41    SD
42    TN
43    TX
44    UT
45    VT
46    VA
47    WA
48    WV
49    WI
50    WY
Name: 1, dtype: object

In [8]:
# derive the state, month, year, democratic percentage, republican percentage,
# party winner and margin columns for every state's polls, then stack them


def _poll_month(date_text):
    """Return the three-letter month prefix taken from a 'Date(s)administered' string.

    * 'Oct 30 – Nov 1, 2020' -> the text after the dash starts with a space, so
      the three characters following that space are used ('Nov').
    * 'Oct 27–28, 2020'      -> otherwise the first three characters before the
      dash are used ('Oct').
    * 'Released Oct 5, 2020' -> the second space-separated token is used.
    * anything else          -> the first space-separated token is used.
    """
    if '–' in date_text:
        parts = date_text.split('–')
        # startswith() also copes with an empty segment after the dash, where
        # indexing the first character would raise IndexError.
        if parts[1].startswith(' '):
            return parts[1][1:4]
        return parts[0][0:3]
    tokens = date_text.split(' ')
    if 'Released' in date_text:
        # truncated to three characters so it agrees with the other branches
        return tokens[1][0:3]
    return tokens[0][0:3]


def _poll_year(date_text):
    """Return the last space-separated token as an int, dropping any '[...]' suffix."""
    return int(date_text.split(' ')[-1].split('[')[0])


def _poll_percentage(cell):
    """Convert a cell such as '48%' or 48 to a float (text before the first '%')."""
    return float(str(cell).split('%')[0])


def _poll_winner(dem, gop):
    """Return 'D', 'R' or 'E' (equal) for a pair of percentages.

    Raises
    ------
    ValueError
        If the values cannot be ordered (for example NaN). Previously such a
        row was silently skipped, which surfaced later as a column
        length-mismatch error.
    """
    if dem > gop:
        return 'D'
    if dem < gop:
        return 'R'
    if dem == gop:
        return 'E'
    raise ValueError(f'cannot compare poll percentages: dem={dem!r}, gop={gop!r}')


for i, frame in enumerate(dfs):
    # The derived columns are added to the per-state frames themselves, so
    # they remain visible through `dfs` after this cell.
    dates = frame['Date(s)administered']
    frame['state'] = state[i]
    frame['month'] = [_poll_month(d) for d in dates]
    frame['year'] = [_poll_year(d) for d in dates]
    frame['dem_percentage'] = [_poll_percentage(v) for v in frame['JoeBidenDemocratic']]
    frame['gop_percentage'] = [_poll_percentage(v) for v in frame['DonaldTrumpRepublican']]
    pairs = list(zip(frame['dem_percentage'], frame['gop_percentage']))
    frame['winner'] = [_poll_winner(dem, gop) for dem, gop in pairs]
    frame['margin'] = [abs(dem - gop) for dem, gop in pairs]

# One concat at the end instead of growing polls_all inside the loop.
polls_all = pd.concat(dfs, ignore_index = True) if dfs else pd.DataFrame()

In [9]:
# keep only the derived columns
polls_dr = polls_all[['state', 'month', 'year', 'dem_percentage', 'gop_percentage', 'winner', 'margin']]

# save to csv; the output directory is created first so that a fresh checkout
# without data/processed/ does not fail on the write
Path(RESULT_DIR).mkdir(parents = True, exist_ok = True)
polls_dr.to_csv(f'{RESULT_DIR}2020_polls_data.csv', index = False)

# display the frame (the whole frame is shown, not just the first 5 rows)
polls_dr

Unnamed: 0,state,month,year,dem_percentage,gop_percentage,winner,margin
0,AL,Nov,2020,36.0,62.0,R,26.0
1,AL,Nov,2020,38.0,55.0,R,17.0
2,AL,Nov,2020,38.0,58.0,R,20.0
3,AL,Oct,2020,39.0,58.0,R,19.0
4,AL,Oct,2020,37.0,61.0,R,24.0
...,...,...,...,...,...,...,...
2095,WY,Oct,2020,31.0,68.0,R,37.0
2096,WY,Sep,2020,34.0,65.0,R,31.0
2097,WY,Aug,2020,25.0,74.0,R,49.0
2098,WY,Jul,2020,28.0,70.0,R,42.0


In [10]:
# classify each state: if its polls all name the same winner the state gets
# that winner's label, otherwise (a flip in the polls) it is a 'tossup'
def _label_state(abbr):
    """Return the single poll winner recorded for ``abbr``, or 'tossup'."""
    winners = polls_dr.loc[polls_dr['state'] == abbr, 'winner']
    if winners.nunique() != 1:
        return 'tossup'
    return winners.iloc[0]


classification = [_label_state(abbr) for abbr in state]

In [11]:
# one row per state with its classification
polls = pd.DataFrame(data = {'state': state, 'classification': classification})

# write the classification table to csv
classification_csv = f'{RESULT_DIR}2020_polls_classification.csv'
polls.to_csv(classification_csv, index = False)

polls

Unnamed: 0,state,classification
0,AL,R
1,AK,R
2,AZ,tossup
3,AR,R
4,CA,D
5,CO,D
6,CT,D
7,DE,D
8,DC,D
9,FL,tossup


In [12]:
# how many states are classified as tossups
polls['classification'].eq('tossup').sum()

16

In [16]:
# render the classification table as LaTeX source and print it
polls_latex = polls.to_latex(index = False)
print(polls_latex)

\begin{tabular}{ll}
\toprule
state & classification \\
\midrule
   AL &              R \\
   AK &              R \\
   AZ &         tossup \\
   AR &              R \\
   CA &              D \\
   CO &              D \\
   CT &              D \\
   DE &              D \\
   DC &              D \\
   FL &         tossup \\
   GA &         tossup \\
   HI &              D \\
   ID &              R \\
   IL &              D \\
   IN &              R \\
   IA &         tossup \\
   KS &              R \\
   KY &              R \\
   LA &              R \\
   ME &              D \\
   MD &              D \\
   MA &              D \\
   MI &         tossup \\
   MN &         tossup \\
   MS &              R \\
   MO &         tossup \\
   MT &         tossup \\
   NE &              R \\
   NV &         tossup \\
   NH &         tossup \\
   NJ &              D \\
   NM &         tossup \\
   NY &              D \\
   NC &         tossup \\
   ND &              R \\
   OH &         tossup \\


  print(polls.to_latex(index = False))


In [17]:
# keep the polls dated 2020, plus those dated Nov or Dec 2019
from_2020 = polls_dr['year'] == 2020
from_late_2019 = (polls_dr['year'] == 2019) & polls_dr['month'].isin(['Nov', 'Dec'])
polls_oneyear = polls_dr[from_2020 | from_late_2019].reset_index(drop = True)
polls_oneyear

Unnamed: 0,state,month,year,dem_percentage,gop_percentage,winner,margin
0,AL,Nov,2020,36.0,62.0,R,26.0
1,AL,Nov,2020,38.0,55.0,R,17.0
2,AL,Nov,2020,38.0,58.0,R,20.0
3,AL,Oct,2020,39.0,58.0,R,19.0
4,AL,Oct,2020,37.0,61.0,R,24.0
...,...,...,...,...,...,...,...
1979,WY,Oct,2020,31.0,68.0,R,37.0
1980,WY,Sep,2020,34.0,65.0,R,31.0
1981,WY,Aug,2020,25.0,74.0,R,49.0
1982,WY,Jul,2020,28.0,70.0,R,42.0


In [18]:
# signed margin: negative represents democratic, positive represents republican
# (rows whose winner is not 'D', i.e. 'R' or the tie label 'E', keep the margin
# as it is)
dem_leads = polls_oneyear['winner'] == 'D'
signed_margin = polls_oneyear['margin'].where(~dem_leads, -polls_oneyear['margin'])

# NOTE: polls_party_margin is another name for the same object as
# polls_oneyear (no copy is made), so the new column shows up under both names.
polls_party_margin = polls_oneyear
polls_party_margin['2020_polls_margin'] = signed_margin
polls_party_margin

Unnamed: 0,state,month,year,dem_percentage,gop_percentage,winner,margin,2020_polls_margin
0,AL,Nov,2020,36.0,62.0,R,26.0,26.0
1,AL,Nov,2020,38.0,55.0,R,17.0,17.0
2,AL,Nov,2020,38.0,58.0,R,20.0,20.0
3,AL,Oct,2020,39.0,58.0,R,19.0,19.0
4,AL,Oct,2020,37.0,61.0,R,24.0,24.0
...,...,...,...,...,...,...,...,...
1979,WY,Oct,2020,31.0,68.0,R,37.0,37.0
1980,WY,Sep,2020,34.0,65.0,R,31.0,31.0
1981,WY,Aug,2020,25.0,74.0,R,49.0,49.0
1982,WY,Jul,2020,28.0,70.0,R,42.0,42.0


In [19]:
# unweighted mean of the signed margin per state, states kept in first-seen order
margin_by_state = polls_party_margin.groupby('state', sort = False)[['2020_polls_margin']]
polls_mean = margin_by_state.mean().reset_index()
polls_mean

Unnamed: 0,state,2020_polls_margin
0,AL,20.277778
1,AK,6.533333
2,AZ,-2.808163
3,AR,24.222222
4,CA,-28.84375
5,CO,-12.514286
6,CT,-23.307692
7,DE,-25.555556
8,DC,-78.166667
9,FL,-2.859649


In [20]:
# export the dataframe to a csv file
# The path is built from RESULT_DIR like the other exports instead of repeating
# the directory literal; the resulting file name is unchanged.
polls_mean.to_csv(f'{RESULT_DIR}2020_mean_polls.csv', index = False)

In [21]:
# render the per-state mean margins as LaTeX source and print it
polls_mean_latex = polls_mean.to_latex(index = False)
print(polls_mean_latex)

\begin{tabular}{lr}
\toprule
state &  2020\_polls\_margin \\
\midrule
   AL &          20.277778 \\
   AK &           6.533333 \\
   AZ &          -2.808163 \\
   AR &          24.222222 \\
   CA &         -28.843750 \\
   CO &         -12.514286 \\
   CT &         -23.307692 \\
   DE &         -25.555556 \\
   DC &         -78.166667 \\
   FL &          -2.859649 \\
   GA &          -0.335484 \\
   HI &         -30.333333 \\
   ID &          22.000000 \\
   IL &         -17.272727 \\
   IN &          11.666667 \\
   IA &           1.322034 \\
   KS &           9.190476 \\
   KY &          17.869565 \\
   LA &          18.583333 \\
   ME &         -12.838710 \\
   MD &         -31.538462 \\
   MA &         -36.615385 \\
   MI &          -6.998765 \\
   MN &          -8.714286 \\
   MS &          16.615385 \\
   MO &           7.136364 \\
   MT &           7.666667 \\
   NE &          11.000000 \\
   NV &          -4.000000 \\
   NH &          -9.030303 \\
   NJ &         -19.904762 \\


  print(polls_mean.to_latex(index = False))
