In [26]:
# import libraries
from io import StringIO

from bs4 import BeautifulSoup
import pandas as pd
import requests

In [27]:
# Scrape the state-by-state results table from Wikipedia into a raw dataframe.
pres_2020_res = requests.get('https://en.wikipedia.org/wiki/2020_United_States_presidential_election', timeout = 30)
# stop here with a clear HTTP error instead of trying to parse an error page
pres_2020_res.raise_for_status()
pres_2020_soup = BeautifulSoup(pres_2020_res.content, 'lxml')

# The text 'Results by state' is matched twice (find, then find_next); the first
# sortable wikitable after the second match is taken as the results table.
# `string=` is the current spelling of the deprecated `text=` argument.
pres_2020_html = pres_2020_soup.find(string = 'Results by state').find_next(string = 'Results by state').find_next('table', {'class': 'wikitable sortable'})
if pres_2020_html is None:
    raise ValueError("no 'wikitable sortable' table found after the 'Results by state' heading")

# read_html wants a file-like object for literal HTML, hence the StringIO wrapper;
# cells that contain only an en dash are replaced with 0.
pres_2020_df_raw = pd.read_html(StringIO(str(pres_2020_html)))[0].replace('–', 0)
pres_2020_df_raw

Unnamed: 0_level_0,State ordistrict,Biden/HarrisDemocratic,Biden/HarrisDemocratic,Biden/HarrisDemocratic,Trump/PenceRepublican,Trump/PenceRepublican,Trump/PenceRepublican,Jorgensen/CohenLibertarian,Jorgensen/CohenLibertarian,Jorgensen/CohenLibertarian,Hawkins/WalkerGreen,Hawkins/WalkerGreen,Hawkins/WalkerGreen,Others,Others,Others,Margin,Margin,Marginswing[m],Totalvotes
Unnamed: 0_level_1,State ordistrict,Votes,%,EV,Votes,%,EV,Votes,%,EV,Votes,%,EV,Votes,%,EV,Votes,%,%,Totalvotes
0,Alab.,849624,36.57%,0,1441170,62.03%,9,25176,1.08%,0,[n],[n],0,7312,0.31%,0,"−591,546",−25.46%,2.27%,2323282
1,Alaska,153778,42.77%,0,189951,52.83%,3,8897,2.47%,0,[o],[o],0,6904,1.92%,0,"−36,173",−10.06%,4.67%,359530
2,Arizona,1672143,49.36%,11,1661686,49.06%,0,51465,1.52%,0,1557,0.05%,0,475,0.01%,0,10457,0.31%,3.81%,3387326
3,Ark.,423932,34.78%,0,760647,62.40%,6,13133,1.08%,0,2980,0.24%,0,18377,1.51%,0,"−336,715",−27.62%,−0.70%,1219069
4,Calif.,11110639,63.48%,55,6006518,34.32%,0,187910,1.07%,0,81032,0.46%,0,115281,0.66%,0,5104121,29.16%,−0.95%,17501380
5,Colo.,1804352,55.40%,9,1364607,41.90%,0,52460,1.61%,0,8986,0.28%,0,26575,0.82%,0,439745,13.50%,8.59%,3256980
6,Conn.,1080831,59.26%,7,714717,39.19%,0,20230,1.11%,0,7538,0.41%,0,541,0.03%,0,366114,20.07%,6.43%,1823857
7,Del.,296268,58.74%,3,200603,39.77%,0,5000,0.99%,0,2139,0.42%,0,336,0.07%,0,95665,18.97%,7.60%,504346
8,D.C.,317323,92.15%,3,18586,5.40%,0,2036,0.59%,0,1726,0.50%,0,4685,1.36%,0,298737,86.75%,−0.02%,344356
9,Florida,5297045,47.86%,0,5668731,51.22%,29,70324,0.64%,0,14721,0.13%,0,16635,0.15%,0,"−371,686",−3.36%,−2.16%,11067456


In [28]:
# Keep only the state label, the Democratic / Republican vote shares and the margin.
# The last two rows of the raw table are left out and the percentage strings
# are converted to floats.
pres_2020_df_dr = pd.DataFrame({
    'state': pres_2020_df_raw[('State ordistrict', 'State ordistrict')].iloc[:-2],
    'dem_percentage': pres_2020_df_raw[('Biden/HarrisDemocratic', '%')].iloc[:-2].str.strip('%').astype(float),
    'gop_percentage': pres_2020_df_raw[('Trump/PenceRepublican', '%')].iloc[:-2].str.strip('%').astype(float),
    # the minus sign is removed as well, so the margin is stored unsigned
    'margin_percentage': pres_2020_df_raw[('Margin', '%')].iloc[:-2].str.strip('%').str.replace('−', '').astype(float),
})
pres_2020_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,Alab.,36.57,62.03,25.46
1,Alaska,42.77,52.83,10.06
2,Arizona,49.36,49.06,0.31
3,Ark.,34.78,62.4,27.62
4,Calif.,63.48,34.32,29.16
5,Colo.,55.4,41.9,13.5
6,Conn.,59.26,39.19,20.07
7,Del.,58.74,39.77,18.97
8,D.C.,92.15,5.4,86.75
9,Florida,47.86,51.22,3.36


In [29]:
# Remove the congressional-district rows (ME-1, ME-2, NE-1, NE-2, NE-3):
# the statewide winner is assumed to take the entire state.
# Rows are renumbered from 0 afterwards.
pres_2020_df_dr = pres_2020_df_dr[~pres_2020_df_dr['state'].isin(['ME-1', 'ME-2', 'NE-1', 'NE-2', 'NE-3'])].reset_index(drop = True)
pres_2020_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,Alab.,36.57,62.03,25.46
1,Alaska,42.77,52.83,10.06
2,Arizona,49.36,49.06,0.31
3,Ark.,34.78,62.4,27.62
4,Calif.,63.48,34.32,29.16
5,Colo.,55.4,41.9,13.5
6,Conn.,59.26,39.19,20.07
7,Del.,58.74,39.77,18.97
8,D.C.,92.15,5.4,86.75
9,Florida,47.86,51.22,3.36


In [30]:
# Fetch the table of two-letter state abbreviations from the SSA reference page.
state_res = requests.get('https://www.ssa.gov/international/coc-docs/states.html', timeout = 30)
# stop here with a clear HTTP error instead of trying to parse an error page
state_res.raise_for_status()
state_soup = BeautifulSoup(state_res.content, 'lxml')

# The table is the first <table> after the 'Two-Letter State Abbreviations' text.
# `string=` is the current spelling of the deprecated `text=` argument.
state_html = state_soup.find(string = 'Two-Letter State Abbreviations').find_next('table')
if state_html is None:
    raise ValueError("no table found after the 'Two-Letter State Abbreviations' heading")

# read_html wants a file-like object for literal HTML, hence the StringIO wrapper.
state = pd.read_html(StringIO(str(state_html)))[0]

# Column 0 is the name, column 1 the abbreviation. The territories are filtered out
# into a new frame (no in-place drop) and the rows are renumbered from 0.
state = state[~state[0].isin(['AMERICAN SAMOA', 'GUAM', 'NORTHERN MARIANA IS', 'PUERTO RICO', 'VIRGIN ISLANDS'])].reset_index(drop = True)
state

Unnamed: 0,0,1
0,ALABAMA,AL
1,ALASKA,AK
2,ARIZONA,AZ
3,ARKANSAS,AR
4,CALIFORNIA,CA
5,COLORADO,CO
6,CONNECTICUT,CT
7,DELAWARE,DE
8,DISTRICT OF COLUMBIA,DC
9,FLORIDA,FL


In [31]:
# Standardise the state labels to the two-letter abbreviations.
# The assignment aligns on the row index (both frames were renumbered from 0), so
# row i of `state` lands on row i of the results. A differing row count would
# otherwise produce NaN or silently dropped labels, so it is rejected up front.
# NOTE(review): this also assumes both sources list the states in the same order;
# that is not checked here - confirm against both tables.
if len(pres_2020_df_dr) != len(state):
    raise ValueError('row count mismatch: {} result rows vs {} state abbreviations'.format(len(pres_2020_df_dr), len(state)))
pres_2020_df_dr['state'] = state[1]
pres_2020_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,AL,36.57,62.03,25.46
1,AK,42.77,52.83,10.06
2,AZ,49.36,49.06,0.31
3,AR,34.78,62.4,27.62
4,CA,63.48,34.32,29.16
5,CO,55.4,41.9,13.5
6,CT,59.26,39.19,20.07
7,DE,58.74,39.77,18.97
8,DC,92.15,5.4,86.75
9,FL,47.86,51.22,3.36


In [32]:
# Create the 'winner' column: 'D' where the Democratic share is higher,
# 'R' where the Republican share is higher.
dem_wins = pres_2020_df_dr['dem_percentage'] > pres_2020_df_dr['gop_percentage']
gop_wins = pres_2020_df_dr['gop_percentage'] > pres_2020_df_dr['dem_percentage']

# A tie or a missing percentage satisfies neither comparison. Fail with a clear
# message instead of a length mismatch when the column is inserted.
undecided = ~(dem_wins | gop_wins)
if undecided.any():
    raise ValueError('cannot determine a winner for: {}'.format(pres_2020_df_dr.loc[undecided, 'state'].tolist()))

dr = dem_wins.map({True: 'D', False: 'R'}).tolist()

# insert() raises when the column already exists, so overwrite it on a re-run
if 'winner' in pres_2020_df_dr.columns:
    pres_2020_df_dr['winner'] = dr
else:
    pres_2020_df_dr.insert(loc = 3, column = 'winner', value = dr)
pres_2020_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,winner,margin_percentage
0,AL,36.57,62.03,R,25.46
1,AK,42.77,52.83,R,10.06
2,AZ,49.36,49.06,D,0.31
3,AR,34.78,62.4,R,27.62
4,CA,63.48,34.32,D,29.16
5,CO,55.4,41.9,D,13.5
6,CT,59.26,39.19,D,20.07
7,DE,58.74,39.77,D,18.97
8,DC,92.15,5.4,D,86.75
9,FL,47.86,51.22,R,3.36


In [33]:
# Signed margin: negative represents a Democratic win, positive a Republican win.
# The unsigned margin is multiplied by -1 for 'D' rows and by +1 for 'R' rows.
margin_sign = pres_2020_df_dr['winner'].map({'D': -1, 'R': 1})

# Any label other than 'D' / 'R' maps to NaN; reject it instead of silently
# skipping the row.
if margin_sign.isna().any():
    raise ValueError("unexpected value in 'winner' column: {}".format(pres_2020_df_dr.loc[margin_sign.isna(), 'winner'].unique().tolist()))

party_margin = (margin_sign * pres_2020_df_dr['margin_percentage']).tolist()

# results_party_margin is another name for the same frame, not a copy:
# the new column is visible through pres_2020_df_dr as well.
results_party_margin = pres_2020_df_dr
results_party_margin['2020_results_margin'] = party_margin
results_party_margin

Unnamed: 0,state,dem_percentage,gop_percentage,winner,margin_percentage,2020_results_margin
0,AL,36.57,62.03,R,25.46,25.46
1,AK,42.77,52.83,R,10.06,10.06
2,AZ,49.36,49.06,D,0.31,-0.31
3,AR,34.78,62.4,R,27.62,27.62
4,CA,63.48,34.32,D,29.16,-29.16
5,CO,55.4,41.9,D,13.5,-13.5
6,CT,59.26,39.19,D,20.07,-20.07
7,DE,58.74,39.77,D,18.97,-18.97
8,DC,92.15,5.4,D,86.75,-86.75
9,FL,47.86,51.22,R,3.36,3.36


In [34]:
# write the final table to disk as a CSV file, leaving the row index out
results_party_margin.to_csv(path_or_buf = '../../data/processed/2020_presidential_election.csv', index = False)