In [1]:
# import BeautifulSoup, requests, pandas, and regex (aliased as re)
from bs4 import BeautifulSoup
import requests
import pandas as pd
import regex as re

In [2]:
# Fetch and parse the main Politico 2020 House results page.
url = "https://www.politico.com/2020-election/results/house/"
# A timeout keeps this cell from hanging indefinitely on a stalled connection.
req = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were results.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')
# Preview only the start of the document; printing the whole page floods the notebook.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <link href="https://www.politico.com/apple-touch-icon-60x60.png" rel="apple-touch-icon" sizes="60x60"/>
  <link href="https://www.politico.com/apple-touch-icon-72x72.png" rel="apple-touch-icon" sizes="72x72"/>
  <link href="https://www.politico.com/apple-touch-icon-76x76.png" rel="apple-touch-icon" sizes="76x76"/>
  <link href="https://www.politico.com/apple-touch-icon-114x114.png" rel="apple-touch-icon" sizes="114x114"/>
  <link href="https://www.politico.com/apple-touch-icon-120x120.png" rel="apple-touch-icon" sizes="120x120"/>
  <link href="https://www.politico.com/apple-touch-icon-144x144.png" rel="apple-touch-icon" sizes="144x144"/>
  <link href="https://www.politico.com/apple-touch-icon-152x152.png" rel="apple-touch-icon" sizes="152x152"/>
  <link href="https://www.politico.com/apple-touch-icon-180x180.png" rel="apple-touch-icon" sizes="180x180"/>
  <link href="https://www.politico.com/favicon-32x32.png" rel="icon" sizes="32x32" type="im

In [3]:
# Collect the <p> entries of the state link list, slicing off the first
# entry (the "all states" link) so only per-state links remain.
state_links = soup.find('ul','jsx-3088201999 state-link-list').find_all("p")[1:]


In [4]:
# Build each state's House results URL in one pass:
# site prefix + the href of the entry's <a> tag + 'house'.
state_urls = [
    "http://politico.com" + link.find("a").get('href') + 'house'
    for link in state_links
]

In [5]:
# display the constructed per-state URLs as a sanity check
state_urls

['http://politico.com/2020-election/results/alabama/house',
 'http://politico.com/2020-election/results/alaska/house',
 'http://politico.com/2020-election/results/arizona/house',
 'http://politico.com/2020-election/results/arkansas/house',
 'http://politico.com/2020-election/results/california/house',
 'http://politico.com/2020-election/results/colorado/house',
 'http://politico.com/2020-election/results/connecticut/house',
 'http://politico.com/2020-election/results/delaware/house',
 'http://politico.com/2020-election/results/florida/house',
 'http://politico.com/2020-election/results/georgia/house',
 'http://politico.com/2020-election/results/hawaii/house',
 'http://politico.com/2020-election/results/idaho/house',
 'http://politico.com/2020-election/results/illinois/house',
 'http://politico.com/2020-election/results/indiana/house',
 'http://politico.com/2020-election/results/iowa/house',
 'http://politico.com/2020-election/results/kansas/house',
 'http://politico.com/2020-election/r

In [8]:
# Prototype the parsing on a single state: the first URL in state_urls.
state_url = state_urls[0]
# Drop everything up to 'results/' and the '/house' suffix, leaving the state name.
state = re.sub('/house','',re.sub('.*results/','',state_url))
# Retrieve the state page. The timeout prevents an indefinite hang, and
# raise_for_status() stops here on an HTTP error rather than parsing an error page.
# NOTE: this rebinds `req` and `soup`, which earlier held the main results page.
req = requests.get(state_url, timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')
# create a list of all result blocks
result_blocks = soup.find_all('div','smaller-leaderboard-container')

In [31]:
####################################################################################################################
# Column labels used for every results frame built from parsed rows.
column_names = ["state","district","candidate","party","votes"]
# Prototype on the first result block: the id of its first <div> serves as the district label.
district = result_blocks[0].find('div').get('id')
# Drill down to the body rows of that block's results table.
result_table = (
    result_blocks[0]
    .find('div','results-table')
    .find('table')
    .find('tbody')
    .find_all('tr')
)

# helper that flattens one results-table row into a list of values
def parse_row(row,state,district):
    """Convert a single results-table row into a flat record.

    Parameters
    ----------
    row
        Parsed table row; it must support ``find_all`` and ``find`` the way
        a BeautifulSoup tag does.
    state
        Value placed first in the returned record.
    district
        Value placed second in the returned record.

    Returns
    -------
    list
        ``[state, district, candidate name, party, votes]``, where the last
        three entries are the text of the matching elements.
    """
    # The first cell holds two <div>s: candidate name, then party.
    first_cell = row.find_all('td')[0]
    label_divs = first_cell.find_all('div')
    candidate = label_divs[0].get_text()
    affiliation = label_divs[1].get_text()
    # The vote count lives in a dedicated <div> identified by its class.
    vote_div = row.find('div','candidate-votes-next-to-percent')
    return [state, district, candidate, affiliation, vote_div.get_text()]
# Parse every row of the prototype table, then show the records as a DataFrame.
table_data_list = [parse_row(row,state,district) for row in result_table]
pd.DataFrame(table_data_list,columns = column_names)

Unnamed: 0,state,district,candidate,party,votes
0,alabama,cd01,Carl,gop,210636
1,alabama,cd01,Averhart,dem,115592


In [33]:
# define a function that takes a state_url as an argument, and returns a dataframe of all election results for that state
def get_state_results(state_url, timeout=30):
    """Scrape one state's House results page into a DataFrame.

    Parameters
    ----------
    state_url : str
        URL of the state's results page. The state name is whatever remains
        after removing everything up to 'results/' and the '/house' suffix.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        One row per parsed table row, with the columns in ``column_names``.
        ``None`` when the page responds with 404. An empty frame with the
        same columns when the page contains no result blocks.

    Raises
    ------
    requests.HTTPError
        For any HTTP error status other than 404, so a failed download is
        not silently recorded as "no results".
    """
    # store name of state
    state = re.sub('/house','',re.sub('.*results/','',state_url))
    # retrieve state
    req = requests.get(state_url, timeout=timeout)
    # a missing page is an expected case: report it as None
    if req.status_code == 404:
        return None
    # any other HTTP error is unexpected; do not parse an error page as results
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')
    # create a list of all result blocks
    result_blocks = soup.find_all('div','smaller-leaderboard-container')
    # one DataFrame per district in the state
    all_districts = []
    for result_block in result_blocks:
        # the id of the block's first <div> is used as the district label
        district = result_block.find('div').get('id')
        # isolate table body, parse to list of rows
        result_table = result_block.find('div','results-table').find('table').find('tbody').find_all('tr')
        # iterate through rows of result_table, use parse_row to extract info
        table_row_list = [parse_row(row,state,district=district) for row in result_table]
        all_districts.append(pd.DataFrame(table_row_list,columns = column_names))
    # pd.concat raises ValueError on an empty list, so handle "no blocks" explicitly
    if not all_districts:
        return pd.DataFrame(columns=column_names)
    return pd.concat(all_districts)
get_state_results(state_url)


Unnamed: 0,state,district,candidate,party,votes
0,alabama,cd01,Carl,gop,210636
1,alabama,cd01,Averhart,dem,115592
0,alabama,cd02,Moore,gop,197329
1,alabama,cd02,Harvey-Hall,dem,104592
0,alabama,cd03,Rogers*,gop,216700
1,alabama,cd03,Winfrey,dem,103874
0,alabama,cd04,Aderholt*,gop,257201
1,alabama,cd04,Neighbors,dem,52688
0,alabama,cd05,Brooks*,gop,0
0,alabama,cd06,Palmer*,gop,0


In [36]:
# Run the scraper over every state URL, keeping results in state_urls order.
all_state_results = list(map(get_state_results, state_urls))

In [42]:
# Stack the per-state frames; reset_index() turns the old row labels into an 'index' column.
all_cd_results = pd.concat(all_state_results).reset_index()
# Export only the five result columns (leaving out 'index') and no row labels.
all_cd_results.to_csv('results by cd 2020.csv',
                      columns=['state','district','candidate','party','votes'],
                      index=False)

In [25]:
# check which URL state_url currently holds
state_url

'http://politico.com/2020-election/results/alabama/house'

In [24]:
# Probe the HTTP status of the Washington, DC house URL; the 404 shown in the
# output is the status that get_state_results checks for before parsing.
requests.get('https://www.politico.com/2020-election/results/washington-dc/house').status_code

404

In [41]:
# display the combined results for all states
all_cd_results

Unnamed: 0,index,state,district,candidate,party,votes
0,0,alabama,cd01,Carl,gop,210636
1,1,alabama,cd01,Averhart,dem,115592
2,0,alabama,cd02,Moore,gop,197329
3,1,alabama,cd02,Harvey-Hall,dem,104592
4,0,alabama,cd03,Rogers*,gop,216700
...,...,...,...,...,...,...
853,1,wisconsin,cd07,Zunker,dem,162729
854,0,wisconsin,cd08,Gallagher*,gop,267862
855,1,wisconsin,cd08,Stuck,dem,150824
856,0,wyoming,cd00,Cheney*,gop,185602
