In [19]:
# import libraries
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# directory for raw data files; the trailing slash is required because
# file paths are built by plain string concatenation (f"{DATA_DIR}<name>")
DATA_DIR = "../data/raw/"

In [6]:
# read data
# Load the raw 2018 House results (one row per congressional district).
# The path is built from DATA_DIR so the raw-data location is configured in
# a single place, matching how the state-abbreviation CSV is written.
house_2018_raw = pd.read_csv(f"{DATA_DIR}2018_house_raw.csv")
house_2018_raw

Unnamed: 0,Dist,2018 Winner,Dem Votes,GOP Votes,Other Votes,Dem %,GOP %,Other %,Dem Margin,2016 Clinton Margin,Swing vs. 2016 Prez,Raw Votes vs. 2016
0,AL-01,Bradley Byrne (R),89226,153228,163,36.8,63.2,0.1,-26.4,-29.2,2.8,79.3
1,AL-02,Martha Roby (R),86931,138879,420,38.4,61.4,0.2,-23.0,-31.7,8.7,78.7
2,AL-03,Mike Rogers (R),83996,147770,149,36.2,63.7,0.1,-27.5,-33.0,5.5,79.6
3,AL-04,Robert Aderholt (R),46492,184255,222,20.1,79.8,0.1,-59.6,-62.5,2.9,78.9
4,AL-05,Mo Brooks (R),101388,159063,222,38.9,61.0,0.1,-22.1,-32.9,10.8,82.8
...,...,...,...,...,...,...,...,...,...,...,...,...
430,WI-05,Jim Sensenbrenner (R),138385,225619,1,38.0,62.0,0.0,-24.0,-19.8,-4.2,89.7
431,WI-06,Glenn Grothman (R),144536,180311,0,44.5,55.5,0.0,-11.0,-16.6,5.6,87.5
432,WI-07,Sean Duffy (R),124307,194061,4419,38.5,60.1,1.4,-21.6,-20.3,-1.3,86.5
433,WI-08,Mike Gallagher (R),119265,209410,0,36.3,63.7,0.0,-27.4,-17.4,-10,88.3


In [20]:
# find the state abbreviations
# Scrape the "Two-Letter State Abbreviations" table from the SSA website,
# drop the U.S. territories, and store the mapping full_name -> abbreviation.
state_res = requests.get(
    'https://www.ssa.gov/international/coc-docs/states.html',
    timeout=30,  # never hang the notebook on an unresponsive server
)
state_res.raise_for_status()  # fail loudly instead of parsing an error page
state_soup = BeautifulSoup(state_res.content, 'lxml')

# The table of interest is the first <table> following this heading text.
state_heading = state_soup.find(string='Two-Letter State Abbreviations')
if state_heading is None:
    raise ValueError("Heading 'Two-Letter State Abbreviations' not found; the page layout may have changed")
state_html = state_heading.find_next('table')

# read_html expects a file-like object for literal HTML, hence StringIO.
# The parsed table has integer column labels: 0 = full name, 1 = abbreviation.
state_table = pd.read_html(StringIO(str(state_html)))[0]

# Territories are excluded from the state list.
territories = ['AMERICAN SAMOA', 'GUAM', 'NORTHERN MARIANA IS', 'PUERTO RICO', 'VIRGIN ISLANDS']

# NOTE: the column name "abbrevation" is misspelled, but it is kept as-is
# because it becomes the header of the CSV written below.
state = (
    state_table[~state_table[0].isin(territories)]
    .reset_index(drop=True)
    .rename(columns={0: "full_name", 1: "abbrevation"})
    .set_index("full_name")
)

state.to_csv(f"{DATA_DIR}state_abbreviation.csv")
state

Unnamed: 0,0,1
0,ALABAMA,AL
1,ALASKA,AK
2,ARIZONA,AZ
3,ARKANSAS,AR
4,CALIFORNIA,CA
5,COLORADO,CO
6,CONNECTICUT,CT
7,DELAWARE,DE
8,DISTRICT OF COLUMBIA,DC
9,FLORIDA,FL


In [22]:
# split the state from the district number
# 'Dist' looks like "AL-01"; the part before the first '-' is the state code.
# .assign() returns a NEW frame, so house_2018_raw is left untouched
# (a plain `=` would only alias it and the raw data would gain the column too).
house_2018_raw_state = house_2018_raw.assign(
    state=house_2018_raw['Dist'].str.split('-', n=1).str[0]
)
house_2018_raw_state

Unnamed: 0,Dist,2018 Winner,Dem Votes,GOP Votes,Other Votes,Dem %,GOP %,Other %,Dem Margin,2016 Clinton Margin,Swing vs. 2016 Prez,Raw Votes vs. 2016,state
0,AL-01,Bradley Byrne (R),89226,153228,163,36.8,63.2,0.1,-26.4,-29.2,2.8,79.3,AL
1,AL-02,Martha Roby (R),86931,138879,420,38.4,61.4,0.2,-23.0,-31.7,8.7,78.7,AL
2,AL-03,Mike Rogers (R),83996,147770,149,36.2,63.7,0.1,-27.5,-33.0,5.5,79.6,AL
3,AL-04,Robert Aderholt (R),46492,184255,222,20.1,79.8,0.1,-59.6,-62.5,2.9,78.9,AL
4,AL-05,Mo Brooks (R),101388,159063,222,38.9,61.0,0.1,-22.1,-32.9,10.8,82.8,AL
...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,WI-05,Jim Sensenbrenner (R),138385,225619,1,38.0,62.0,0.0,-24.0,-19.8,-4.2,89.7,WI
431,WI-06,Glenn Grothman (R),144536,180311,0,44.5,55.5,0.0,-11.0,-16.6,5.6,87.5,WI
432,WI-07,Sean Duffy (R),124307,194061,4419,38.5,60.1,1.4,-21.6,-20.3,-1.3,86.5,WI
433,WI-08,Mike Gallagher (R),119265,209410,0,36.3,63.7,0.0,-27.4,-17.4,-10,88.3,WI


In [88]:
# sum the votes for each state
# Row order follows the scraped state list. The abbreviation column of `state`
# is named "abbrevation" (sic) after the rename in the scraping cell, so it is
# selected by that name; naming the index 'state' makes reset_index() emit a
# 'state' column directly.
state_order = pd.Index(state['abbrevation'], name='state')

# States present in `state` but absent from the House data (DC here) are kept
# by reindex() as rows of NaN, to be filled in later.
house_2018_without_dc = (
    house_2018_raw_state
    .groupby('state')[['Dem Votes', 'GOP Votes', 'Other Votes']]
    .sum()
    .reindex(state_order)
    .reset_index()
    .rename(columns={'Dem Votes': 'dem_votes', 'GOP Votes': 'gop_votes', 'Other Votes': 'oth_votes'})
)
house_2018_without_dc

Unnamed: 0,state,dem_votes,gop_votes,oth_votes
0,AL,678687.0,975737.0,5471.0
1,AK,131199.0,149779.0,1188.0
2,AZ,1179193.0,1139552.0,22525.0
3,AR,312978.0,556339.0,19981.0
4,CA,8010445.0,3973396.0,200681.0
5,CO,1343211.0,1079772.0,90924.0
6,CT,849341.0,520521.0,9946.0
7,DE,227353.0,125384.0,0.0
8,DC,,,
9,FL,3307228.0,3675417.0,38831.0


In [89]:
# find the 2018 election results for DC
# Scrape the results table from the Wikipedia article on the 2018 DC
# delegate election and keep the party and vote-count columns.
dc_2018_res = requests.get(
    'https://en.wikipedia.org/wiki/2018_United_States_House_of_Representatives_election_in_District_of_Columbia',
    timeout=30,  # never hang the notebook on an unresponsive server
)
dc_2018_res.raise_for_status()  # fail loudly instead of parsing an error page
dc_2018_soup = BeautifulSoup(dc_2018_res.content, 'lxml')

# The wanted table is the first <table> after the SECOND occurrence of the
# text 'Results' on the page.
dc_2018_first_results = dc_2018_soup.find(string='Results')
dc_2018_second_results = (
    dc_2018_first_results.find_next(string='Results')
    if dc_2018_first_results is not None
    else None
)
if dc_2018_second_results is None:
    raise ValueError("Could not locate the 'Results' section; the page layout may have changed")
dc_2018_html = dc_2018_second_results.find_next('table')

# read_html expects a file-like object for literal HTML, hence StringIO.
# .loc[:5] is a label slice, so rows 0 through 5 (inclusive) are kept.
dc_2018_raw = pd.read_html(StringIO(str(dc_2018_html)))[0].loc[:5, ['Party.1', 'Votes']]
dc_2018_raw

Unnamed: 0,Party.1,Votes
0,Democratic,199124
1,Republican,9700
2,DC Statehood Green,8636
3,Independent,5509
4,Libertarian,4034
5,,1766


In [90]:
# sum the other votes
# Every row whose party is neither Democratic nor Republican counts as
# "other" (this includes the row with a missing party label).
is_major_party = dc_2018_raw['Party.1'].isin(['Democratic', 'Republican'])
dc_2018_other_votes = dc_2018_raw.loc[~is_major_party, 'Votes'].astype(int).sum()
dc_2018_other_votes

19945

In [95]:
# create the dataframe for the popular votes
# Work on a copy so house_2018_without_dc keeps its original (NaN) DC row
# and this cell can be re-run safely.
house_2018_with_dc = house_2018_without_dc.copy()

# Look up DC vote counts by party label rather than by row position, so the
# result does not depend on the order of rows in the scraped table.
dc_votes = dc_2018_raw['Votes'].astype(int)
dc_dem_votes = dc_votes[dc_2018_raw['Party.1'] == 'Democratic'].sum()
dc_gop_votes = dc_votes[dc_2018_raw['Party.1'] == 'Republican'].sum()

# Fill the DC row with the scraped numbers.
is_dc = house_2018_with_dc['state'] == 'DC'
house_2018_with_dc.loc[is_dc, 'dem_votes'] = dc_dem_votes
house_2018_with_dc.loc[is_dc, 'gop_votes'] = dc_gop_votes
house_2018_with_dc.loc[is_dc, 'oth_votes'] = dc_2018_other_votes

# The vote columns were float because of the NaN placeholders; with DC filled
# in they can be cast to integers (this raises if any NaN is still present).
house_2018_with_dc = house_2018_with_dc.astype({'state': 'object', 'dem_votes': 'int', 'gop_votes': 'int', 'oth_votes': 'int'})
house_2018_with_dc['total_votes'] = house_2018_with_dc[['dem_votes', 'gop_votes', 'oth_votes']].sum(axis=1)
house_2018_with_dc

Unnamed: 0,state,dem_votes,gop_votes,oth_votes,total_votes
0,AL,678687,975737,5471,1659895
1,AK,131199,149779,1188,282166
2,AZ,1179193,1139552,22525,2341270
3,AR,312978,556339,19981,889298
4,CA,8010445,3973396,200681,12184522
5,CO,1343211,1079772,90924,2513907
6,CT,849341,520521,9946,1379808
7,DE,227353,125384,0,352737
8,DC,199124,9700,19945,228769
9,FL,3307228,3675417,38831,7021476


In [98]:
# create the dataframe for the percentage of votes
# Each value is the party's share of the state's total votes, expressed as a
# fraction between 0 and 1 (it is not multiplied by 100).
house_2018_df = house_2018_with_dc[['state']].assign(
    dem_percentage=house_2018_with_dc['dem_votes'].div(house_2018_with_dc['total_votes']),
    gop_percentage=house_2018_with_dc['gop_votes'].div(house_2018_with_dc['total_votes']),
)
house_2018_df

Unnamed: 0,state,dem_percentage,gop_percentage
0,AL,0.408873,0.587831
1,AK,0.464971,0.530819
2,AZ,0.503655,0.486724
3,AR,0.351938,0.625593
4,CA,0.657428,0.326102
5,CO,0.534312,0.429519
6,CT,0.61555,0.377242
7,DE,0.64454,0.35546
8,DC,0.870415,0.042401
9,FL,0.471016,0.523454


In [101]:
# add the columns for the party with the highest percentage for each state and the corresponding margin percentage
# Positive spread => Democrats lead; negative spread => Republicans lead.
dem_minus_gop = house_2018_df['dem_percentage'] - house_2018_df['gop_percentage']

# Start with "no winner" everywhere so that an exact tie (or a missing value)
# leaves the entry empty; the column therefore always has one entry per row.
winner = pd.Series(None, index=house_2018_df.index, dtype='object')
winner.loc[dem_minus_gop > 0] = 'D'
winner.loc[dem_minus_gop < 0] = 'R'

house_2018_df['winner'] = winner
house_2018_df['margin_percentage'] = dem_minus_gop.abs()
house_2018_df

Unnamed: 0,state,dem_percentage,gop_percentage,winner,margin_percentage
0,AL,0.408873,0.587831,R,0.178957
1,AK,0.464971,0.530819,R,0.065848
2,AZ,0.503655,0.486724,D,0.016931
3,AR,0.351938,0.625593,R,0.273655
4,CA,0.657428,0.326102,D,0.331326
5,CO,0.534312,0.429519,D,0.104793
6,CT,0.61555,0.377242,D,0.238309
7,DE,0.64454,0.35546,D,0.289079
8,DC,0.870415,0.042401,D,0.828014
9,FL,0.471016,0.523454,R,0.052438


In [102]:
# export the dataframe to a csv file
# index=False keeps the row index out of the written file.
house_2018_df.to_csv(
    path_or_buf='../data/processed/2018_house_election.csv',
    index=False,
)