In [10]:
# import libraries
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [11]:
# scrape the web to get the dataframe
# Download the Wikipedia article on the 2016 presidential election.
pres_2016_res = requests.get('https://en.wikipedia.org/wiki/2016_United_States_presidential_election#Results_by_state')
# Stop on an HTTP error status instead of trying to parse an error page below.
pres_2016_res.raise_for_status()
pres_2016_soup = BeautifulSoup(pres_2016_res.content, 'lxml')

# Walk to the second 'Results by state' text node of the page, then take the first
# table after it whose class attribute is 'wikitable sortable'.
# `string=` is the current name of the filter that older BeautifulSoup code spells `text=`.
pres_2016_html = (
    pres_2016_soup
    .find(string = 'Results by state')
    .find_next(string = 'Results by state')
    .find_next('table', {'class': 'wikitable sortable'})
)
if pres_2016_html is None:
    # Without this check the literal text 'None' would be handed to the HTML parser.
    raise ValueError("no 'wikitable sortable' table found after the 'Results by state' heading")

# read_html is given a file-like object: passing the markup as a bare string is
# deprecated in recent pandas releases. It already returns DataFrames, so the first
# one is used directly.
# Cells that contain only an en dash are replaced by the integer 0.
pres_2016_df_raw = pd.read_html(StringIO(str(pres_2016_html)))[0].replace('–', 0)
pres_2016_df_raw

Unnamed: 0_level_0,State ordistrict,Hillary ClintonDemocratic,Hillary ClintonDemocratic,Hillary ClintonDemocratic,Donald TrumpRepublican,Donald TrumpRepublican,Donald TrumpRepublican,Gary JohnsonLibertarian,Gary JohnsonLibertarian,Gary JohnsonLibertarian,...,Evan McMullinIndependent,Evan McMullinIndependent,Evan McMullinIndependent,Others,Others,Others,Margin,Margin,Totalvotes,Sources
Unnamed: 0_level_1,State ordistrict,Votes,%,EV,Votes,%,EV,Votes,%,EV,...,Votes,%,EV,Votes,%,EV,Votes,%,Totalvotes,Sources
0,Ala.,729547,34.36%,0,1318255,62.08%,9,44467,2.09%,0,...,0,0,0,21712,1.02%,0,588708,27.73%,2123372,[394]
1,Alaska,116454,36.55%,0,163387,51.28%,3,18725,5.88%,0,...,0,0,0,14307,4.49%,0,46933,14.73%,318608,[395]
2,Ariz.,1161167,44.58%,0,1252401,48.08%,11,106327,4.08%,0,...,17449,0.67%,0,32968,1.27%,0,91234,3.50%,2604657,[396]
3,Ark.,380494,33.65%,0,684872,60.57%,6,29949,2.64%,0,...,13176,1.17%,0,12712,1.12%,0,304378,26.92%,1130676,[397]
4,Calif.,8753788,61.73%,55,4483810,31.62%,0,478500,3.37%,0,...,39596,0.28%,0,147244,1.04%,0,"−4,269,978",−30.11%,14181595,[398]
5,Colo.,1338870,48.16%,9,1202484,43.25%,0,144121,5.18%,0,...,28917,1.04%,0,27418,0.99%,0,"−136,386",−4.91%,2780247,[399]
6,Conn.,897572,54.57%,7,673215,40.93%,0,48676,2.96%,0,...,2108,0.13%,0,508,0.03%,0,"−224,357",−13.64%,1644920,[400]
7,Del.,235603,53.09%,3,185127,41.72%,0,14757,3.32%,0,...,706,0.16%,0,1518,0.34%,0,"−50,476",−11.37%,443814,[401][402]
8,D.C.,282830,90.86%,3,12723,4.09%,0,4906,1.57%,0,...,0,0,0,6551,2.52%,0,"−270,107",−86.77%,311268,[403]
9,Fla.,4504975,47.82%,0,4617886,49.02%,29,207043,2.20%,0,...,0,0,0,25736,0.28%,0,112911,1.20%,9420039,[404]


In [12]:
# filter the data to only retain democratic and republican data and change the data type
# Every row except the last two of the raw table is kept.
candidate_rows = pres_2016_df_raw.iloc[:-2]
state_labels = candidate_rows['State ordistrict']['State ordistrict']

# Percentages arrive as text such as '34.36%': drop the percent sign, then convert to float.
dem_share = candidate_rows['Hillary ClintonDemocratic']['%'].str.strip('%').astype(float)
gop_share = candidate_rows['Donald TrumpRepublican']['%'].str.strip('%').astype(float)
# The margin also loses its '−' (U+2212 minus sign), so only its magnitude is stored.
margin_size = candidate_rows['Margin']['%'].str.strip('%').str.replace('−', '').astype(float)

pres_2016_df_dr = pd.DataFrame({
    'state': state_labels,
    'dem_percentage': dem_share,
    'gop_percentage': gop_share,
    'margin_percentage': margin_size,
})
pres_2016_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,Ala.,34.36,62.08,27.73
1,Alaska,36.55,51.28,14.73
2,Ariz.,44.58,48.08,3.5
3,Ark.,33.65,60.57,26.92
4,Calif.,61.73,31.62,30.11
5,Colo.,48.16,43.25,4.91
6,Conn.,54.57,40.93,13.64
7,Del.,53.09,41.72,11.37
8,D.C.,90.86,4.09,86.77
9,Fla.,47.82,49.02,1.2


In [13]:
# drop the data regarding congressional districts (ME-1, ME-2, NE-1, NE-2, NE-3)
# assume that whoever wins the most votes statewide, wins the entire state
district_labels = ['ME-1', 'ME-2', 'NE-1', 'NE-2', 'NE-3']
is_district = pres_2016_df_dr['state'].isin(district_labels)
# Keep every row that is not a district and renumber the remaining rows from 0.
pres_2016_df_dr = pres_2016_df_dr[~is_district].reset_index(drop = True)
pres_2016_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,Ala.,34.36,62.08,27.73
1,Alaska,36.55,51.28,14.73
2,Ariz.,44.58,48.08,3.5
3,Ark.,33.65,60.57,26.92
4,Calif.,61.73,31.62,30.11
5,Colo.,48.16,43.25,4.91
6,Conn.,54.57,40.93,13.64
7,Del.,53.09,41.72,11.37
8,D.C.,90.86,4.09,86.77
9,Fla.,47.82,49.02,1.2


In [14]:
# find the state abbreviations
# Download the SSA page that lists state names next to their two-letter codes.
state_res = requests.get('https://www.ssa.gov/international/coc-docs/states.html')
# Stop on an HTTP error status instead of trying to parse an error page below.
state_res.raise_for_status()
state_soup = BeautifulSoup(state_res.content, 'lxml')

# Take the first table that follows the 'Two-Letter State Abbreviations' text.
# `string=` is the current name of the filter that older BeautifulSoup code spells `text=`.
state_html = state_soup.find(string = 'Two-Letter State Abbreviations').find_next('table')
if state_html is None:
    # Without this check the literal text 'None' would be handed to the HTML parser.
    raise ValueError("no table found after the 'Two-Letter State Abbreviations' heading")

# read_html is given a file-like object: passing the markup as a bare string is
# deprecated in recent pandas releases. Column 0 holds the name, column 1 the code.
state = pd.read_html(StringIO(str(state_html)))[0]

# Remove the territories, then renumber the remaining rows from 0.
territories = ['AMERICAN SAMOA', 'GUAM', 'NORTHERN MARIANA IS', 'PUERTO RICO', 'VIRGIN ISLANDS']
state = state[~state[0].isin(territories)].reset_index(drop = True)
state

Unnamed: 0,0,1
0,ALABAMA,AL
1,ALASKA,AK
2,ARIZONA,AZ
3,ARKANSAS,AR
4,CALIFORNIA,CA
5,COLORADO,CO
6,CONNECTICUT,CT
7,DELAWARE,DE
8,DISTRICT OF COLUMBIA,DC
9,FLORIDA,FL


In [15]:
# standardise the state abbreviations
# The codes are matched to the election rows by row index only: row i of `state`
# labels row i of `pres_2016_df_dr`. Nothing else ties the two tables together, so
# the assignment is only meaningful when both have the same number of rows.
# NOTE(review): equal length does not prove equal ordering — both tables are assumed
# to list the states in the same order; confirm against the displayed outputs.
if len(state) != len(pres_2016_df_dr):
    raise ValueError(
        'cannot align state codes by position: '
        f'{len(state)} abbreviation rows vs {len(pres_2016_df_dr)} election rows'
    )
pres_2016_df_dr['state'] = state[1]
pres_2016_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,AL,34.36,62.08,27.73
1,AK,36.55,51.28,14.73
2,AZ,44.58,48.08,3.5
3,AR,33.65,60.57,26.92
4,CA,61.73,31.62,30.11
5,CO,48.16,43.25,4.91
6,CT,54.57,40.93,13.64
7,DE,53.09,41.72,11.37
8,DC,90.86,4.09,86.77
9,FL,47.82,49.02,1.2


In [16]:
# create the winner column: 'D' when the Democratic share is larger, 'R' when the
# Republican share is larger
dem_pct = pres_2016_df_dr['dem_percentage']
gop_pct = pres_2016_df_dr['gop_percentage']

# A missing percentage compares False in both directions, so it is rejected here
# rather than being reported as a tie.
if dem_pct.isna().any() or gop_pct.isna().any():
    raise ValueError('dem_percentage / gop_percentage contain missing values; cannot pick a winner for every row')

# Exactly one label per row, so the list always has the same length as the frame.
# NOTE(review): an exact tie is labelled 'E' because the signed-margin cell further
# down already accepts 'E' next to 'R' — confirm that this is the intended meaning.
dr = [
    'D' if dem > gop else 'R' if gop > dem else 'E'
    for dem, gop in zip(dem_pct, gop_pct)
]

# DataFrame.insert raises when the column already exists; dropping a previous
# 'winner' column first lets this cell be run again without an error.
if 'winner' in pres_2016_df_dr.columns:
    pres_2016_df_dr = pres_2016_df_dr.drop(columns = 'winner')
pres_2016_df_dr.insert(loc = 3, column = 'winner', value = dr)
pres_2016_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,winner,margin_percentage
0,AL,34.36,62.08,R,27.73
1,AK,36.55,51.28,R,14.73
2,AZ,44.58,48.08,R,3.5
3,AR,33.65,60.57,R,26.92
4,CA,61.73,31.62,D,30.11
5,CO,48.16,43.25,D,4.91
6,CT,54.57,40.93,D,13.64
7,DE,53.09,41.72,D,11.37
8,DC,90.86,4.09,D,86.77
9,FL,47.82,49.02,R,1.2


In [17]:
# negative represents democratic, positive represents republican
# Sign applied to the unsigned margin for each winner label. 'E' keeps the margin
# unchanged, exactly like 'R'.
margin_sign = {'D': -1, 'E': 1, 'R': 1}

# Labels are compared for exact equality. Any other label is reported by name
# instead of producing a list that is shorter than the frame.
unknown_winner = pres_2016_df_dr.loc[~pres_2016_df_dr['winner'].isin(list(margin_sign)), 'winner']
if not unknown_winner.empty:
    raise ValueError(f'unexpected winner label(s): {unknown_winner.tolist()}')

party_margin = [
    margin_sign[label] * margin
    for label, margin in zip(pres_2016_df_dr['winner'], pres_2016_df_dr['margin_percentage'])
]

# results_party_margin is the same object as pres_2016_df_dr (no copy is made), so
# the new column is visible through both names.
results_party_margin = pres_2016_df_dr
results_party_margin['2016_results_margin'] = party_margin
results_party_margin

Unnamed: 0,state,dem_percentage,gop_percentage,winner,margin_percentage,2016_results_margin
0,AL,34.36,62.08,R,27.73,27.73
1,AK,36.55,51.28,R,14.73,14.73
2,AZ,44.58,48.08,R,3.5,3.5
3,AR,33.65,60.57,R,26.92,26.92
4,CA,61.73,31.62,D,30.11,-30.11
5,CO,48.16,43.25,D,4.91,-4.91
6,CT,54.57,40.93,D,13.64,-13.64
7,DE,53.09,41.72,D,11.37,-11.37
8,DC,90.86,4.09,D,86.77,-86.77
9,FL,47.82,49.02,R,1.2,1.2


In [18]:
# export the dataframe to a csv file
# The path is relative to the directory the notebook is run from; the index is not written.
output_csv = '../../data/processed/2016_presidential_election.csv'
results_party_margin.to_csv(output_csv, index = False)