In [15]:
# import libraries
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [16]:
# scrape the web to get the dataframe
# a timeout keeps the cell from hanging forever; raise_for_status makes an HTTP
# error fail here instead of being parsed as if it were the article
pres_2008_res = requests.get('https://en.wikipedia.org/wiki/2008_United_States_presidential_election', timeout = 30)
pres_2008_res.raise_for_status()
pres_2008_soup = BeautifulSoup(pres_2008_res.content, 'lxml')

# `string=` is the current spelling of bs4's deprecated `text=` keyword.
# The first occurrence of the text is skipped and the table is taken after the
# second one (presumably the first hit is a table-of-contents entry -- verify
# against the live page).
pres_2008_html = (pres_2008_soup
                  .find(string = 'Results by state')
                  .find_next(string = 'Results by state')
                  .find_next('table', {'class': 'wikitable sortable'}))

# read_html returns a list of dataframes; the markup is wrapped in StringIO because
# passing a literal HTML string is deprecated in recent pandas versions.
# Cells that consist only of an en dash are replaced by 0.
pres_2008_df_raw = pd.read_html(StringIO(str(pres_2008_html)))[0].replace('–', 0)
pres_2008_df_raw

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Barack ObamaDemocratic,Barack ObamaDemocratic,Barack ObamaDemocratic,John McCainRepublican,John McCainRepublican,John McCainRepublican,Ralph NaderIndependent,Ralph NaderIndependent,...,Cynthia McKinneyGreen,Cynthia McKinneyGreen,Cynthia McKinneyGreen,Others,Others,Others,Margin,Margin,Total votes,Total votes
Unnamed: 0_level_1,State/district,EV,#,%,EV,#,%,EV,#,%,...,#,%,EV,#,%,EV,#,%,#,Unnamed: 26_level_1
0,Alabama,9,813479,38.74%,-,1266546,60.32%,9,6788,0.32%,...,0,0.00%,-,3705.0,0.18%,-,-453067,-21.58%,2099819,AL
1,Alaska,3,123594,37.89%,-,193841,59.42%,3,3783,1.16%,...,0,0.00%,-,1730.0,0.53%,-,-70247,-21.54%,326197,AK
2,Arizona,10,1034707,45.12%,-,1230111,53.64%,10,11301,0.49%,...,3406,0.15%,-,24.0,0.00%,-,-195404,-8.52%,2293475,AZ
3,Arkansas,6,422310,38.86%,-,638017,58.72%,6,12882,1.19%,...,3470,0.32%,-,1139.0,0.10%,-,-215707,-19.85%,1086617,AR
4,California,55,8274473,61.01%,55,5011781,36.95%,-,108381,0.80%,...,38774,0.29%,-,57764.0,0.43%,-,3262692,24.06%,13561900,CA
5,Colorado,9,1288633,53.66%,9,1073629,44.71%,-,13352,0.56%,...,2822,0.12%,-,5895.0,0.25%,-,215004,8.95%,2401462,CO
6,Connecticut,7,997772,60.59%,7,629428,38.22%,-,19162,1.16%,...,90,0.01%,-,34.0,0.00%,-,368344,22.37%,1646797,CT
7,Delaware,3,255459,61.94%,3,152374,36.95%,-,2401,0.58%,...,385,0.09%,-,58.0,0.01%,-,103085,25.00%,412412,DE
8,District of Columbia,3,245800,92.46%,3,17367,6.53%,-,958,0.36%,...,590,0.22%,-,1138.0,0.43%,-,228433,85.92%,265853,DC
9,Florida,27,4282074,51.03%,27,4045624,48.22%,-,28124,0.34%,...,2887,0.03%,-,6902.0,0.08%,-,236450,2.82%,8390744,FL


In [17]:
# keep only the democratic / republican figures and turn the percentage strings into floats
# every row except the last one is used (presumably the last row holds the national totals -- TODO confirm)
state_rows = pres_2008_df_raw.iloc[:-1]

state_names = state_rows['Unnamed: 0_level_0']['State/district']
dem_pct = state_rows['Barack ObamaDemocratic']['%'].str.strip('%').astype(float)
gop_pct = state_rows['John McCainRepublican']['%'].str.strip('%').astype(float)
# the margin is stored unsigned: the typographic minus sign is removed and abs() covers an ASCII '-'
margin_pct = state_rows['Margin']['%'].str.strip('%').str.replace('−', '').astype(float).abs()

pres_2008_df_dr = pd.DataFrame({
    'state': state_names,
    'dem_percentage': dem_pct,
    'gop_percentage': gop_pct,
    'margin_percentage': margin_pct,
})
pres_2008_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,Alabama,38.74,60.32,21.58
1,Alaska,37.89,59.42,21.54
2,Arizona,45.12,53.64,8.52
3,Arkansas,38.86,58.72,19.85
4,California,61.01,36.95,24.06
5,Colorado,53.66,44.71,8.95
6,Connecticut,60.59,38.22,22.37
7,Delaware,61.94,36.95,25.0
8,District of Columbia,92.46,6.53,85.92
9,Florida,51.03,48.22,2.82


In [18]:
# remove the rows for the congressional districts (ME-1, ME-2, NE-1, NE-2, NE-3)
# assumption: the candidate with the most votes statewide wins the entire state
district_rows = ["Maine's 1st", "Maine's 2nd", "Nebraska's 1st", "Nebraska's 2nd", "Nebraska's 3rd"]
is_district = pres_2008_df_dr['state'].isin(district_rows)
# keep everything that is not a district and renumber the rows from 0
pres_2008_df_dr = pres_2008_df_dr[~is_district].reset_index(drop = True)
pres_2008_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,Alabama,38.74,60.32,21.58
1,Alaska,37.89,59.42,21.54
2,Arizona,45.12,53.64,8.52
3,Arkansas,38.86,58.72,19.85
4,California,61.01,36.95,24.06
5,Colorado,53.66,44.71,8.95
6,Connecticut,60.59,38.22,22.37
7,Delaware,61.94,36.95,25.0
8,District of Columbia,92.46,6.53,85.92
9,Florida,51.03,48.22,2.82


In [19]:
# find the state abbreviations
# a timeout keeps the cell from hanging forever; raise_for_status makes an HTTP
# error fail here instead of being parsed as if it were the abbreviation page
state_res = requests.get('https://www.ssa.gov/international/coc-docs/states.html', timeout = 30)
state_res.raise_for_status()
state_soup = BeautifulSoup(state_res.content, 'lxml')

# `string=` is the current spelling of bs4's deprecated `text=` keyword
state_html = state_soup.find(string = 'Two-Letter State Abbreviations').find_next('table')

# the markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas versions
state_table = pd.read_html(StringIO(str(state_html)))[0]

# column 0 holds the names and column 1 the abbreviations; the territories are
# filtered out and only the abbreviation column is kept, renumbered from 0
territories = ['AMERICAN SAMOA', 'GUAM', 'NORTHERN MARIANA IS', 'PUERTO RICO', 'VIRGIN ISLANDS']
state = state_table.loc[~state_table[0].isin(territories), 1].reset_index(drop = True)
state

0     AL
1     AK
2     AZ
3     AR
4     CA
5     CO
6     CT
7     DE
8     DC
9     FL
10    GA
11    HI
12    ID
13    IL
14    IN
15    IA
16    KS
17    KY
18    LA
19    ME
20    MD
21    MA
22    MI
23    MN
24    MS
25    MO
26    MT
27    NE
28    NV
29    NH
30    NJ
31    NM
32    NY
33    NC
34    ND
35    OH
36    OK
37    OR
38    PA
39    RI
40    SC
41    SD
42    TN
43    TX
44    UT
45    VT
46    VA
47    WA
48    WV
49    WI
50    WY
Name: 1, dtype: object

In [20]:
# standardise the state abbreviations
# The abbreviations are matched to the result rows purely by index label, i.e. this
# relies on both tables listing the states in the same order (NOTE(review): the order
# itself is not checked here). A differing row count would silently produce NaN or
# shifted abbreviations, so fail early in that case.
assert len(state) == len(pres_2008_df_dr), (
    f'{len(state)} abbreviations for {len(pres_2008_df_dr)} result rows'
)
pres_2008_df_dr['state'] = state
pres_2008_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,AL,38.74,60.32,21.58
1,AK,37.89,59.42,21.54
2,AZ,45.12,53.64,8.52
3,AR,38.86,58.72,19.85
4,CA,61.01,36.95,24.06
5,CO,53.66,44.71,8.95
6,CT,60.59,38.22,22.37
7,DE,61.94,36.95,25.0
8,DC,92.46,6.53,85.92
9,FL,51.03,48.22,2.82


In [21]:
# create the column with the winning party: 'D' (democratic) or 'R' (republican)
dem_share = pres_2008_df_dr['dem_percentage']
gop_share = pres_2008_df_dr['gop_percentage']

# Start from 'E' so that every row gets a value: rows where neither share is larger
# (an exact tie, or a missing percentage) stay 'E', the code the party-margin cell
# already checks for (NOTE(review): 'E' is presumably "even" -- confirm).
# The original loop appended nothing for such rows, so the insert failed on length.
winner = pd.Series('E', index = pres_2008_df_dr.index)
winner = winner.mask(dem_share > gop_share, 'D').mask(gop_share > dem_share, 'R')

# insert() raises if the column already exists, so overwrite it when the cell is re-run
if 'winner' in pres_2008_df_dr.columns:
    pres_2008_df_dr['winner'] = winner
else:
    pres_2008_df_dr.insert(loc = 3, column = 'winner', value = winner)
pres_2008_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,winner,margin_percentage
0,AL,38.74,60.32,R,21.58
1,AK,37.89,59.42,R,21.54
2,AZ,45.12,53.64,R,8.52
3,AR,38.86,58.72,R,19.85
4,CA,61.01,36.95,D,24.06
5,CO,53.66,44.71,D,8.95
6,CT,60.59,38.22,D,22.37
7,DE,61.94,36.95,D,25.0
8,DC,92.46,6.53,D,85.92
9,FL,51.03,48.22,D,2.82


In [22]:
# negative represents democratic, positive represents republican
# only the winner codes 'D', 'R' and 'E' are understood; anything else (including a
# missing value) is reported instead of ending up with a wrong sign
unexpected_codes = set(pres_2008_df_dr['winner']) - {'D', 'R', 'E'}
assert not unexpected_codes, f'unexpected winner codes: {unexpected_codes}'

unsigned_margin = pres_2008_df_dr['margin_percentage']
dem_won = pres_2008_df_dr['winner'] == 'D'

# assign() returns a new dataframe, so pres_2008_df_dr is left untouched
# (the original `results_party_margin = pres_2008_df_dr` was only an alias);
# the margin is negated where the democratic candidate won and kept as is otherwise
results_party_margin = pres_2008_df_dr.assign(
    **{'2008_results_margin': unsigned_margin.where(~dem_won, -unsigned_margin)}
)
results_party_margin

Unnamed: 0,state,dem_percentage,gop_percentage,winner,margin_percentage,2008_results_margin
0,AL,38.74,60.32,R,21.58,21.58
1,AK,37.89,59.42,R,21.54,21.54
2,AZ,45.12,53.64,R,8.52,8.52
3,AR,38.86,58.72,R,19.85,19.85
4,CA,61.01,36.95,D,24.06,-24.06
5,CO,53.66,44.71,D,8.95,-8.95
6,CT,60.59,38.22,D,22.37,-22.37
7,DE,61.94,36.95,D,25.0,-25.0
8,DC,92.46,6.53,D,85.92,-85.92
9,FL,51.03,48.22,D,2.82,-2.82


In [23]:
# write the final dataframe to a csv file, leaving out the row index
csv_path = '../../data/processed/2008_presidential_election.csv'
results_party_margin.to_csv(csv_path, index = False)