In [1]:
# import libraries
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# scrape the web to get the dataframe
# Fail fast on network problems: bound the wait and surface HTTP errors
# instead of parsing an error page as if it were the article.
pres_2004_res = requests.get('https://en.wikipedia.org/wiki/2004_United_States_presidential_election', timeout = 30)
pres_2004_res.raise_for_status()
pres_2004_soup = BeautifulSoup(pres_2004_res.content, 'lxml')

# Anchor on the SECOND occurrence of the text 'Results by state' and take the
# first sortable wikitable after it. Each lookup can return None if the page
# layout changes, so check before chaining.
results_heading = pres_2004_soup.find(string = 'Results by state')
if results_heading is not None:
    results_heading = results_heading.find_next(string = 'Results by state')
pres_2004_html = None
if results_heading is not None:
    pres_2004_html = results_heading.find_next('table', {'class': 'wikitable sortable'})
if pres_2004_html is None:
    raise ValueError("could not locate the 'Results by state' table on the 2004 election page")

# read_html expects a file-like object for literal HTML, hence StringIO.
# Cells that contain only an en dash are replaced by 0.
pres_2004_df_raw = pd.read_html(StringIO(str(pres_2004_html)))[0].replace('–', 0)
pres_2004_df_raw

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,George W. BushRepublican,George W. BushRepublican,George W. BushRepublican,John KerryDemocratic,John KerryDemocratic,John KerryDemocratic,Ralph NaderIndependent / Reform,Ralph NaderIndependent / Reform,...,David CobbGreen,David CobbGreen,David CobbGreen,Others,Others,Others,Margin,Margin,State Total,State Total
Unnamed: 0_level_1,State,EV,#,%,EV,#,%,EV,#,%,...,#,%,EV,#,%,EV,#,%,#,Unnamed: 26_level_1
0,Alabama,9,1176394,62.46%,9,693933,36.84%,0,6701,0.36%,...,0,0.00%,0,898,0.05%,0,482461,25.62%,1883449,AL
1,Alaska,3,190889,61.07%,3,111025,35.52%,0,5069,1.62%,...,1058,0.34%,0,790,0.25%,0,79864,25.55%,312598,AK
2,Arizona,10,1104294,54.87%,10,893524,44.40%,0,2773,0.14%,...,138,0.01%,0,0,0.00%,0,210770,10.47%,2012585,AZ
3,Arkansas,6,572898,54.31%,6,469953,44.55%,0,6171,0.58%,...,1488,0.14%,0,0,0.00%,0,102945,9.76%,1054945,AR
4,California,55,5509826,44.36%,0,6745485,54.31%,55,20714,0.17%,...,40771,0.33%,0,27747,0.22%,0,"−1,235,659",−9.95%,12421353,CA
5,Colorado,9,1101255,51.69%,9,1001732,47.02%,0,12718,0.60%,...,1591,0.07%,0,2808,0.13%,0,99523,4.67%,2130330,CO
6,Connecticut,7,693826,43.95%,0,857488,54.31%,7,12969,0.82%,...,9564,0.61%,0,12,0.00%,0,"−163,662",−10.37%,1578769,CT
7,Delaware,3,171660,45.75%,0,200152,53.35%,3,2153,0.57%,...,250,0.07%,0,100,0.03%,0,"−28,492",−7.59%,375190,DE
8,District of Columbia,3,21256,9.34%,0,202970,89.18%,3,1485,0.65%,...,737,0.32%,0,636,0.28%,0,"−181,714",−79.84%,227586,DC
9,Florida,27,3964522,52.10%,27,3583544,47.09%,0,32971,0.43%,...,3917,0.05%,0,6234,0.08%,0,380978,5.01%,7609810,FL


In [6]:
# Keep only the state name plus the Democratic, Republican and margin
# percentages, converting the '12.34%' strings to floats.
# The last row of the raw table is left out of every column.
# The margin loses its U+2212 minus sign here, so it is stored unsigned.
pres_2004_df_dr = pd.DataFrame(dict(
    state = pres_2004_df_raw[('Unnamed: 0_level_0', 'State')].iloc[:-1],
    dem_percentage = (
        pres_2004_df_raw[('John KerryDemocratic', '%')]
        .iloc[:-1]
        .str.strip('%')
        .astype(float)
    ),
    gop_percentage = (
        pres_2004_df_raw[('George W. BushRepublican', '%')]
        .iloc[:-1]
        .str.strip('%')
        .astype(float)
    ),
    margin_percentage = (
        pres_2004_df_raw[('Margin', '%')]
        .iloc[:-1]
        .str.strip('%')
        .str.replace('−', '')
        .astype(float)
    ),
))
pres_2004_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,Alabama,36.84,62.46,25.62
1,Alaska,35.52,61.07,25.55
2,Arizona,44.4,54.87,10.47
3,Arkansas,44.55,54.31,9.76
4,California,54.31,44.36,9.95
5,Colorado,47.02,51.69,4.67
6,Connecticut,54.31,43.95,10.37
7,Delaware,53.35,45.75,7.59
8,District of Columbia,89.18,9.34,79.84
9,Florida,47.09,52.1,5.01


In [7]:
# Remove the congressional-district rows (ME-1, ME-2, NE-1, NE-2, NE-3).
# Assumption: whoever wins the most votes statewide wins the entire state.
# Rows are kept when their 'state' value is not one of the district labels,
# then the index is renumbered from 0.
pres_2004_df_dr = pres_2004_df_dr[
    ~pres_2004_df_dr['state'].isin(['Maine-1', 'Maine-2', 'Nebraska-1', 'Nebraska-2', 'Nebraska-3'])
].reset_index(drop = True)
pres_2004_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,Alabama,36.84,62.46,25.62
1,Alaska,35.52,61.07,25.55
2,Arizona,44.4,54.87,10.47
3,Arkansas,44.55,54.31,9.76
4,California,54.31,44.36,9.95
5,Colorado,47.02,51.69,4.67
6,Connecticut,54.31,43.95,10.37
7,Delaware,53.35,45.75,7.59
8,District of Columbia,89.18,9.34,79.84
9,Florida,47.09,52.1,5.01


In [8]:
# find the state abbreviations
# Fail fast on network problems: bound the wait and surface HTTP errors.
state_res = requests.get('https://www.ssa.gov/international/coc-docs/states.html', timeout = 30)
state_res.raise_for_status()
state_soup = BeautifulSoup(state_res.content, 'lxml')

# The table of interest is the first <table> after this heading text; either
# lookup returns None if the page layout changes, so check before chaining.
abbreviation_heading = state_soup.find(string = 'Two-Letter State Abbreviations')
state_html = None
if abbreviation_heading is not None:
    state_html = abbreviation_heading.find_next('table')
if state_html is None:
    raise ValueError("could not locate the 'Two-Letter State Abbreviations' table on the SSA page")

# read_html expects a file-like object for literal HTML, hence StringIO.
state_table = pd.read_html(StringIO(str(state_html)))[0]

# Column 0 holds the name and column 1 the two-letter code. Drop the listed
# territories and keep only the codes, renumbered from 0.
is_territory = state_table[0].isin(['AMERICAN SAMOA', 'GUAM', 'NORTHERN MARIANA IS', 'PUERTO RICO', 'VIRGIN ISLANDS'])
state = state_table.loc[~is_territory, 1].reset_index(drop = True)
state

0     AL
1     AK
2     AZ
3     AR
4     CA
5     CO
6     CT
7     DE
8     DC
9     FL
10    GA
11    HI
12    ID
13    IL
14    IN
15    IA
16    KS
17    KY
18    LA
19    ME
20    MD
21    MA
22    MI
23    MN
24    MS
25    MO
26    MT
27    NE
28    NV
29    NH
30    NJ
31    NM
32    NY
33    NC
34    ND
35    OH
36    OK
37    OR
38    PA
39    RI
40    SC
41    SD
42    TN
43    TX
44    UT
45    VT
46    VA
47    WA
48    WV
49    WI
50    WY
Name: 1, dtype: object

In [9]:
# standardise the state abbreviations
# The abbreviations are matched to the result rows purely by position (row i
# of `state` overwrites row i of the results), not by state name. If the two
# have different lengths the rows would be silently mislabelled or filled
# with NaN, so refuse to continue in that case.
if len(state) != len(pres_2004_df_dr):
    raise ValueError(
        'cannot assign abbreviations by position: '
        + str(len(state)) + ' abbreviations for '
        + str(len(pres_2004_df_dr)) + ' result rows'
    )
# to_numpy() makes the positional assignment explicit (no index alignment).
pres_2004_df_dr['state'] = state.to_numpy()
pres_2004_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,margin_percentage
0,AL,36.84,62.46,25.62
1,AK,35.52,61.07,25.55
2,AZ,44.4,54.87,10.47
3,AR,44.55,54.31,9.76
4,CA,54.31,44.36,9.95
5,CO,47.02,51.69,4.67
6,CT,54.31,43.95,10.37
7,DE,53.35,45.75,7.59
8,DC,89.18,9.34,79.84
9,FL,47.09,52.1,5.01


In [10]:
# create the winner column: 'D' when the Democratic percentage is larger,
# 'R' when the Republican percentage is larger
dem_share = pres_2004_df_dr['dem_percentage']
gop_share = pres_2004_df_dr['gop_percentage']

# A tie or a missing percentage has no winner. Report the affected rows
# explicitly instead of producing a list that is shorter than the frame.
undecided = ~((dem_share > gop_share) | (gop_share > dem_share))
if undecided.any():
    raise ValueError(
        'cannot pick a winner (tie or missing percentage) for: '
        + ', '.join(pres_2004_df_dr.loc[undecided, 'state'].astype(str))
    )

dr = ['D' if dem > gop else 'R' for dem, gop in zip(dem_share, gop_share)]

# insert() raises if the column already exists, which made this cell fail
# when run a second time; overwrite in place on a re-run instead.
if 'winner' in pres_2004_df_dr.columns:
    pres_2004_df_dr['winner'] = dr
else:
    pres_2004_df_dr.insert(loc = 3, column = 'winner', value = dr)
pres_2004_df_dr

Unnamed: 0,state,dem_percentage,gop_percentage,winner,margin_percentage
0,AL,36.84,62.46,R,25.62
1,AK,35.52,61.07,R,25.55
2,AZ,44.4,54.87,R,10.47
3,AR,44.55,54.31,R,9.76
4,CA,54.31,44.36,D,9.95
5,CO,47.02,51.69,R,4.67
6,CT,54.31,43.95,D,10.37
7,DE,53.35,45.75,D,7.59
8,DC,89.18,9.34,D,79.84
9,FL,47.09,52.1,R,5.01


In [11]:
# negative represents democratic, positive represents republican
# 'margin_percentage' is read as-is for 'R' (and 'E') rows and negated for
# 'D' rows.
winner_labels = pres_2004_df_dr['winner']

# Only 'D', 'E' and 'R' are recognised. Any other label (including an empty
# string or a missing value) is reported explicitly rather than skipped.
# NOTE(review): no cell visible here produces 'E'; it is kept because the
# original check accepted it - presumably a tie marker, confirm.
unexpected = ~winner_labels.isin(['D', 'E', 'R'])
if unexpected.any():
    raise ValueError(
        'unexpected winner label(s): '
        + ', '.join(sorted(set(map(str, winner_labels[unexpected]))))
    )

party_margin = [
    -margin if label == 'D' else margin
    for label, margin in zip(winner_labels, pres_2004_df_dr['margin_percentage'])
]

# results_party_margin is another name for the same DataFrame object (not a
# copy), so the new column also appears on pres_2004_df_dr.
results_party_margin = pres_2004_df_dr
results_party_margin['2004_results_margin'] = party_margin
results_party_margin

Unnamed: 0,state,dem_percentage,gop_percentage,winner,margin_percentage,2004_results_margin
0,AL,36.84,62.46,R,25.62,25.62
1,AK,35.52,61.07,R,25.55,25.55
2,AZ,44.4,54.87,R,10.47,10.47
3,AR,44.55,54.31,R,9.76,9.76
4,CA,54.31,44.36,D,9.95,-9.95
5,CO,47.02,51.69,R,4.67,4.67
6,CT,54.31,43.95,D,10.37,-10.37
7,DE,53.35,45.75,D,7.59,-7.59
8,DC,89.18,9.34,D,79.84,-79.84
9,FL,47.09,52.1,R,5.01,5.01


In [12]:
# write the final table to disk as CSV, leaving out the row index
results_party_margin.to_csv(
    path_or_buf = '../../data/processed/2004_presidential_election.csv',
    index = False,
)