In [1]:
import urllib
import urllib.request
from collections import OrderedDict
from math import ceil, floor
from operator import itemgetter

import pandas as pd
from bs4 import BeautifulSoup

## Scrape Wikipedia Page

In [2]:
# Two-letter abbreviations for the 50 states and DC, in the order used by the
# electoral-vote lists; the trailing 'US' entry labels the nationwide row.
states = (
    'AL AK AZ AR CA CO CT DC DE FL '
    'GA HI ID IL IN IA KS KY LA ME '
    'MD MA MI MN MS MO MT NE NV NH '
    'NJ NM NY NC ND OH OK OR PA RI '
    'SC SD TN TX UT VT VA WA WV WI WY US'
).split()

def append_sum(ec):
    """Append the sum of the current elements of ``ec`` to ``ec``, in place.

    Returns None. Calling it a second time on the same list appends another
    total that already includes the first one.
    """
    total = sum(ec)
    ec.append(total)

# Electoral votes per entry of `states` (same order) as used for the 2000
# election; append_sum adds the total as the final element.
ec_votes_2000 = [
    9,  3,  8,  6,  54, 8,  8,  3,  3,  25,
    13, 4,  4,  22, 12, 7,  6,  8,  9,  4,
    10, 12, 18, 10, 7,  11, 3,  5,  4,  4,
    15, 5,  33, 14, 3,  21, 8,  7,  23, 4,
    8,  3,  11, 32, 5,  3,  13, 11, 5,  11, 3,
]
append_sum(ec_votes_2000)

# Same layout for the 2012 election.
ec_votes_2012 = [
    9,  3,  11, 6,  55, 9,  7,  3,  3,  29,
    16, 4,  4,  20, 11, 6,  6,  8,  8,  4,
    10, 11, 16, 10, 6,  10, 3,  5,  6,  4,
    14, 5,  29, 15, 3,  18, 7,  7,  20, 4,
    9,  3,  11, 38, 6,  3,  13, 12, 5,  10, 3,
]
append_sum(ec_votes_2012)

# Map each abbreviation to its (2000, 2012) pair, preserving the order of `states`.
ec_votes_dict = OrderedDict(zip(states, zip(ec_votes_2000, ec_votes_2012)))

# One row per abbreviation; the tuple positions 0 and 1 become the year columns.
ec_votes = pd.DataFrame.from_dict(ec_votes_dict, orient='index')
ec_votes = ec_votes.rename(columns={0: '2000', 1: '2012'})
# The 2016 column is a copy of the 2012 allocation.
ec_votes['2016'] = ec_votes['2012']

In [3]:
# Spot-check the end of the table; the final 'US' row holds the per-year totals.
ec_votes.tail()

Unnamed: 0,2000,2012,2016
WA,11,12,12
WV,5,5,5
WI,11,10,10
WY,3,3,3
US,538,538,538


In [4]:
def clean(count_text):
    """Normalise the text of a vote-count cell so it can be parsed as an integer.

    Surrounding whitespace is stripped first, then these substitutions are
    applied in order: commas and periods are removed, every '-' and every
    'N/A' becomes '0', and '★' and '–' (en dash) are removed.

    Returns the resulting string, which may be empty.
    """
    substitutions = (
        (',', ''),
        ('.', ''),
        ('-', '0'),
        ('N/A', '0'),
        ('★', ''),
        ('–', ''),
    )
    text = count_text.strip()
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

In [5]:
def read_in_data(year, num_candidates, total_column):
    """Scrape the "Results by state" table of one election from Wikipedia.

    Parameters
    ----------
    year : str
        Election year, e.g. '2000'. It is concatenated onto the page URL and
        is also the column looked up in the module-level ``ec_votes`` frame.
    num_candidates : int
        Number of leading candidate headers (``th`` cells with ``colspan=3``)
        to keep. Their vote counts are read from ``td`` indices
        2, 5, 8, ... (``range(2, 3 * num_candidates, 3)``).
    total_column : int
        Index, within a row's ``td`` cells, of the total-votes cell. The cell
        immediately after it (``total_column + 1``) is read as the state
        abbreviation.

    Returns
    -------
    pandas.DataFrame
        Indexed by abbreviation, with one integer column per kept candidate
        (labelled by the initials of the heading text), a 'T' column for the
        total, and an 'E' column taken from ``ec_votes[year]``.

    Notes
    -----
    Performs a live HTTP request and depends on the page layout; a missing
    anchor or table surfaces as an ``AttributeError`` from the lookups below.
    """
    url = 'https://en.wikipedia.org/wiki/United_States_presidential_election,_' + year
    # Use a context manager so the HTTP response is closed once it has been read.
    with urllib.request.urlopen(url) as response:
        page = BeautifulSoup(response.read(), "html.parser")

    # The table is located relative to the section anchor: the parent of the
    # 'Results_by_state' span, then the table inside the next sibling <div>.
    header = page.find('span', id='Results_by_state').parent
    table = header.find_next_sibling('div').table

    # Candidate headings are the colspan=3 header cells of the first row.
    head_row = table.find('tr')
    headings = []
    for cell in head_row.find_all('th', attrs={'colspan': '3'}):
        children = list(cell.children)
        if len(children) == 3:
            # NOTE(review): presumably "name <br> party" -- confirm against the page.
            headings.append((children[0].strip(), children[2].strip()))
        else:
            headings.append((children[0], None))
    headings = headings[:num_candidates]
    headings.append(('Total', None))

    data = OrderedDict()
    all_rows = list(table.find_all('tr'))
    columns_with_vote_counts = list(range(2, 3*num_candidates, 3))
    abbr_column = total_column + 1
    columns_with_vote_counts.append(total_column)

    # Skip the first two rows and treat the last row separately (its cells are <th>).
    for row in all_rows[2:-1]:
        cells = list(row.find_all('td'))
        abbr = cells[abbr_column].string.strip()
        # Keep only rows whose label has 2 or 6 characters; 6-character labels
        # are truncated to their first two, so such a row replaces any earlier
        # entry stored under the same two letters.
        # NOTE(review): presumably 6-character labels carry a suffix on the
        # state code -- confirm against the table.
        if len(abbr)==2 or len(abbr)==6:
            abbr = abbr[:2]
            data[abbr] = [clean(cells[i].text) for i in columns_with_vote_counts]

    # The last row is read from <th> cells using the same column indices.
    cells = all_rows[-1].find_all('th')
    abbr = cells[abbr_column].string.strip()
    data[abbr] = [clean(cells[i].text) for i in columns_with_vote_counts]

    # Cleaned strings -> integers; an empty string counts as zero.
    for district, votes in data.items():
        data[district] = [int(i) if len(i) else 0 for i in votes]

    # Column labels are the initials of each heading, e.g. 'Total' -> 'T'.
    abbreviations = [''.join(s[0] for s in h[0].split()) for h in headings]

    vote_data = pd.DataFrame.from_dict(data, orient='index')
    vote_data.columns=abbreviations
    # Aligned on the index, so each abbreviation gets its electoral-vote count.
    vote_data['E'] = ec_votes[year]
    return vote_data

In [6]:
# 2000 election: keep the first 8 candidate columns; the total-votes cell is td index 28.
vote_data_2000 = read_in_data('2000', 8, 28)

In [7]:
# 2012 election: keep the first 5 candidate columns; the total-votes cell is td index 19.
vote_data_2012 = read_in_data('2012', 5, 19)

In [8]:
# 2016 election: keep the first 6 candidate columns; the total-votes cell is td index 20.
vote_data_2016 = read_in_data('2016', 6, 20)

In [9]:
# Preview the last rows of the 2000 table; 'T' is the total-vote column and 'E' the electoral votes.
vote_data_2000.tail()

Unnamed: 0,GWB,AG,RN,PB,HB,HP,JH,O,T,E
WA,1108864,1247652,103002,7171,13135,1989,2927,2693,2487433,11
WV,336475,295497,10680,3169,1912,23,367,1,648124,5
WI,1237279,1242987,94070,11471,6640,2042,853,3265,2598607,11
WY,147947,60481,4625,2724,1443,720,411,0,218351,3
US,50456002,50999897,2882955,448895,384431,98020,83714,51186,105405100,538


In [10]:
# Preview the last rows of the 2012 table; 'T' is the total-vote column and 'E' the electoral votes.
vote_data_2012.tail()

Unnamed: 0,BO,MR,GJ,JS,O,T,E
WA,1755396,1290670,42202,20928,16320,3125516,12
WV,238269,417655,6302,4406,3806,670438,5
WI,1620985,1407966,20439,7665,11379,3068434,10
WY,69286,170962,5326,0,3487,249061,3
US,65915795,60933504,1275971,469627,490512,129085409,538


In [11]:
# Preview the last rows of the 2016 table; 'T' is the total-vote column and 'E' the electoral votes.
vote_data_2016.tail()

Unnamed: 0,HC,DT,GJ,JS,EM,O,T,E
WA,1742718,1221747,160879,58417,0,25453,3209214,12
WV,188794,489371,23004,8075,0,3807,713051,5
WI,1382536,1405284,106674,31072,11855,38729,2976150,10
WY,55973,174419,13287,2515,0,9655,255849,3
US,65853625,62985105,4489233,1457222,728860,1579672,137098443,538


## Export data and simulate proportional electoral-vote allocation

In [None]:
# Write the scraped 2000 results to CSV. The original referenced an undefined
# `vote_data`; the file name indicates the 2000 frame is the one intended.
filename = 'state-vote-data-2000.csv'
vote_data_2000.to_csv(filename, sep=',')

In [None]:
def _allocate_largest_remainder(electors, total_votes, candidate_votes):
    """Split one state's electors among candidates in proportion to their votes.

    Each candidate first receives ``floor(electors * votes / total_votes)``
    electors. The votes not "used up" by those electors (at
    ``total_votes / electors`` votes per elector, rounded up) are the
    candidate's leftover. Electors still unassigned are then handed out one
    at a time in descending order of leftover, skipping the 'O' column because
    it cannot be mapped to a single candidate.

    Parameters
    ----------
    electors : int
        Number of electors the state has to distribute.
    total_votes : int
        Total popular votes cast in the state.
    candidate_votes : mapping
        Candidate label -> popular votes, in column order.

    Returns
    -------
    (seats, leftovers) : tuple of OrderedDict
        ``seats`` keeps the order of ``candidate_votes``; ``leftovers`` is
        ordered by descending leftover, with the leftover reset to 0 for every
        candidate that received one of the extra electors.
    """
    seats = OrderedDict()
    leftovers = {}
    for candidate, pop_votes in candidate_votes.items():
        seats[candidate] = floor(electors * pop_votes / total_votes)
        leftovers[candidate] = ceil(pop_votes - total_votes * seats[candidate] / electors)
    leftovers = OrderedDict(sorted(leftovers.items(), key=itemgetter(1), reverse=True))

    remainder = electors - sum(seats.values())
    for candidate in leftovers:
        # Check before assigning: if nothing is left after the floor step, no
        # candidate may receive an extra elector.
        if remainder <= 0:
            break
        if candidate == 'O': # not mappable to a single candidate
            continue
        seats[candidate] += 1
        remainder -= 1
        leftovers[candidate] = 0
    return seats, leftovers


# NOTE(review): this cell rebinds the module-level `ec_votes` that
# read_in_data reads for its 'E' column; re-run the table-construction cell
# before calling read_in_data again.
ec_votes = OrderedDict()
wasted = OrderedDict()
# Every row except the nationwide 'US' totals, selected by label instead of
# assuming exactly 51 leading rows.
for state, st in vote_data_2000.drop(index='US', errors='ignore').iterrows():
    c = OrderedDict(st.loc['GWB':'O'])
    E = st.loc['E']
    V = st.loc['T']
    if V == 0:
        # No total reported: fall back to the sum over all candidate columns.
        V = sum(c.values())
    ec_votes[state], wasted[state] = _allocate_largest_remainder(E, V, c)
ec_votes = pd.DataFrame.from_dict(ec_votes, orient='index')
wasted = pd.DataFrame.from_dict(wasted, orient='index')
print('Electoral College Votes by State:\n{}\n'.format(ec_votes))
print('Wasted Popular Votes by State:\n{}\n'.format(wasted))
print('Electoral College Tally:\n{}\n'.format(ec_votes.sum()))
print('Wasted popular votes:\n{}'.format(wasted.sum()))