In [1]:
import urllib
import urllib.request
from collections import OrderedDict
from math import ceil, floor
from operator import itemgetter

import pandas as pd
from bs4 import BeautifulSoup

## Scrape Wikipedia Page

In [2]:
# Postal abbreviations of the 50 states plus DC, followed by 'US' for the
# nationwide total.
states = ['AL','AK','AZ','AR','CA','CO','CT','DC','DE','FL',
          'GA','HI','ID','IL','IN','IA','KS','KY','LA','ME',
          'MD','MA','MI','MN','MS','MO','MT','NE','NV','NH',
          'NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI',
          'SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY', 'US']

# Electoral votes in the same order as `states`; the nationwide total is
# appended so that it lines up with the trailing 'US' entry.
ec_votes = [9,  3,  8,  6,  54, 8,  8,  3,  3,  25,
            13, 4,  4,  22, 12, 7,  6,  8,  9,  4,
            10, 12, 18, 10, 7,  11, 3,  5,  4,  4,
            15, 5,  33, 14, 3,  21, 8,  7,  23, 4,
            8,  3,  11, 32, 5,  3,  13, 11, 5,  11, 3]
ec_votes += [sum(ec_votes)]

# Abbreviation -> electoral votes, preserving the order of `states`.
ec_votes_dict = OrderedDict(zip(states, ec_votes))

In [3]:
# Sanity check: the 'US' entry holds the sum of all per-state electoral votes.
ec_votes_dict['US']

538

In [4]:
# Download the Wikipedia article on the 2000 presidential election and parse
# its HTML.
url = 'https://en.wikipedia.org/wiki/United_States_presidential_election,_2000'
# The context manager closes the HTTP response once its body has been read,
# instead of leaving the connection open until garbage collection.
with urllib.request.urlopen(url) as response:
    page = BeautifulSoup(response.read(), "html.parser")

In [5]:
def clean(count_text):
    """Reduce the text of a vote-count table cell to a bare digit string.

    Leading and trailing whitespace is stripped, then the substitutions
    below are applied one after another, in the listed order: thousands
    separators and periods are removed, a hyphen or 'N/A' becomes '0', and
    the star and en-dash markers are dropped.

    Returns the resulting string, which may be empty.
    """
    substitutions = (
        (',', ''),
        ('.', ''),
        ('-', '0'),
        ('N/A', '0'),
        ('★', ''),
        ('–', ''),
    )
    cleaned = count_text.strip()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

# Locate the element that contains the "Results_by_state" anchor, then take
# the table inside the next sibling <div>.
header = page.find('span', id='Results_by_state').parent
table = header.find_next_sibling('div').table
head_row = table.find('tr')
# Collect one tuple per header cell that spans three columns.
headings = []
for cell in head_row.find_all('th', attrs={'colspan': '3'}):
    children = list(cell.children)
    if len(children) == 3:
        # Three child nodes: keep the first and the last and drop the middle
        # one (presumably a line break between candidate and party -- TODO
        # confirm against the live page).
        headings.append((children[0].strip(), children[2].strip()))
    else:
        headings.append((children[0], None))
headings.append(('Total', None))
# Maps each abbreviation to its list of vote counts.
data = OrderedDict()
all_rows = list(table.find_all('tr'))
# Cell indices read from every row: every third cell from 2 through 23
# (presumably the raw vote count of each candidate column group -- verify
# against the live table), plus the total column appended below.
columns_with_vote_counts = list(range(2,24,3))
total_column = 28
abbr_column = total_column + 1
columns_with_vote_counts.append(total_column)
# Body rows: the first two rows are skipped and the last row is handled
# separately below.
for row in all_rows[2:-1]:
    cells = list(row.find_all('td'))
    abbr = cells[abbr_column].string.strip()
    # Only rows whose abbreviation text is 2 or 6 characters long are kept,
    # keyed by its first two characters.
    # NOTE(review): a later row sharing the same two-letter prefix overwrites
    # the earlier entry.
    if len(abbr)==2 or len(abbr)==6:
        abbr = abbr[:2]
        data[abbr] = [clean(cells[i].text) for i in columns_with_vote_counts]
# The last row is read from <th> cells rather than <td> cells.
cells = all_rows[-1].find_all('th')
abbr = cells[abbr_column].string.strip()
data[abbr] = [clean(cells[i].text) for i in columns_with_vote_counts]
# Convert the cleaned strings to integers, treating empty strings as 0.
for district, votes in data.items():
    data[district] = [int(i) if len(i) else 0 for i in votes]

In [6]:
# Abbreviate every column heading to the initials of its words.
abbreviations = [
    ''.join(word[0] for word in heading[0].split())
    for heading in headings
]

# One row per abbreviation in `data`, one column per heading.
vote_data = pd.DataFrame.from_dict(data, orient='index')
vote_data.columns = abbreviations

# Attach the electoral-vote counts as column 'E'; assignment aligns on the index.
electoral_votes = pd.DataFrame.from_dict(ec_votes_dict, orient='index')
vote_data['E'] = electoral_votes[0]
print(vote_data)

         GWB        AG       RN      PB      HB     HP     JH      O  \
AL    941173    692611    18323    6351    5893    775    447    699   
AK    167398     79004    28747    5192    2636    596    919   1068   
AZ    781652    685341    45645   12373       0    110   1120   5775   
AR    472940    422768    13421    7358    2781   1415   1098      0   
CA   4567429   5861203   418707   44987   45520  17042  10934     34   
CO    883748    738227    91434   10465   12799   1319   2240   1136   
CT    561094    816015    64452    4731    3484   9695     40     14   
DE    137288    180068     8307     777     774    208    107     93   
DC     18073    171923    10576       0     669      0      0    653   
FL   2912790   2912253    97488   17484   16415   1371   2281   3028   
GA   1419720   1116230    13432   10926   36332    140      0     24   
HI    137845    205286    21623    1071    1477    343    306      0   
ID    336937    138637    12292    7615    3488   1469   1177   

## Export data

In [7]:
# Write the vote table, including its index, to a comma-separated file.
filename = 'state-vote-data-2000.csv'
vote_data.to_csv(path_or_buf=filename, sep=',')

In [8]:
# Split each state's electors among the candidates in proportion to the
# popular vote, using the largest-remainder method: every candidate first
# receives the floor of their proportional share, then the electors still
# unassigned go, one each, to the candidates with the most votes left over.
# (ceil, floor and itemgetter are imported in the notebook's first cell.)
ec_votes = OrderedDict()
wasted = OrderedDict()
# Only the first 51 rows are processed (presumably the 50 states plus DC,
# with the nationwide 'US' row after them -- verify if the table changes).
for i in range(51):
    st = vote_data.iloc[i]
    c = OrderedDict(st.loc['GWB':'O'])  # popular votes per candidate column
    e = OrderedDict()                   # electors awarded per candidate
    r = {}                              # popular votes not converted into electors
    E = st.loc['E']                     # electors available in this state
    V = st.loc['T']                     # total popular votes in this state
    if V == 0:
        # No total reported: use the sum of every candidate column that is
        # allocated below, so that the shares cannot add up to more than 1.
        V = sum(c.values())
    for candidate, pop_votes in c.items():
        e[candidate] = floor(E * pop_votes / V)
        # Votes beyond those needed for the electors already awarded.
        r[candidate] = ceil(pop_votes - V * e[candidate] / E)
    r = OrderedDict(sorted(r.items(), key=itemgetter(1), reverse=True))
    remainder = E - sum(e.values())
    for candidate in r.keys():
        # Stop before awarding anything once no electors are left; this also
        # covers the case where the floored shares already add up to E.
        if remainder <= 0:
            break
        if candidate != 'O': # not mappable to a single candidate
            e[candidate] += 1
            remainder -= 1
            r[candidate] = 0
    ec_votes[st.name] = e
    wasted[st.name] = r
ec_votes = pd.DataFrame.from_dict(ec_votes, orient='index')
wasted = pd.DataFrame.from_dict(wasted, orient='index')
print('Electoral College Votes by State:\n{}\n'.format(ec_votes))
print('Wasted Popular Votes by State:\n{}\n'.format(wasted))
print('Electoral College Tally:\n{}\n'.format(ec_votes.sum()))
print('Wasted popular votes:\n{}'.format(wasted.sum()))

Electoral College Votes by State:
    GWB  AG  RN  PB  HB  HP  JH  O
AK    2   1   0   0   0   0   0  0
AL    5   4   0   0   0   0   0  0
AR    3   3   0   0   0   0   0  0
AZ    4   4   0   0   0   0   0  0
CA   23  29   2   0   0   0   0  0
CO    4   3   1   0   0   0   0  0
CT    3   5   0   0   0   0   0  0
DC    0   3   0   0   0   0   0  0
DE    1   2   0   0   0   0   0  0
FL   12  12   1   0   0   0   0  0
GA    7   6   0   0   0   0   0  0
HI    2   2   0   0   0   0   0  0
IA    3   4   0   0   0   0   0  0
ID    3   1   0   0   0   0   0  0
IL    9  12   1   0   0   0   0  0
IN    7   5   0   0   0   0   0  0
KS    4   2   0   0   0   0   0  0
KY    5   3   0   0   0   0   0  0
LA    5   4   0   0   0   0   0  0
MA    4   7   1   0   0   0   0  0
MD    4   6   0   0   0   0   0  0
ME    2   2   0   0   0   0   0  0
MI    8   9   1   0   0   0   0  0
MN    5   5   0   0   0   0   0  0
MO    6   5   0   0   0   0   0  0
MS    4   3   0   0   0   0   0  0
MT    2   1   0   0  