In [1]:
from bs4 import BeautifulSoup
import urllib.request
import re

In [2]:
import pandas as pd

In [3]:
# Download the election-night results page and decode it to text.
# The `with` block closes the HTTP response even when read() raises, which a
# trailing fp.close() call cannot guarantee.
with urllib.request.urlopen("http://www.elections.ny.gov/ENR/NYSENRAccessible.html") as fp:
    mybytes = fp.read()

# Strict UTF-8 decoding: bytes that are not valid UTF-8 raise UnicodeDecodeError.
mystr = mybytes.decode("utf8")

In [4]:
# Sanity check: number of characters in the decoded page text.
len(mystr)

962853

In [5]:
# Parse the downloaded markup with Python's built-in HTML parser backend.
soup = BeautifulSoup(mystr, features='html.parser')

In [6]:
def _heading_record(h):
    """Collect the content that follows one <h4> heading.

    Returns a dict with:
      "h4"    - the heading's stripped text
      "p"     - stripped text of the next <p> element after the heading
      "table" - stripped text of the next <table> element after the heading
      "rows"  - one list per <tr> of that table, holding the stripped text of
                each <td> cell (a row with no <td> cells yields an empty list)

    Raises AttributeError if no <p> or <table> follows the heading, because
    find_next() returns None in that case.
    """
    # Look the table up once and reuse it for both the flat text and the rows.
    table = h.find_next("table")
    return {
        "h4": h.text.strip(),
        "p": h.find_next("p").text.strip(),
        "table": table.text.strip(),
        "rows": [[td.text.strip() for td in tr.find_all("td")] for tr in table.find_all("tr")],
    }


# One record per <h4> heading on the page, in document order.
info = [_heading_record(h) for h in soup.find_all("h4")]

In [7]:
# Distinct h4 heading texts collected from the page.
{record["h4"] for record in info}

{'Attorney General\xa0DEM Primary',
 'Attorney General\xa0REF Primary',
 'Governor\xa0DEM Primary',
 'Lt. Governor\xa0DEM Primary',
 'State Assembly\xa0CON Primary > 118th Assembly District',
 'State Assembly\xa0CON Primary > 142nd Assembly District',
 'State Assembly\xa0CON Primary > 62nd Assembly District',
 'State Assembly\xa0DEM Primary > 104th Assembly District',
 'State Assembly\xa0DEM Primary > 107th Assembly District',
 'State Assembly\xa0DEM Primary > 121st Assembly District',
 'State Assembly\xa0DEM Primary > 124th Assembly District',
 'State Assembly\xa0DEM Primary > 136th Assembly District',
 'State Assembly\xa0DEM Primary > 137th Assembly District',
 'State Assembly\xa0DEM Primary > 18th Assembly District',
 'State Assembly\xa0DEM Primary > 20th Assembly District',
 'State Assembly\xa0DEM Primary > 30th Assembly District',
 'State Assembly\xa0DEM Primary > 33rd Assembly District',
 'State Assembly\xa0DEM Primary > 39th Assembly District',
 'State Assembly\xa0DEM Primary > 

In [55]:
# Keep only the records whose heading is exactly this string; note that the
# heading contains a non-breaking space (\xa0) rather than a plain space.
ag_info = [
    record
    for record in info
    if record["h4"] == "Attorney General\xa0DEM Primary"
]

In [56]:
# Flatten to one list per table row: the record's heading, paragraph and table
# text followed by that row's cells. The first row of every table is skipped.
prep_ag_info = [
    [record["h4"], record["p"], record["table"], *cells]
    for record in ag_info
    for cells in record["rows"][1:]
]

In [57]:
# Tabulate the flattened rows: three record-level columns followed by the four
# cell values taken from each table row.
ag_df = pd.DataFrame(
    data=prep_ag_info,
    columns=["h4", "p", "table", "candidate", "party", "ttl", "pcnt"],
)

In [58]:
# Extract (registered-voter count, county name) from each summary paragraph.
# Carriage returns and newlines are removed first so the pattern can match text
# that was split across two lines. The pattern is a raw string so that `\d`
# reaches the regex engine as written; in a normal string literal it is an
# invalid escape sequence that newer Python versions warn about.
# The result is a Series whose values are lists of (digits, county) tuples.
reg_ttl_and_county = (ag_df
    .p
    .str.replace("\r", "")
    .str.replace("\n", "")
    .str.findall(r".*Active Registered Voters (\d*).*Results for (.*) (?:(?:county)|(?:Counties))", re.MULTILINE))

In [59]:
# Sample summary-paragraph text (the same string this notebook displays as the
# `p` value of the New York county row), kept verbatim with its embedded "\r\n"
# so the regular expression can be tried on it by hand.
test = 'Vote for  1                Active Registered Voters 661867    \r\n    Results for New York county 1298 of 1298 precincts reporting'

In [60]:
# Remove carriage returns, then newlines, mirroring the two .str.replace calls
# applied to the `p` column, so the sample sits on a single line.
without_cr = test.replace("\r", "")
test2 = without_cr.replace("\n", "")

In [61]:
# Display the flattened sample to confirm the line break is gone.
test2

'Vote for  1                Active Registered Voters 661867        Results for New York county 1298 of 1298 precincts reporting'

In [62]:
# Try the extraction pattern on the flattened sample; a match yields a list
# with one (registered-voter digits, county name) tuple. The pattern is a raw
# string so `\d` is not treated as an (invalid) string escape by Python.
re.findall(r".*Active Registered Voters (\d*).*Results for (.*) (?:(?:county)|(?:Counties))", test2, re.MULTILINE)

[('661867', 'New York')]

In [63]:
# Each findall result is a list of (voters, county) tuples; take the first
# tuple of every list and split it into two new columns.
first_match = reg_ttl_and_county.map(lambda matches: matches[0])
ag_df["ttl_reg_voters"] = first_match.map(lambda pair: pair[0])
ag_df["county"] = first_match.map(lambda pair: pair[1])
# Vote totals were scraped as text; cast them to integers so they sort and sum numerically.
ag_df["ttl"] = ag_df["ttl"].astype(int)

In [64]:
# Among rows whose county is not "All", order by vote total (largest first)
# and show the full summary paragraph of the second row.
county_rows = ag_df[ag_df.county != "All"]
county_rows.sort_values("ttl", ascending=False).p.iloc[1]

'Vote for  1                Active Registered Voters 661867    \r\n    Results for New York county 1298 of 1298 precincts reporting'

In [65]:
# Same ordering as the paragraph lookup, but show every column of the second row.
county_rows = ag_df[ag_df.county != "All"]
county_rows.sort_values("ttl", ascending=False).iloc[1]

h4                                     Attorney General DEM Primary
p                 Vote for  1                Active Registered V...
table             Candidate Name\nParty\nTotal Votes\nVotes Perc...
candidate                                           Zephyr Teachout
party                                                           DEM
ttl                                                           99762
pcnt                                                          42.27
ttl_reg_voters                                               661867
county                                                     New York
Name: 220, dtype: object

In [66]:
# Preview the first seven rows of the frame.
ag_df.head(7)

Unnamed: 0,h4,p,table,candidate,party,ttl,pcnt,ttl_reg_voters,county
0,Attorney General DEM Primary,Vote for 1 Active Registered V...,Candidate Name\nParty\nTotal Votes\nVotes Perc...,Sean Patrick Maloney,DEM,357462,23.85,5621811,All
1,Attorney General DEM Primary,Vote for 1 Active Registered V...,Candidate Name\nParty\nTotal Votes\nVotes Perc...,Letitia A. James,DEM,579298,38.66,5621811,All
2,Attorney General DEM Primary,Vote for 1 Active Registered V...,Candidate Name\nParty\nTotal Votes\nVotes Perc...,Leecia R. Eve,DEM,48738,3.25,5621811,All
3,Attorney General DEM Primary,Vote for 1 Active Registered V...,Candidate Name\nParty\nTotal Votes\nVotes Perc...,Zephyr Teachout,DEM,442114,29.5,5621811,All
4,Attorney General DEM Primary,Vote for 1 Active Registered V...,Candidate Name\nParty\nTotal Votes\nVotes Perc...,Blank,,58979,3.94,5621811,All
5,Attorney General DEM Primary,Vote for 1 Active Registered V...,Candidate Name\nParty\nTotal Votes\nVotes Perc...,Void,,9003,0.6,5621811,All
6,Attorney General DEM Primary,Vote for 1 Active Registered V...,Candidate Name\nParty\nTotal Votes\nVotes Perc...,Write-in,,2968,0.2,5621811,All


Check: for each candidate, do the per-county vote totals (`ttl`) add up to the total reported in the "All" counties rows?

In [67]:
# Sum each candidate's votes over the rows whose county is not "All", for
# comparison against the totals listed in the "All" rows.
county_rows = ag_df[ag_df.county != "All"]
county_rows.groupby("candidate", as_index=False)["ttl"].sum()

Unnamed: 0,candidate,ttl
0,Blank,58979
1,Leecia R. Eve,48738
2,Letitia A. James,579298
3,Sean Patrick Maloney,357462
4,Void,9003
5,Write-in,2968
6,Zephyr Teachout,442114


In [68]:
# Write the frame, including the derived ttl_reg_voters and county columns, to a
# CSV file in the current working directory; the row index is not written.
ag_df.to_csv("./election_results.csv", index = False)