In [86]:
import os
import re
from datetime import date, timedelta, datetime

import numpy as np
import pandas as pd
import wget
from bs4 import BeautifulSoup
from requests import get,Session

In [160]:
# Directory (relative to the notebook's working directory) that downloads are saved to.
# Keep the trailing slash: the download cells build paths as `local_path + filename`.
local_path = 'data/SOS/raw/'
# The download cells write straight into this directory, so create it up front
# (no-op when it already exists) instead of failing on the first download.
os.makedirs(local_path, exist_ok=True)

In [87]:
# Download the county-level ballot-measure results spreadsheet (.xls/.xlsx) for
# each past statewide election listed on the California Secretary of State site.
# Files are saved as "<local_path>%Y%m%d_measures_<election-type>.<ext>".
#
# Note: there's an easier way to do this starting here:
# http://www.sos.ca.gov/elections/prior-elections/statewide-election-results/

# First, connect to the page listing all the elections
sess = Session()
url = 'http://www.sos.ca.gov/elections/prior-elections/prior-statewide-elections/'
response = sess.get(url=url)
# grab the html data
html_soup_index = BeautifulSoup(response.text, 'html.parser')

# tags containing links to elections in a given year
election_years = html_soup_index.find_all('ul', class_='dlbSpc')

# substrings for building specific election urls
url_base = 'http://www.sos.ca.gov'
url_suffix = 'statement-vote/'

# Patterns are compiled once, outside the loops. The raw string keeps `\.` a
# regex escape instead of an invalid Python string escape.
measure_pattern = re.compile('[Mm]easure')
xls_pattern = re.compile(r'\.xls')

# loop through election years
# (note that xls files only go back to 2002)
should_break = False
for year in election_years:
    # loop through elections within a year
    for election in year.find_all('li'):
        # skip list items that do not carry a usable link
        if election.a is None or not election.a.get('href'):
            continue

        # extract link to election results (watching out for edge cases)
        url_path = election.a['href']
        url_path = url_path.replace('/special-elections/', '/statewide-election-results/')
        if url_suffix not in url_path:
            url_path += url_suffix

        # navigate to election results
        print('\n' + url_base + url_path)
        response = sess.get(url=url_base + url_path)
        html_soup = BeautifulSoup(response.text, 'html.parser')

        # find the header that precedes links to ballot measure results:
        # look for an <h2> first and fall back to <h3>
        measures_header = html_soup.find_all('h2', string=measure_pattern)
        if not measures_header:
            measures_header = html_soup.find_all('h3', string=measure_pattern)
        if not measures_header:  # continue to next election if no match
            print("Couldn't find Ballot Measures section")
            continue

        # first <li> after the header should have links to county-level results;
        # try to grab the xls file link (either lookup may come back empty)
        first_item = measures_header[0].find_next('li')
        xls_link = None if first_item is None else first_item.find('a', href=xls_pattern)
        if xls_link is None:  # stop everything if no xls file (won't be any more)
            print("Couldn't find Ballot Measures xls file; exiting")
            should_break = True
            break
        remote_filename = xls_link['href']
        print(remote_filename)

        # reformat election info into a local filename like "%Y%m%d_measures_[election-type].xls[x]",
        # where "[election-type]" comes from the remote path (e.g. "presidential-primary-election"),
        # though these are unfortunately not standardized.
        # url_path ends in ".../<election-slug>/statement-vote/", so the slug is the
        # third element from the end once split on "/".
        election_info = url_path.split('/')[-3].split('-')
        date_str = datetime.strptime('-'.join(election_info[-3:]), '%B-%d-%Y').strftime('%Y%m%d')
        file_suffix = remote_filename.split('.')[-1]
        filename = date_str + '_measures_' + '-'.join(election_info[:-3]) + '.' + file_suffix
        print(filename)
        wget.download(remote_filename, local_path + filename)

    # propagate the inner break once an election without an xls file was hit
    if should_break:
        break


http://www.sos.ca.gov/elections/prior-elections/statewide-election-results/general-election-november-8-2016/statement-vote/
http://elections.cdn.sos.ca.gov/sov/2016-general/ssov/ballot-measures-summary-by-county.xls
20161108_measures_general-election.xls

http://www.sos.ca.gov/elections/prior-elections/statewide-election-results/presidential-primary-election-june-7-2016/statement-vote/
http://elections.cdn.sos.ca.gov/sov/2016-primary/ssov/09-ballot-measures-summary.xls
20160607_measures_presidential-primary-election.xls

http://www.sos.ca.gov/elections/prior-elections/statewide-election-results/general-election-november-4-2014/statement-vote/
http://elections.cdn.sos.ca.gov/sov/2014-general/ssov/ballot-measures-summary.xls
20141104_measures_general-election.xls

http://www.sos.ca.gov/elections/prior-elections/statewide-election-results/statewide-direct-primary-election-june-3-2014/statement-vote/
http://elections.cdn.sos.ca.gov/sov/2014-primary/ssov/ballot-measures-summary.xls
2014060

In [161]:
# Download the "Registration by County" spreadsheet from every Report of
# Registration linked on the voter-registration statistics page.
# Files are saved as "<local_path>%Y%m%d_voter-registration.<ext>".

# First, connect to the page listing all the voter registration records
sess = Session()
url = 'http://www.sos.ca.gov/elections/voter-registration/voter-registration-statistics/'
response = sess.get(url=url)

# grab the html data
html_soup_index = BeautifulSoup(response.text, 'html.parser')
url_base = 'http://www.sos.ca.gov'

# Patterns are compiled once, outside the loops. The raw string keeps `\.` a
# regex escape instead of an invalid Python string escape.
county_pattern = re.compile('[Rr]egistration by [Cc]ounty')
xls_pattern = re.compile(r'\.xls')

# loop over tags containing links to voter registration records
registration_records = html_soup_index.find_all('a', title=re.compile('Report of Registration'))
for tag in registration_records:
    print('\n' + tag['title'])
    # the report date is whatever follows the last "-" in the link title,
    # e.g. "Report of Registration - January 2, 2018"
    date_text = tag['title'].split('-')[-1].strip()
    date_str = datetime.strptime(date_text, '%B %d, %Y').strftime('%Y%m%d')

    url = tag['href']
    if not url.startswith('http'):  # relative link: prefix the site root
        url = url_base + url
    print(url)

    # navigate to page, grab html
    response = sess.get(url=url)
    html_soup = BeautifulSoup(response.text, 'html.parser')

    # take the first <li> that both has a <strong> label mentioning
    # "registration by county" and contains a link to an xls/xlsx file
    county_link = None
    for li in html_soup.find_all('li'):
        if not any(county_pattern.search(strong.text) for strong in li.find_all('strong')):
            continue
        county_link = li.find('a', href=xls_pattern)
        if county_link is not None:
            break
    if county_link is None:
        # report the miss instead of skipping the record silently
        print("Couldn't find Registration by County xls file")
        continue

    remote_filename = county_link['href']
    filename = date_str + '_voter-registration.' + remote_filename.split('.')[-1]
    print(remote_filename)
    print(filename)
    wget.download(remote_filename, local_path + filename)
    



Report of Registration - January 2, 2018
http://www.sos.ca.gov/elections/report-registration/report-registration-january-2-2018/
http://elections.cdn.sos.ca.gov/ror/154day-stwddirprim-2018/county.xls
20180102_voter-registration.xls

Report of Registration - February 10, 2017
http://www.sos.ca.gov/elections/report-registration/ror-odd-year-2017/
http://elections.cdn.sos.ca.gov/ror/ror-pages/ror-odd-year-2017/county.xlsx
20170210_voter-registration.xlsx

Report of Registration - October 24, 2016
http://www.sos.ca.gov/elections/report-registration/15day-general-2016/
http://elections.cdn.sos.ca.gov/ror/ror-pages/15day-gen-16/county.xls
20161024_voter-registration.xls

Report of Registration - September 9, 2016
http://www.sos.ca.gov/elections/report-registration/60day-general-2016/
http://elections.cdn.sos.ca.gov/ror/ror-pages/60day-gen-16/county.xls
20160909_voter-registration.xls

Report of Registration - July 7, 2016
http://www.sos.ca.gov/elections/report-registration/124day-gen-16/
ht

http://elections.cdn.sos.ca.gov/ror/ror-pages/ror-odd-year-07/county.xls
20070210_voter-registration.xls

Report of Registration - October 23, 2006
http://www.sos.ca.gov/elections/report-registration/ror-102306/
http://elections.cdn.sos.ca.gov/ror/ror-pages/154day-prim-06/county.xls
20061023_voter-registration.xls

Report of Registration - September 8, 2006
http://www.sos.ca.gov/elections/report-registration/ror-090806/
http://elections.cdn.sos.ca.gov/ror/ror-pages/60day-gen-06/county.xls
20060908_voter-registration.xls

Report of Registration - May 22, 2006
http://www.sos.ca.gov/elections/report-registration/ror-05222006/
http://elections.cdn.sos.ca.gov/ror/ror-pages/15day-prim-06/county.xls
20060522_voter-registration.xls

Report of Registration - April 7, 2006
http://www.sos.ca.gov/elections/report-registration/ror-040706/
http://elections.cdn.sos.ca.gov/ror/ror-pages/60day-prim-06/county.xls
20060407_voter-registration.xls

Report of Registration - January 3, 2006
http://www.sos.ca

In [155]:
# Scratch check: print the first "Registration by County" spreadsheet link found on
# the page currently held in `html_soup` (set by whichever cell last assigned it).
# The link is looked up in the enclosing <li>, not inside the <strong> label, and
# items without an xls link are skipped rather than raising on a missing tag.
county_link = None
for li in html_soup.find_all('li'):
    has_county_label = any(
        re.search('[Rr]egistration by [Cc]ounty', strong.text)
        for strong in li.find_all('strong')
    )
    if not has_county_label:
        continue
    county_link = li.find('a', href=re.compile(r'\.xls'))
    if county_link is not None:
        break

if county_link is not None:
    print(county_link['href'])

http://elections.cdn.sos.ca.gov/ror/ror-pages/154day-presprim-04/county.xls
