# Parse South Dakota's monthly voter registration report

Uses:
- `requests` to get the PDF from [the Secretary of State's website](https://sdsos.gov/elections-voting/upcoming-elections/voter-registration-totals/voter-registration-by-county.aspx)
- `pdfplumber` to parse and validate the data

In [1]:
from io import BytesIO
from datetime import datetime
import csv
from pathlib import Path

import requests
import pdfplumber

In [2]:
# location of the county-by-county registration report (PDF) to download
url = 'https://sdsos.gov/elections-voting/assets/StatewideVoterRegistrationTotals/StatewideVotersByCounty_11.5.2024.pdf'

# a timeout keeps a stalled connection from hanging the notebook forever, and
# raise_for_status() stops here on a 404/500 so that an error page is never
# handed to the PDF parser or written to disk as if it were the report
req = requests.get(url, timeout=60)
req.raise_for_status()

In [3]:
# wrap the downloaded bytes in a file-like object so pdfplumber can read the
# report straight from memory
pdf_buffer = BytesIO(req.content)
pdf = pdfplumber.open(pdf_buffer)

In [4]:
# the parsing below expects one table that runs across exactly two pages;
# anything else means the report layout changed and needs a human look
expected_pages = 2
if len(pdf.pages) != expected_pages:
    raise Exception(f'Report page count is {len(pdf.pages)}, not 2 -- check it out: {url}')

In [5]:
# pull the table off each of the two pages and join them into one list of rows;
# extract_table() returns None when it finds no table on a page, so check for
# that explicitly rather than failing on `None + list` with an opaque TypeError
lines = []
for page_number, page in enumerate((pdf.pages[0], pdf.pages[1]), start=1):
    page_rows = page.extract_table()
    if page_rows is None:
        raise ValueError(f'No table found on page {page_number} -- check it out: {url}')
    lines += page_rows

# guarding against some PDFs parsed with an extra column:
# drop the last cell of any row where that cell is empty (None or '')
lines = [row[:-1] if not row[-1] else row for row in lines]

In [6]:
# display the combined rows for a visual check of the extraction
lines

[['Statewide Report by County - November 5, 2024',
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 ['County',
  'Democratic',
  'Libertarian',
  'Republican',
  'Independent',
  'OTH',
  'No Party\nAffiliation',
  'No Labels',
  'Inactive',
  'Total Active'],
 ['Aurora', '518', '2', '982', '230', '2', '100', '', '107', '1834'],
 ['Beadle',
  '2,501',
  '26',
  '5,113',
  '1,383',
  '41',
  '1,123',
  '1',
  '787',
  '10188'],
 ['Bennett', '685', '6', '727', '207', '2', '197', '', '190', '1824'],
 ['Bon Homme', '888', '13', '2,369', '480', '', '230', '', '291', '3980'],
 ['Brookings',
  '5,103',
  '106',
  '9,938',
  '3,624',
  '21',
  '2,794',
  '1',
  '1594',
  '21587'],
 ['Brown',
  '7,278',
  '99',
  '12,860',
  '3,582',
  '10',
  '1,950',
  '2',
  '1132',
  '25781'],
 ['Brule', '903', '15', '1,746', '466', '4', '213', '', '173', '3347'],
 ['Buffalo', '533', '2', '151', '73', '4', '94', '', '120', '857'],
 ['Butte', '777', '56', '4,572', '877', '19', '641', '', '13

In [7]:
# the first row of the table is the report title, which carries the date;
# take it off the front so only header and data rows remain in `lines`
report_date = lines[0]
del lines[0]

In [8]:
# the title text sits in the first cell of the row; the date is whatever
# follows the last hyphen in it
title_text = report_date[0]
date_text = title_text.rpartition('-')[-1].strip()

# convert e.g. 'November 5, 2024' into an ISO 8601 date string
parsed_date = datetime.strptime(date_text, '%B %d, %Y')
report_date = parsed_date.date().isoformat()

print(report_date)

2024-11-05


In [9]:
# save the PDF, named after the report date
outpath = (Path('pdfs') / f'{report_date}.pdf').resolve()

# create the folder if needed so a fresh checkout doesn't fail with
# FileNotFoundError, then write the raw bytes exactly as downloaded
outpath.parent.mkdir(parents=True, exist_ok=True)
outpath.write_bytes(req.content)

print(f'Wrote {outpath}')

Wrote /Users/cjwinchester/sd-voter-registration-data/pdfs/2024-11-05.pdf


In [10]:
# lookup of lowercased header text from the PDF -> standardized column name;
# headers that don't appear here are used as-is (lowercased)
column_fixes = dict((
    ('oth', 'other'),
    ('no party\naffiliation', 'npa'),
    ('no labels', 'no_labels'),
    ('npa/ind', 'npa_ind'),
))

In [11]:
# with the title row gone, the column names are the first row left in the table
raw_headers = lines.pop(0)

# lowercase each name, then swap in the standardized spelling where one exists
headers = []
for header_text in raw_headers:
    lowered = header_text.lower()
    headers.append(column_fixes.get(lowered, lowered))

In [12]:
# display the standardized column names
headers

['county',
 'democratic',
 'libertarian',
 'republican',
 'independent',
 'other',
 'npa',
 'no_labels',
 'inactive',
 'total active']

In [13]:
# one dict per county, keyed by the standardized column names plus 'date'
data = []

# loop over the remaining lines
for line in lines:

    first_cell = line[0].lower()

    # skip lines if repeated headers (the report title or the column names)
    if 'statewide' in first_cell or 'county' in first_cell:
        continue

    # the 'Total' line in the table means we're done
    if 'total' in first_cell:
        break

    # a row that doesn't line up with the headers would be silently
    # truncated by zip() below, so stop with a real exception -- unlike
    # `assert`, this check still runs under `python -O`
    if len(line) != len(headers):
        raise ValueError(
            f'Expected {len(headers)} values but got {len(line)}: {line}'
        )

    # zip it up into a dict
    county_data = dict(zip(headers, line))

    # delete the aggregate value totaling everything up
    del county_data['total active']

    # cast number strings to integers; the county name stays a string and
    # empty cells are left untouched rather than coerced to 0
    for key in county_data:
        if key == 'county' or not county_data[key]:
            continue

        # counts may carry thousands separators, e.g. '2,501'
        county_data[key] = int(county_data[key].replace(',', ''))

    # add the report date
    county_data['date'] = report_date
    data.append(county_data)

In [14]:
# doublecheck: 66 counties in SD
# (raised explicitly so the check survives `python -O` and reports the count found)
if len(data) != 66:
    raise ValueError(f'Expected 66 counties but parsed {len(data)}')

In [15]:
# display the first parsed record as a spot check
data[0]

{'county': 'Aurora',
 'democratic': 518,
 'libertarian': 2,
 'republican': 982,
 'independent': 230,
 'other': 2,
 'npa': 100,
 'no_labels': '',
 'inactive': 107,
 'date': '2024-11-05'}

In [16]:
# CSV column order: take the record's keys, then move the last one ('date',
# which was added to each record last) to the front
csv_headers = list(data[0])
csv_headers.insert(0, csv_headers.pop())

In [17]:
# write to file, named after the report date
filepath = (Path('data') / f'{report_date}.csv').resolve()

# create the output folder if needed so a fresh checkout doesn't fail on open()
filepath.parent.mkdir(parents=True, exist_ok=True)

# newline='' hands line-ending control to the csv module (without it, Windows
# output gets an extra blank line after every row); the encoding is pinned so
# the file is the same no matter which machine writes it
with open(filepath, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=csv_headers)
    writer.writeheader()
    writer.writerows(data)

print(f'Wrote file: {filepath}')

Wrote file: /Users/cjwinchester/sd-voter-registration-data/data/2024-11-05.csv


In [18]:
# run the companion build script (IPython magic); judging by its printed output
# it regenerates the combined CSV files and README.md -- see build_files.py to confirm
%run build_files.py

Party list: americans_elect, constitution, democratic, inactive, independent, libertarian, no_labels, npa, npa_ind, npa_ind_oth, other, reform, republican
--------------------
Wrote sd-voter-registration-data.csv
Wrote sd-voter-registration-data-simplified.csv
Wrote README.md
