# Parse PDFs

This notebook has code to parse a PDF file in the `pdfs` folder into a CSV file in the `data` folder. (I do these one month at a time, spot-checking the data as I go.)

In [3]:
import glob
import os
import csv

import pdfplumber
import pandas as pd

from const import csv_headers, address_fixes

In [None]:
# run the companion `download` script/notebook inside this kernel
# NOTE(review): presumably fetches the report PDFs into `pdfs/` -- confirm in the download source
%run download

In [4]:
# report month to parse, written as 'YYYY-MM'; the next cell splits it on '-'
# and builds the input PDF path and output CSV path from it
date = '2025-04'

In [5]:
# input PDF and output CSV paths are both derived from the report month
filepath_in = f'pdfs/{date}.pdf'
filepath_out = f'data/{date}.csv'

# split 'YYYY-MM' into its two parts
year, month = date.split('-')

# refuse to overwrite a CSV that already exists for this month;
# FileExistsError is still an Exception, so existing handlers keep working
if os.path.exists(filepath_out):
    raise FileExistsError(f'CSV file exists! {filepath_out}')

# empty list to hold the rows extracted from every page
table_data = []

# open the PDF with pdfplumber
with pdfplumber.open(filepath_in) as pdf:

    # loop over the PDF pages
    for page in pdf.pages:

        tables = page.extract_tables()

        # a page with no detectable table yields an empty list, so
        # indexing [0] unconditionally would raise IndexError
        if not tables:
            continue

        # some reports have 2 tables on the last page,
        # so you need to make sure you end up with the first one
        table = tables[0]

        # add the data to the tracking list
        if table:
            table_data.extend(table)

def _parse_money(raw):
    """Convert a currency cell such as '$1,164.73' into a float.

    Dollar signs, thousands separators and all whitespace are removed
    first. A cell that is blank after cleaning is returned as '' and a
    lone dash is returned as '-'; neither is converted to a number.

    Raises:
        ValueError: if the cleaned text is anything else that float()
            cannot parse.
    """
    cleaned = ''.join(raw.replace('$', '').replace(',', '').split())
    if cleaned and cleaned != '-':
        return float(cleaned)
    return cleaned


# an empty list to hold the parsed data
data_parsed = []

# new style of reports starting january 2025
# loop over the rows of raw data, skipping the first extracted row
# (presumably the column-header row -- confirm against the PDF)
for line in table_data[1:]:

    # skip empty rows
    if not any(line):
        continue

    # pdfplumber reports an empty cell as None, so fall back to '' before
    # calling string methods; stop at the row whose first cell mentions
    # 'permits issued'
    if 'permits issued' in (line[0] or '').lower():
        break

    # skip rows too short to unpack into the 8 expected columns; a row
    # with more than 8 cells still fails loudly at the unpacking below
    if len(line) < 8:
        continue

    # normalise None cells to '' so a partly blank row does not raise
    # AttributeError in the string handling that follows
    (permit_id, applicant, address, const_type,
     valuation, permit_fee, contractor, jurisdiction) = [
        cell or '' for cell in line
    ]

    # anything other than 'city' (including a blank cell) counts as
    # outside the city limits
    outside = jurisdiction.lower().strip() != 'city'

    cost_approx = _parse_money(valuation)
    permit_fee = _parse_money(permit_fee)

    # ' '.join(x.split()) collapses line breaks and repeated spaces that
    # come from text wrapping inside a PDF cell
    data = {
        'site_address': ' '.join(address.split()),
        'contractor': ' '.join(contractor.split()),
        'year': year,
        'month': month,
        'permit_number': permit_id,
        'applicant_name': ' '.join(applicant.split()),
        'construction_type': ' '.join(const_type.split()),
        'cost_approximate': cost_approx,
        'permit_fee': permit_fee,
        'hookup_fee': '',
        'outside_city_limits': outside
    }

    # echo each record so it can be spot-checked against the PDF
    print(data)

    data_parsed.append(data)

{'site_address': '544 MAIN ST, SPEARFISH, SD 57783', 'contractor': 'Rapid Fire Protection', 'year': '2025', 'month': '04', 'permit_number': 'CAA-24-13', 'applicant_name': 'Casey Baldwin', 'construction_type': 'Commercial Alterations, Additions, or Tenant Improvements', 'cost_approximate': 23260.0, 'permit_fee': 754.13, 'hookup_fee': '', 'outside_city_limits': False}
{'site_address': '3025 1st Avenue, Unit Suite 2&3, Spearfish, SD 57783', 'contractor': 'Robinson Builders Inc', 'year': '2025', 'month': '04', 'permit_number': 'CAA-25-11', 'applicant_name': 'Jerry Robinson', 'construction_type': 'Commercial Alterations, Additions, or Tenant Improvements', 'cost_approximate': 123687.0, 'permit_fee': 1164.73, 'hookup_fee': '', 'outside_city_limits': False}
{'site_address': '1400 NORTH AVE, SPEARFISH, SD 57783', 'contractor': 'Black Hills Exteriors', 'year': '2025', 'month': '04', 'permit_number': 'CAA-25-20', 'applicant_name': 'Janet Zetah', 'construction_type': 'Commercial Alterations, Addi

In [7]:
# write the records to file -- be sure to check the
# record count against the live PDF and check for accuracy
#
# newline='' is required by the csv module: the writer emits its own
# '\r\n' line endings, and without it Windows translates them again,
# leaving a blank row between every record
with open(filepath_out, 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=csv_headers)
    writer.writeheader()
    writer.writerows(data_parsed)

print(f'Wrote {len(data_parsed)} records to file: {filepath_out}')

Wrote 30 records to file: data/2025-04.csv


In [2]:
# run the companion `combine` script/notebook inside this kernel; its output
# below reports writing the permit records to spearfish-building-permits.csv
%run combine

Wrote 5,229 permit records to file: spearfish-building-permits.csv
