In [1]:
import os
import sys
import time
from datetime import datetime

import numpy as np
import yaml
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Load machine-specific settings from the local YAML config (path is relative
# to the working directory). The encoding is pinned to UTF-8 so non-ASCII
# characters in the configured paths decode the same way on every platform
# instead of depending on the locale's default encoding.
with open('../../config.local.yaml', 'r', encoding='utf-8') as f:
    local_config = yaml.safe_load(f)

# Both keys are required; a missing one raises KeyError right here.
LOCAL_PATH = local_config['LOCAL_PATH']      # base directory joined with "src/python" and "raw_data"
CHROMEDRIVER = local_config['CHROMEDRIVER']  # value handed to selenium's Service(...)

# Make the project's python sources importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
src_dir = os.path.join(LOCAL_PATH, "src/python")
if src_dir not in sys.path:
    sys.path.append(src_dir)
from scrapers import save_url_file

# Commission label as shown in the page's 'apc' dropdown -> directory name
# used for that commission under raw_data/.
APCS = {
    'City Planning Commission': 'cpc',
}

# Index of a cell within a hearings-table row -> filename the document linked
# from that cell is saved under.
# NOTE(review): the audio link is stored with a .pdf name; confirm the linked
# file type before relying on the extension.
FILE_LABELS = {
    3: 'agenda.pdf',
    4: 'supplemental-docs.pdf',
    5: 'audio.pdf',
    6: 'minutes.pdf',
}

# Years to request, newest first: 2025 down to 2003 inclusive.
YEARS = range(2025, 2003 - 1, -1)

# Seeded generator; the scrape loop draws the per-download wait from it.
rng = np.random.default_rng(seed=20250611)

In [2]:
#--- Configure the selenium chrome driver
# The chromedriver location comes from the local config; Chrome itself is
# started with the '--headless=new' flag.
service = Service(CHROMEDRIVER)

options = Options()
options.add_argument('--headless=new')

driver = webdriver.Chrome(options=options, service=service)

In [None]:
#--- Begin scrape
# For every (year, commission) pair: choose the matching options in the page's
# 'apc' and 'date' dropdowns, then walk the table rows and save each linked
# document to raw_data/<commission dir>/<year>/<YYYY-MM-DD>/<label>.
# Rows or dropdown options that cannot be processed are reported and skipped
# so that one bad entry does not abort the whole multi-year run.

URL = 'https://planning.lacity.gov/about/commissions-boards-hearings#commissions'
driver.get(URL)
time.sleep(5) # wait for driver to be ready

# A row must be wide enough to contain the right-most document column.
min_cells = max(FILE_LABELS) + 1

for year in YEARS:
    for apc_label, apc_dir in APCS.items():
        print(f"Downloading data for {apc_label} {year}... ")

        # A missing dropdown element still raises (the page itself is
        # broken); only a missing *option* is treated as "nothing to scrape".
        apc_select = Select(driver.find_element(By.NAME, 'apc'))
        try:
            apc_select.select_by_visible_text(apc_label)
        except NoSuchElementException:
            print(f"  Skipping: no '{apc_label}' option in the 'apc' dropdown")
            continue

        year_select = Select(driver.find_element(By.NAME, 'date'))
        try:
            year_select.select_by_visible_text(str(year))
        except NoSuchElementException:
            print(f"  Skipping: no '{year}' option in the 'date' dropdown")
            continue
        # NOTE(review): fixed delay; if the table has not refreshed by now,
        # rows from the previous selection would be filed under this year's
        # folder — confirm 3 seconds is sufficient.
        time.sleep(3) # wait for dropdown data to load

        rows = driver.find_elements(By.XPATH, "//table//tr")

        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            if len(cells) < min_cells:
                continue

            raw_date = cells[0].text.strip()
            if not raw_date:
                continue

            # The first cell is expected to hold MM/DD/YYYY; anything else
            # is reported and skipped rather than raising ValueError.
            try:
                hearing_date = datetime.strptime(raw_date, "%m/%d/%Y").strftime("%Y-%m-%d")
            except ValueError:
                print(f"  Skipping row with unparseable date: {raw_date!r}")
                continue

            folder_path = os.path.join(LOCAL_PATH, "raw_data", apc_dir, str(year), hearing_date)
            os.makedirs(folder_path, exist_ok=True)

            for index, filename in FILE_LABELS.items():
                links = cells[index].find_elements(By.TAG_NAME, "a")
                if not links:
                    continue

                # Only the first link in the cell is downloaded.
                link = links[0].get_attribute("href")
                if not link:
                    continue

                filepath = os.path.join(folder_path, filename)
                # The wait value is drawn uniformly from [0.5, 1.5) for each file.
                save_url_file(link, filepath, overwrite=False, verbose=True, wait=rng.uniform(0.5,1.5))
