In [1]:
import os
import sys
import time
import yaml
import pandas as pd
import numpy as np
import json
import pickle

# Load the local configuration from config.local.yaml. The encoding is given
# explicitly so the file is read the same way regardless of platform defaults.
with open('../../config.local.yaml', 'r', encoding='utf-8') as f:
    local_config = yaml.safe_load(f)

# LOCAL_PATH is the root under which "src/python" is looked up below.
LOCAL_PATH = local_config['LOCAL_PATH']

# Make the project's Python modules importable. Guarded so that re-running
# this cell does not keep appending the same entry to sys.path.
_src_path = os.path.join(LOCAL_PATH, "src/python")
if _src_path not in sys.path:
    sys.path.append(_src_path)

from scrapers import request_url_text
from utils import canonicalize_casenum, is_casenum
import data_tools as dt

# Seeded generator; the scraping functions draw each request's wait from it.
rng = np.random.default_rng(seed=20250611)

# Switches handed to request_url_text on every call.
OVERWRITE, VERBOSE = False, True

# Lower / upper bound of the per-request wait drawn via rng.uniform.
WAIT_MIN, WAIT_MAX = 1, 2

# PDIS endpoints: case-number search, and case info by encoded case id.
SEARCH_URL = "https://planning.lacity.org/pdiscaseinfo/api/Service/SearchCaseNumber"
INFO_URL = "https://planning.lacity.org/pdiscaseinfo/api/Service/GetCaseInfoDataEncoded"

# File in which the progress counter is pickled between runs.
TEMPFILE = "temp.pkl"

In [2]:
def get_id_from_caseno(caseno):
    """Get the PDIS internal (encoded) case id for a planning dept case number.

    The search endpoint is queried with up to three spellings of the case
    number, in order: the number as given, its first three dash-separated
    parts, and its canonical form. Identical spellings are queried only once.

    Args:
        caseno: Planning department case number string.

    Returns:
        The ``encodedCaseId`` of the first search result whose ``caseNbr``
        canonicalizes to the same value as ``caseno``, or None (after
        printing a warning) when no query yields a match.
    """
    target = canonicalize_casenum(caseno)
    spellings = [
        caseno,
        '-'.join(caseno.split('-')[0:3]),
        target,
    ]
    # dict.fromkeys drops repeated spellings while keeping their order, so the
    # same URL is never requested twice for one case number.
    for spelling in dict.fromkeys(spellings):
        url = f"{SEARCH_URL}?caseNo={spelling}"
        response = request_url_text(
            url, overwrite=OVERWRITE, verbose=VERBOSE, wait=rng.uniform(WAIT_MIN, WAIT_MAX)
        )
        if response is None:
            # No response text for this spelling; fall through to the next one
            # instead of letting json.loads fail on None.
            continue
        results = json.loads(response)
        # A JSON null payload decodes to None; treat it as "no results".
        for item in results or []:
            if target == canonicalize_casenum(item['caseNbr']):
                return item['encodedCaseId']
    print(f"Warning: failed to retrieve data for {caseno}")
    return None

In [3]:
def get_case_info(caseno):
    """Fetch the PDIS case-info record for a planning dept case number.

    Args:
        caseno: Planning department case number string.

    Returns:
        The decoded JSON payload from the case-info endpoint, or None when the
        case id lookup fails, the request yields no response text, or the
        payload is empty or has a null ``caseNbr``.
    """
    caseid = get_id_from_caseno(caseno)
    if caseid is None:
        # The lookup found no matching case; skip the request rather than
        # querying the endpoint with the literal text "None" as the id.
        return None
    url = f"{INFO_URL}?encodedCaseId={caseid}"
    response = request_url_text(
        url, overwrite=OVERWRITE, verbose=VERBOSE, wait=rng.uniform(WAIT_MIN, WAIT_MAX)
    )
    if response is None:
        return None
    j = json.loads(response)
    # Covers a null payload, an empty container, and a record without a number.
    if not j or j['caseNbr'] is None:
        return None
    return j

In [4]:
# Load the minutes data through the project's data_tools module. Later cells
# read its 'title', 'related_cases', 'date' and 'item_no' columns.
# NOTE(review): presumably one row per agenda item — confirm against data_tools.
df = dt.get_minutes(verbose=False)

In [5]:
# Collect every distinct case number that appears either as an item title or
# in the comma-separated 'related_cases' field.
case_set = set()
for title, related in zip(df['title'], df['related_cases']):
    candidates = [title]
    # A missing value is not a str (pandas stores it as NaN); skip it instead
    # of failing on .split.
    if isinstance(related, str):
        candidates.extend(related.split(','))
    for candidate in candidates:
        if not isinstance(candidate, str):
            continue
        candidate = candidate.strip()
        if is_casenum(candidate):
            case_set.add(candidate)

# Sorted so that the scraping order is the same on every run.
case_list = sorted(case_set)
len(case_list)

1124

In [6]:
# Restore the progress counter written by a previous run, defaulting to 0 when
# the checkpoint is absent or unreadable. Only the expected failure modes are
# caught, so KeyboardInterrupt and genuine bugs are no longer swallowed.
# NOTE: pickle.load can execute arbitrary code; TEMPFILE must only ever be a
# file written by this notebook.
try:
    with open(TEMPFILE, 'rb') as f:
        curr_count = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    curr_count = 0
print(f"curr_count = {curr_count}")

curr_count = 0


In [7]:
# Walk the case list from the start, keeping every record that was found.
# The loop stops once the number of processed cases exceeds the previously
# saved count by more than 2000.
out_df = []
count = 0
stop_after = curr_count + 2000
for count, case in enumerate(case_list, start=1):
    j = get_case_info(case)
    if j is not None:
        # Remember which requested case number produced this record.
        j['casenum'] = case
        out_df.append(j)
    if count > stop_after:
        break

# Persist how far this run got.
curr_count = count
with open(TEMPFILE, 'wb') as f:
    pickle.dump(curr_count, f)
print(f"curr_count = {curr_count}")

curr_count = 1124


In [8]:
# Turn the collected records into a table and pickle it under LOCAL_PATH.
case_data_path = os.path.join(LOCAL_PATH, "intermediate_data/cpc/case-data.pkl")
out_df = pd.DataFrame(out_df)
out_df.to_pickle(case_data_path)
len(out_df)

1113

In [9]:
# Ensure the case numbers line up
out_df['bad'] = False
for idx, row in out_df.iterrows():
    caseNbr = row['caseNbr']
    casenum = row['casenum']
    if canonicalize_casenum(caseNbr)!=canonicalize_casenum(casenum):
        out_df.loc[idx, 'bad'] = True
assert out_df['bad'].sum()==0

In [10]:
# List agenda items where the case number wasn't found during scraping
df2 = df.merge(
    out_df[['caseNbr','currentCaseStatus','casenum']],
    left_on='title',
    right_on='casenum',
    how='left'
)
idx = df2['caseNbr'].isna()
df2.loc[idx, ['date', 'item_no', 'title', 'related_cases', 'caseNbr']]

Unnamed: 0,date,item_no,title,related_cases,caseNbr
83,2019-01-10,6,ADM-2018-5752-DB-SIP,ADM-2018-5752-DB-SIP,
415,2021-12-09,10,ADM-2021-3739-DB-HCA-1A,ADM-2021-3739-DB-HCA-1A,
478,2022-07-14,8,ADM-2021-10304-DB-HCA-1A,ADM-2021-10304-DB-HCA-1A,
553,2023-04-27,6,ADM-2022-6793-DB-HCA-1A,ADM-2022-6793-DB-HCA-1A,
643,2024-02-22,8,ADM-2023-5502-DB-HCA-1A,ADM-2023-5502-DB-HCA-1A,


In [14]:
# Parse the filing date; values that cannot be parsed become NaT.
out_df['filing_date'] = pd.to_datetime(out_df['filingDt'], errors='coerce')

# Tally cases per council district, counting missing districts under 'NA'.
district_labels = out_df['primaryCnclDistNbr'].fillna('NA')
district_labels.value_counts()

primaryCnclDistNbr
NA           264
CD 13        143
CD 14        105
CD 5          89
CD 11         72
CD 10         63
CD 4          60
CD 2          49
CD 1          47
CD 3          35
CD 12         33
CD 6          31
CD 8          30
CD 9          30
CD 7          26
CD 15         20
MULTIPLE       4
CITYWIDE       3
CD 4,CD 5      3
CITYW          3
CD 8,CD 9      3
Name: count, dtype: int64