In [None]:
import json
import requests as req
from bs4 import BeautifulSoup

In [None]:
def get_attachment_list(records_url: str, timeout: float = 30.0) -> str:
    """Fetch the raw HTML of the attachments list for a single record.

    Two requests are made on the same session. The first, to ``records_url``,
    sets up the session cookies and points the server-side state at that
    record. The second, to the attachments-list endpoint, relies on that state
    to return the attachments belonging to the record.

    Args:
        records_url: URL of the record detail page.
        timeout: Seconds to wait on each request before giving up. Without a
            timeout a stalled server would block the call indefinitely.

    Returns:
        The response body of the attachments-list page as text.

    Raises:
        requests.HTTPError: If either request returns an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    attachments_list_url = 'https://permits.cityofboise.org/citizenaccess/FileUpload/AttachmentsList.aspx'
    params = {
        'iframeid': 'ctl00_PlaceHolderMain_attachmentEdit',
        'module': 'Planning',
        'isInConfirm': 'False',
        'isdetail': 'True',
        'isaccountmanager': 'False',
        'isAdmin': 'False',
        'isPeopleDocument': '',
        'agencyCode': 'BOISE',
        'isForConditionDocument': 'N'
    }

    # The context manager closes the session (and its pooled connections) even
    # when a request raises; this function is called once per record in a loop.
    with req.Session() as requests_session:
        # Making this initial request sets up the session cookies and sets
        # server state to the provided record.
        response = requests_session.get(records_url, timeout=timeout)
        response.raise_for_status()

        # This request uses the server state from the session to retrieve the
        # attachments list.
        response = requests_session.get(attachments_list_url, params=params, timeout=timeout)
        response.raise_for_status()

        return response.text

In [47]:
def extract_attachments(attachment_html: str) -> list:
    """Parse the attachments-list HTML into a list of attachment dicts.

    Looks for the table with id ``attachmentList_gdvAttachmentList`` and emits
    one dict per row that contains at least one ``<td>``. Each dict has the
    keys ``filename``, ``postBackId``, ``type`` and ``date``. Values that
    cannot be found in the row fall back to ``'Unknown ...'`` placeholders
    (or an empty string for ``postBackId``).

    Args:
        attachment_html: HTML text of the attachments-list page.

    Returns:
        A list of dicts, empty when the table is absent or has no data rows.
    """
    def _span_text(node, fallback):
        # Stripped text of the first <span> under `node`, or `fallback` when
        # either the node or the span is missing.
        span = node.find('span') if node else None
        return span.get_text(strip=True) if span else fallback

    document = BeautifulSoup(attachment_html, 'html.parser')
    grid = document.find('table', id='attachmentList_gdvAttachmentList')
    if not grid:
        return []

    results = []
    for tr in grid.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Rows without <td> cells are header rows.
            continue

        # Column 0 holds an anchor whose id is the postBack id and whose
        # inner <span> carries the filename.
        link = tds[0].find('a')
        post_back_id = link.get('id', '') if link else ''

        # Column 1 is the attachment type, column 3 the attachment date;
        # short rows simply fall back to the placeholders.
        type_cell = tds[1] if len(tds) > 1 else None
        date_cell = tds[3] if len(tds) > 3 else None

        results.append({
            'filename': _span_text(link, 'Unknown Filename'),
            'postBackId': post_back_id,
            'type': _span_text(type_cell, 'Unknown Type'),
            'date': _span_text(date_cell, 'Unknown Date')
        })

    return results

In [50]:
# Load the records to enrich. JSON is UTF-8 by specification, so decode it
# explicitly rather than relying on the platform's default locale encoding.
with open('../data/01/cup_records.json', 'r', encoding='utf-8') as f:
    records = json.load(f)

In [None]:
# Fetch and attach the attachment list for every record, in place.
# Failures are logged and skipped (best effort): a failed record is left
# without an 'attachments' key so it can be told apart from a record that
# genuinely has zero attachments.
num_records = len(records)
for idx, record in enumerate(records, start=1):
    try:
        attachment_list = get_attachment_list(record['link'])
        attachments = extract_attachments(attachment_list)
    except Exception as e:
        # .get() so a record missing 'link' is reported instead of raising
        # a second KeyError from inside the handler.
        print(f"Error extracting attachments for record {record.get('link')}: {e}")
        # Skip the progress line: `attachments` would be undefined on the
        # first record or stale from the previous one.
        continue

    record['attachments'] = attachments
    # Print the record ID and number of attachments
    print(f"Record ({idx} / {num_records}) ID: {record['record']}  Num Attachments: {len(attachments)}")

In [53]:
# Persist the records (now carrying their 'attachments' lists where the
# scrape succeeded) as indented JSON next to the input file.
with open('../data/01/cup_records_with_attachments.json', mode='w') as out_file:
    json.dump(records, out_file, indent=4)