In [1]:
import os
import sys
import re
from pathlib import Path
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import warnings
import logging

# Add ../python to the module search path; this must run before `import api`
# below (presumably `api` is a project-local module living there — confirm).
sys.path.append('../python')
# Apply the 'default' action to all warnings.
warnings.filterwarnings('default')
# Only let ERROR-and-above records through from the "pdfminer" logger.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# Provides get_gpt_completion(), used by extract_data_from_pdf in a later cell.
import api

# Date string interpolated into the raw_data/ and intermediate_data/ paths in
# later cells (presumably the commission meeting date — confirm).
DATE = "2025-02-13"


  from tqdm.autonotebook import tqdm


In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, falling back to OCR for pages without usable text.

    Each page is first read with pdfplumber. When that yields nothing but
    whitespace (or nothing at all), the page is rasterised with pdf2image and
    passed through Tesseract instead. Pages that produce no text by either
    route are omitted from the result.

    Args:
        pdf_path: Path of the PDF file; passed to both ``pdfplumber.open`` and
            ``convert_from_path``.

    Returns:
        The stripped per-page texts joined by ``"\\n--PAGEBREAK--\\n"``. The
        result is an empty string when no page yields any text.
    """
    text_pieces = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_index, page in enumerate(pdf.pages):
            # Strip BEFORE testing: whitespace-only text is truthy and would
            # otherwise skip the OCR fallback and add an empty segment.
            page_text = (page.extract_text(x_tolerance=1) or "").strip()
            if page_text:
                text_pieces.append(page_text)
                continue

            # pdf2image page numbers are 1-based, enumerate() is 0-based.
            images = convert_from_path(
                pdf_path, first_page=page_index + 1, last_page=page_index + 1
            )
            if not images:
                # Nothing was rendered for this page, so there is nothing to OCR.
                continue

            raw_ocr = pytesseract.image_to_string(images[0]) or ""
            # Presumably corrects Tesseract misreading a capital "I" as "|".
            # Strip before testing so whitespace-only OCR output (e.g. a lone
            # form feed) does not become an empty segment.
            ocr_text = raw_ocr.replace('|', 'I').strip()
            if ocr_text:
                text_pieces.append(ocr_text)
    return "\n--PAGEBREAK--\n".join(text_pieces)


In [3]:
# Prompt template for the document-splitting request. It is a plain string
# (not an f-string): the single "{}" placeholder is filled later through
# PROMPT.format(<extracted text>).
PROMPT = """
The text below contains multiple pages of documents submitted to the LA City Planning Commission. 
Each page is delimited by --PAGEBREAK--.
For each page break, identify whether it delimits the beginning of a new document submission.
If the page break identifies the beginning of a new document, insert --NEW DOCUMENT-- after --PAGEBREAK--.
If the page break does not identify a new document, but rather is a continuation of the current document, do not make any modifications.
Do not change the text other than adding --NEW DOCUMENT-- tags.

Begin text:

{}
"""

In [4]:
def extract_data_from_pdf(pdf_path):
    """Extract a PDF's text and send it through the document-splitting prompt.

    The text from ``extract_text_from_pdf`` is substituted into ``PROMPT`` and
    the filled-in prompt is passed to ``api.get_gpt_completion``.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        Whatever ``api.get_gpt_completion`` returns for the prompt (presumably
        the input text with --NEW DOCUMENT-- tags inserted — verify against
        the ``api`` module).
    """
    document_text = extract_text_from_pdf(pdf_path)
    return api.get_gpt_completion(PROMPT.format(document_text))

In [5]:
# Run extraction and the model-based splitting pass on this date's
# supplemental-documents PDF.
data = extract_data_from_pdf(
    pdf_path=f"../../raw_data/cpc/{DATE}/supplemental-docs.pdf"
)

In [6]:
# Split the tagged response on --NEW DOCUMENT-- and write each piece to its own
# numbered text file, echoing it to the cell output for review.
output_dir = Path(f"../../intermediate_data/cpc/{DATE}")
# Loop-invariant: create the output directory once, not once per document.
output_dir.mkdir(parents=True, exist_ok=True)

for doc_index, content in enumerate(data.split("--NEW DOCUMENT--")):
    output_path = output_dir / f"supplemental-docs-{doc_index}.txt"
    # Explicit UTF-8: the text contains non-ASCII characters (curly quotes,
    # bullets) that the platform default encoding may be unable to write.
    output_path.write_text(content, encoding="utf-8")
    print('-------------------------')
    print(content)

-------------------------
GENERAL INFORMATION ABOUT THE CONTENTS OF THIS FILE
Submissions by the public in compliance with the Commission Rules and Operating
Procedures (ROPs) , Rule 4.3, are distributed to the Commission and uploaded online.
Please note that “compliance” means that the submission complies with deadline, delivery
method (hard copy and/or electronic) AND the number of copies. Please review the
Commission ROPs to ensure that you meet the submission requirements. The ROPs can be
accessed at http://planning.lacity.org, by selecting “Commissions & Hearings” and
selecting the specific Commission.
All compliant submissions may be accessed as follows:
• “Initial Submissions”: Compliant submissions received no later than by end of
day Monday of the week prior to the meeting, which are not integrated by reference
or exhibit in the Staff Report, will be appended at the end of the Staff Report. The
Staff Report is linked to the case number on the specific meeting agenda.
• “Second