In [1]:
import os
import sys
import re
import time
from pathlib import Path
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import warnings
import logging
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt

# Extend the module search path with ../python (relative to the working
# directory) before the project-local imports below.
# NOTE(review): presumably this is where `api` and `text_tools` live — confirm.
sys.path.append('../python')
# Apply the 'default' warning action to every warning category.
warnings.filterwarnings('default')
# Only let ERROR-level (and above) records through the "pdfminer" logger.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# Raw CPC data directory.
# NOTE(review): not referenced by the other visible cells, which build
# "../../intermediate_data/cpc/..." paths by hand — confirm whether it is still needed.
DATA_DIR = Path('../../raw_data/cpc')

# Project-local modules; these imports must stay after the sys.path tweak above.
import api
import text_tools as tt

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopwords corpus (reports "already up-to-date" when it is present)
nltk.download('stopwords')

# English stopwords as a set, used for membership tests when matching agenda
# item titles against agenda lines
stop_words = set(stopwords.words('english'))


  from tqdm.autonotebook import tqdm
[nltk_data] Downloading package stopwords to /Users/ekung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Meeting-level metadata; later cells read the `year` and `date` columns of each row.
meetings_meta_path = "../../intermediate_data/cpc/meetings_metadata.csv"
meetings_meta_df = pd.read_csv(meetings_meta_path)

In [3]:
# Words that must all appear (case-insensitively) on the line announcing the
# next City Planning Commission meeting, which closes the last agenda item.
_CLOSING_KEYWORDS = ('next', 'meeting', 'city', 'planning', 'commission')


def _read_with_override(meeting_dir, stem):
    """Return the text of `<stem>-manual-override.txt`, else `<stem>.txt`.

    Only a missing override file triggers the fallback; any other error while
    reading the override (decoding, permissions, ...) is raised rather than
    silently hidden behind the non-override file.

    Raises:
        FileNotFoundError: if neither file exists in `meeting_dir`.
    """
    try:
        with open(f"{meeting_dir}/{stem}-manual-override.txt", 'r') as f:
            return f.read()
    except FileNotFoundError:
        with open(f"{meeting_dir}/{stem}.txt", 'r') as f:
            return f.read()


def _parse_field(label, line):
    """Return the text following `<label>:` on `line`.

    Raises:
        RuntimeError: if `line` does not contain `<label>:`.
    """
    match = re.search(rf'{label}:\s*(.*)', line.strip())
    if match is None:
        raise RuntimeError(f"Expected '{label}:' field but found: {line!r}")
    return match.group(1)


def _parse_item_summaries(agenda_items, year, date):
    """Turn the '------'-separated item summaries into a list of record dicts.

    Each non-empty block must have `ITEM NO:`, `TITLE:` and `SUMMARY:` on its
    first three lines. Only the first line of the summary is kept.

    Raises:
        RuntimeError: if a block has fewer than three lines or a missing field.
    """
    records = []
    for block in agenda_items.split('------'):
        block = block.strip()
        if not block:
            continue
        parts = block.split('\n')
        if len(parts) < 3:
            raise RuntimeError(f"Malformed agenda item summary for {date}: {block!r}")
        item_no = _parse_field('ITEM NO', parts[0])
        title = _parse_field('TITLE', parts[1])
        summary = _parse_field('SUMMARY', parts[2])
        records.append({
            'year': year,
            'date': date,
            'item_no': item_no,
            'title': title,
            'summary': summary,
            # Slice comparison: safe for empty or one-word titles.
            'is_cc_heading': title.lower().split()[:2] == ['consent', 'calendar'],
            # Any item number that is not purely digits is flagged as a consent calendar part.
            'is_cc_part': not item_no.isdigit(),
            # Result of the project helper tt.is_casenum on the title.
            'is_casenum': tt.is_casenum(title),
        })
    return records


def _content_tokens(text):
    """Lowercase `text`, drop punctuation, split on whitespace, remove stopwords."""
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    return [tok for tok in stripped.split() if tok not in stop_words]


def _find_start_line(item_no, title, lines_lower):
    """Return the index of the first line that starts the given agenda item, or None.

    A line matches if, in order of preference:
      1. it contains `<item_no><spaces/dots><title>` (case-insensitive),
      2. the same holds after collapsing runs of whitespace, or
      3. its first non-stopword token is the item number and it contains every
         non-stopword token of the title.

    `lines_lower` must already be lowercased.
    """
    pattern = re.compile(rf"{re.escape(item_no.lower())}[\s.]*{re.escape(title.lower())}")
    # Normalise the item number exactly like the line tokens it is compared to,
    # so that e.g. "5A" can match a line starting with "5a.".
    item_no_token = re.sub(r'[^\w\s]', '', item_no.lower())
    title_tokens = set(_content_tokens(title))
    for j, line in enumerate(lines_lower):
        if pattern.search(line) or pattern.search(re.sub(r'\s+', ' ', line).strip()):
            return j
        line_tokens = _content_tokens(line)
        if line_tokens and line_tokens[0] == item_no_token and title_tokens <= set(line_tokens):
            return j
    return None


def _find_closing_line(lines_lower, first):
    """Return the index of the first line at or after `first` containing all closing keywords, or None."""
    for j in range(first, len(lines_lower)):
        if all(word in lines_lower[j] for word in _CLOSING_KEYWORDS):
            return j
    return None


# For every meeting: parse the item summaries, locate each item in the full
# agenda text, and save one pickled DataFrame of items (with their raw content).
for idx, row in meetings_meta_df.iterrows():
    year = row['year']
    date = row['date']
    print(f"{date}... ", end='')
    meeting_dir = f"../../intermediate_data/cpc/{year}/{date}"

    # Agenda full text and agenda item summaries (manual overrides take precedence)
    agenda_text = _read_with_override(meeting_dir, "agenda")
    agenda_items = _read_with_override(meeting_dir, "agenda-item-summaries")

    # Records with year, date, item_no, title, summary and the is_* flags
    records = _parse_item_summaries(agenda_items, year, date)

    # Find the start line of each item
    agenda_text_lines = agenda_text.split('\n')
    lines_lower = [line.lower() for line in agenda_text_lines]
    for record in records:
        start = _find_start_line(record['item_no'], record['title'], lines_lower)
        if start is None:
            print("error")
            print(record['item_no'], record['title'])
            raise RuntimeError("Line containing agenda item not found")
        record['start_line'] = start

    # Check if start lines are monotonic (non-decreasing)
    previous_start = 0
    for record in records:
        if record['start_line'] < previous_start:
            raise RuntimeError("start lines not monotonic")
        previous_start = record['start_line']

    # Find the end line of each item: the line before the next item starts, or,
    # for the last item, the line before the "next meeting" announcement.
    for i, record in enumerate(records):
        if i < len(records) - 1:
            record['end_line'] = records[i + 1]['start_line'] - 1
        else:
            closing = _find_closing_line(lines_lower, record['start_line'] + 1)
            if closing is None:
                raise RuntimeError("Line containing next city planning commission meeting not found")
            record['end_line'] = closing - 1

    # Extract the content of each item (start and end lines inclusive)
    for record in records:
        record['content'] = '\n'.join(
            agenda_text_lines[record['start_line']:record['end_line'] + 1]
        )

    df = pd.DataFrame.from_dict(records)
    df.to_pickle(f"{meeting_dir}/agenda-items.pkl")


2018-05-10... 2018-05-23... 2018-06-14... 2018-07-12... 2018-07-26... 2018-08-09... 2018-08-23... 2018-09-13... 2018-09-27... 2018-10-11... 2018-10-25... 2018-11-08... 2018-11-29... 2018-12-13... 2018-12-20... 2019-01-10... 2019-01-24... 2019-02-14... 2019-02-28... 2019-03-14... 2019-03-28... 2019-04-11... 2019-05-09... 2019-05-23... 2019-06-13... 2019-06-27... 2019-07-11... 2019-07-25... 2019-08-08... 2019-08-22... 2019-09-12... 2019-09-26... 2019-10-10... 2019-10-24... 2019-11-14... 2019-11-21... 2019-12-12... 2019-12-19... 2020-01-09... 2020-01-23... 2020-02-13... 2020-03-12... 2020-04-23... 2020-05-14... 2020-05-28... 2020-06-04... 2020-06-11... 2020-06-25... 2020-07-09... 2020-07-23... 2020-08-13... 2020-08-27... 2020-09-10... 2020-09-17... 2020-09-24... 2020-10-08... 2020-10-15... 2020-10-22... 2020-11-05... 2020-11-19... 2020-12-03... 2020-12-10... 2020-12-17... 2021-01-14... 2021-01-28... 2021-02-11... 2021-02-18... 2021-02-25... 2021-03-11... 2021-03-25... 2021-04-08... 2021-0

In [8]:
# Spot-check: pick one meeting at random and display its saved agenda items.
sampled_meeting = meetings_meta_df.sample(1).iloc[0]
date = sampled_meeting['date']
year = date[:4]  # the year directory is taken from the first four characters of the date
items_path = f"../../intermediate_data/cpc/{year}/{date}/agenda-items.pkl"
df = pd.read_pickle(items_path)
df

Unnamed: 0,year,date,item_no,title,summary,is_cc_heading,is_cc_part,is_casenum,start_line,end_line,content
0,2024,2024-09-26,1,Director’s Report and Commission Business,This agenda item includes the election of offi...,False,False,False,91,102,1. DIRECTOR’S REPORT AND COMMISSION BUSI...
1,2024,2024-09-26,2,Neighborhood Council Position Statements on Ag...,Neighborhood Council representatives will pres...,False,False,False,103,111,2. NEIGHBORHOOD COUNCIL POSITION STATEME...
2,2024,2024-09-26,3,General Public Comment,The Commission will provide an opportunity for...,False,False,False,112,130,3. GENERAL PUBLIC COMMENT ...
3,2024,2024-09-26,4,Reconsiderations,The Commission may reconsider actions taken on...,False,False,False,131,139,4. RECONSIDERATIONS ...
4,2024,2024-09-26,5,Consent Calendar (No Items),"There are no items on the Consent Calendar, wh...",True,False,False,140,146,5. CONSENT CALENDAR (No Items) ...
5,2024,2024-09-26,6,CPC-2024-388-CA,The proposed Resident Protections ordinance ai...,False,False,True,147,204,6. CPC-2024-388-CA ...
6,2024,2024-09-26,7,CPC-2023-7068-CA,The proposed Citywide Housing Incentive Progra...,False,False,True,205,260,7. CPC-2023-7068-CA ...
7,2024,2024-09-26,8,CPC-2024-387-CA,The proposed Housing Element Sites and Minimum...,False,False,True,261,314,8. CPC-2024-387-CA ...
