In [1]:
import os
import sys
import re
import time
from pathlib import Path
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import warnings
import logging
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt

# Make the sibling ../python directory importable. The `api` and `text_tools`
# imports below presumably resolve from there (TODO confirm), so this line
# must run before them.
sys.path.append('../python')
# Apply the "default" warning action to every warning category.
warnings.filterwarnings('default')
# Only let ERROR-and-above records through from the "pdfminer" logger.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# Path to the raw data, relative to the notebook's working directory.
DATA_DIR = Path('../../raw_data/cpc')

# Project-local modules (imported after the sys.path tweak above).
import api
import text_tools as tt

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopwords corpus; nltk reports "already up-to-date" when it
# has been downloaded before.
nltk.download('stopwords')

# English stopwords as a set for fast membership tests.
stop_words = set(stopwords.words('english'))


  from tqdm.autonotebook import tqdm
[nltk_data] Downloading package stopwords to /Users/ekung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Meetings metadata table; the cells below read its 'year' and 'date' columns.
meetings_meta_path = "../../intermediate_data/cpc/meetings_metadata.csv"
meetings_meta_df = pd.read_csv(meetings_meta_path)

In [3]:
def _read_minutes_lines(meeting_dir):
    """Return the meeting's minutes text split into lines.

    ``minutes-manual-override.txt`` is used when it exists; otherwise
    ``minutes.txt`` is read.

    Raises:
        FileNotFoundError: if neither file exists.
    """
    try:
        with open(f"{meeting_dir}/minutes-manual-override.txt", 'r') as f:
            minutes_text = f.read()
    except FileNotFoundError:
        # Only a *missing* override falls back to the regular minutes. Any
        # other problem with the override (decode error, permissions, ...)
        # is surfaced instead of being silently ignored.
        with open(f"{meeting_dir}/minutes.txt", 'r') as f:
            minutes_text = f.read()
    return minutes_text.split('\n')


def _build_item_matchers(agenda_df):
    """Compile the heading regexes for every agenda item, once per meeting.

    Returns:
        A list of ``(item_no, is_cc_part, compiled_patterns)`` tuples in
        agenda order.
    """
    matchers = []
    for _, agenda_row in agenda_df.iterrows():
        item_no = agenda_row['item_no']
        # Escape the item number so it is always matched literally, even if
        # it contains regex metacharacters such as '.'.
        item_re = re.escape(str(item_no))
        patterns = [
            re.compile(rf"^item no[\s.]*{item_re}$"),    # e.g. "item no. 5"
            re.compile(rf"^item nos\..*\b{item_re}\b"),  # e.g. "item nos. 5a and 5b"
            re.compile(rf"^item no[\s.]*0{item_re}$"),   # zero-padded, e.g. "item no. 05"
        ]
        matchers.append((item_no, agenda_row['is_cc_part'], patterns))
    return matchers


def _find_item_sections(minutes_text_lines, agenda_df, year, date):
    """Locate every minutes line that is a heading for one or more agenda items.

    A line is a heading for an item when its lower-cased, stripped text matches
    one of the item's heading patterns, or - for items flagged ``is_cc_part`` -
    when the line consists of the item number alone.

    Returns:
        DataFrame with one row per heading line and columns ``year``, ``date``,
        ``start_line`` and ``relevant_items`` (item numbers in agenda order).
        The frame has no columns when no heading was found.
    """
    matchers = _build_item_matchers(agenda_df)
    sections = []
    for line_no, line in enumerate(minutes_text_lines):
        heading = line.lower().strip()
        relevant_items = [
            item_no
            for item_no, is_cc_part, patterns in matchers
            if any(pattern.search(heading) for pattern in patterns)
            or (is_cc_part and heading == item_no)
        ]
        if relevant_items:
            sections.append({
                'year': year,
                'date': date,
                'start_line': line_no,
                'relevant_items': relevant_items
            })
    return pd.DataFrame(sections)


def _find_minutes_end(minutes_text_lines, start_line):
    """Return the index of the first line after ``start_line`` that closes the
    minutes (a '____' rule or a 'no further business' sentence), or ``None``."""
    for line_no in range(start_line + 1, len(minutes_text_lines)):
        line = minutes_text_lines[line_no]
        if '____' in line or 'no further business' in line.lower():
            return line_no
    return None


def _attach_section_content(df, minutes_text_lines):
    """Add ``end_line`` and ``content`` columns to the sections frame (in place).

    Every section runs up to the line before the next section's heading. The
    last section runs up to the line before the end-of-minutes marker.

    Raises:
        RuntimeError: if the last section has no end-of-minutes marker after it.
    """
    df['end_line'] = -1
    df['content'] = ''
    last_pos = len(df) - 1
    # The frame was built from a list, so its index is 0..n-1 and label-based
    # .loc access by position is valid.
    for pos in range(len(df)):
        start_line = int(df.loc[pos, 'start_line'])
        if pos == last_pos:
            stop_line = _find_minutes_end(minutes_text_lines, start_line)
            # Decide on "was a marker found", not on the scan position: a
            # marker sitting on the very last line is a valid end.
            if stop_line is None:
                print(df.loc[pos, 'date'], df.loc[pos, 'relevant_items'])
                raise RuntimeError("couldn't find end line")
        else:
            stop_line = int(df.loc[pos + 1, 'start_line'])
        df.loc[pos, 'end_line'] = stop_line - 1
        df.loc[pos, 'content'] = '\n'.join(minutes_text_lines[start_line:stop_line])
    return df


def _collect_item_minutes(agenda_df, df, year, date):
    """Build one record per agenda item with the minutes text of its sections.

    Also validates the result: every item that is not a consent-calendar
    heading (``is_cc_heading``) must appear in at least one section, and when
    an item's title is a case number (``is_casenum``) and its content contains
    any case-number tokens, the title must be among them.

    Raises:
        RuntimeError: if either validation fails.
    """
    section_pairs = [
        (section['relevant_items'], section['content'])
        for _, section in df.iterrows()
    ]
    records = []
    for _, agenda_row in agenda_df.iterrows():
        item_no = agenda_row['item_no']
        title = agenda_row['title']
        is_cc_heading = agenda_row['is_cc_heading']
        is_cc_part = agenda_row['is_cc_part']
        is_casenum = agenda_row['is_casenum']

        matching = [text for items, text in section_pairs if item_no in items]
        content = ''.join('\n' + text for text in matching)

        if (not is_cc_heading) and (not matching):
            print(item_no, title)
            raise RuntimeError('not all agenda items found')

        case_tokens = [tok for tok in content.split() if tt.is_casenum(tok)]
        if is_casenum and len(case_tokens) > 0 and (title not in case_tokens):
            print(item_no, title)
            print(content)
            raise RuntimeError('case num not found in minutes content')

        records.append({
            'year': year,
            'date': date,
            'item_no': item_no,
            'title': title,
            'is_cc_heading': is_cc_heading,
            'is_cc_part': is_cc_part,
            'is_casenum': is_casenum,
            'content': content
        })
    return pd.DataFrame(records)


# For every meeting: split the minutes into per-heading sections, map them
# back onto the agenda items, validate, and write both tables next to the
# meeting's other intermediate files.
for _, meeting in meetings_meta_df.iterrows():
    year = meeting['year']
    date = meeting['date']
    print(f"{date}... ", end='')
    meeting_dir = f"../../intermediate_data/cpc/{year}/{date}"

    # Agenda df
    agenda_df = pd.read_pickle(f"{meeting_dir}/agenda-items.pkl")

    # Minutes full text
    minutes_text_lines = _read_minutes_lines(meeting_dir)

    # One row per heading line found in the minutes
    df = _find_item_sections(minutes_text_lines, agenda_df, year, date)
    df = _attach_section_content(df, minutes_text_lines)

    # One row per agenda item, with the concatenated text of its sections
    df2 = _collect_item_minutes(agenda_df, df, year, date)

    df.to_pickle(f"{meeting_dir}/minutes.pkl")
    df2.to_pickle(f"{meeting_dir}/minutes-items.pkl")


2018-05-10... 2018-05-23... 2018-06-14... 2018-07-12... 2018-07-26... 2018-08-09... 2018-08-23... 2018-09-13... 2018-09-27... 2018-10-11... 2018-10-25... 2018-11-08... 2018-11-29... 2018-12-13... 2018-12-20... 2019-01-10... 2019-01-24... 2019-02-14... 2019-02-28... 2019-03-14... 2019-03-28... 2019-04-11... 2019-05-09... 2019-05-23... 2019-06-13... 2019-06-27... 2019-07-11... 2019-07-25... 2019-08-08... 2019-08-22... 2019-09-12... 2019-09-26... 2019-10-10... 2019-10-24... 2019-11-14... 2019-11-21... 2019-12-12... 2019-12-19... 2020-01-09... 2020-01-23... 2020-02-13... 2020-03-12... 2020-04-23... 2020-05-14... 2020-05-28... 2020-06-04... 2020-06-11... 2020-06-25... 2020-07-09... 2020-07-23... 2020-08-13... 2020-08-27... 2020-09-10... 2020-09-17... 2020-09-24... 2020-10-08... 2020-10-15... 2020-10-22... 2020-11-05... 2020-11-19... 2020-12-03... 2020-12-10... 2020-12-17... 2021-01-14... 2021-01-28... 2021-02-11... 2021-02-18... 2021-02-25... 2021-03-11... 2021-03-25... 2021-04-08... 2021-0

In [7]:
# Spot check: pick one meeting at random and reload the two tables that the
# segmentation cell wrote for it.
sampled_meeting = meetings_meta_df.sample(1).iloc[0]
date = sampled_meeting['date']
year = date[:4]
meeting_dir = f"../../intermediate_data/cpc/{year}/{date}"
df = pd.read_pickle(f"{meeting_dir}/minutes.pkl")
df2 = pd.read_pickle(f"{meeting_dir}/minutes-items.pkl")

df

Unnamed: 0,year,date,start_line,relevant_items,end_line,content
0,2019,2019-12-19,29,[1],42,ITEM NO...
1,2019,2019-12-19,43,[2],67,ITEM NO...
2,2019,2019-12-19,68,[3],78,ITEM NO...
3,2019,2019-12-19,79,[4],85,ITEM NO...
4,2019,2019-12-19,86,"[5a, 5b]",216,ITEM NOS. 5...
5,2019,2019-12-19,217,[6],231,ITEM NO....


In [8]:
# Print the minutes text captured for one agenda item of the loaded meeting.
item_no = '5a'
item_rows = df2.loc[df2['item_no'] == item_no]
if item_rows.empty:
    # The meeting is sampled at random, so it may not have this item number;
    # report what is available instead of failing with an IndexError.
    print(f"item {item_no!r} is not in the loaded meeting; "
          f"available item numbers: {df2['item_no'].tolist()}")
else:
    row = item_rows.iloc[0]
    print(row['date'], row['item_no'], row['title'])
    print('')
    print(row['content'])


2019-12-19 5a CPC-2019-3844-VZCJ-SPR


                                   ITEM NOS. 5a and 5b                               
                                  CONSENT CALENDAR                                  
                                                                                    
         MOTION:                                                                    
         Commissioner Millman moved to approve the consent calendar. The action was seconded by
         Commissioner Choe:                                                         
                                                                                    
         CPC-2019-3844-VZCJ-SPR                         Council District: 6 – Martinez
         CEQA: ENV-2019-3845-MND                        Last Day to Act: 01-20-20   
         Plan Area: Mission Hills – Panorama City – North Hills                     
                                                                                    
         PUB