In [1]:
import os
import sys
import re
from pathlib import Path
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import warnings
import logging
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Add the relative '../python' directory to the module search path
# (presumably so the local `api` module imported below resolves — TODO confirm).
sys.path.append('../python')
# Install a catch-all warning filter with the 'default' action.
warnings.filterwarnings('default')
# Only let records of level ERROR and above through the "pdfminer" logger.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

import api


  from tqdm.autonotebook import tqdm


In [2]:
# Load the metadata table; later cells read its `filename`, `pages`, `year`
# and `date` columns.
metadata_csv_path = "../../intermediate_data/cpc/metadata.csv"
meta_df = pd.read_csv(metadata_csv_path)

In [3]:
# Map each per-type page column to the `filename` value it counts.
page_columns = {
    'agenda_pages': 'agenda',
    'supplemental_pages': 'supplemental-docs',
    'minutes_pages': 'minutes',
}

# Each new column holds `pages` on rows of its document type and 0 elsewhere
# (the boolean mask acts as a 0/1 multiplier).
for page_col, doc_type in page_columns.items():
    meta_df[page_col] = (meta_df['filename'] == doc_type) * (meta_df['pages'])

# Collapse to one row per meeting (year, date), summing pages per document type.
meta_df2 = (
    meta_df
    .groupby(['year', 'date'])
    .agg(**{page_col: (page_col, 'sum') for page_col in page_columns})
    .reset_index()
)

# `has_all` flags meetings where all three document types have pages;
# `total_pages` is the page count across the three types.
meeting_page_counts = meta_df2[list(page_columns)]
meta_df2['has_all'] = (meeting_page_counts > 0).all(axis=1)
meta_df2['total_pages'] = meeting_page_counts.sum(axis=1)

In [4]:
# Count the rows of each meeting's supplemental-docs pickle. Meetings without
# supplemental pages keep the initial count of 0 and no file is read for them.
meta_df2['total_supplemental_docs'] = 0
meetings_with_supplements = meta_df2[meta_df2['supplemental_pages'] > 0]
for row_label, year, date in zip(
    meetings_with_supplements.index,
    meetings_with_supplements['year'],
    meetings_with_supplements['date'],
):
    supplemental_docs_df = pd.read_pickle(
        f"../../intermediate_data/cpc/{year}/{date}/supplemental-docs.pkl"
    )
    meta_df2.loc[row_label, 'total_supplemental_docs'] = len(supplemental_docs_df)

In [5]:
# Headline numbers, restricted to meetings flagged `has_all` (agenda,
# supplemental documents and minutes all present).
idx = meta_df2['has_all']
complete_meetings = meta_df2.loc[idx]
complete_years = complete_meetings['year'].astype('int')

n_meetings = len(complete_meetings)
n_years = complete_years.nunique()
min_year = complete_years.min()
max_year = complete_years.max()
# Cast the totals to plain ints so they can be formatted with the integer
# presentation type below.
n_pages = int(complete_meetings['total_pages'].sum())
n_supplemental_docs = int(complete_meetings['total_supplemental_docs'].sum())

print(f"{n_meetings} meetings")
print(f"across {n_years} years from {min_year} to {max_year}")
# Use ',d' rather than ',g': 'g' keeps only 6 significant digits and switches
# to scientific notation for totals of one million or more.
print(f"totaling {n_pages:,d} pages of documents")
print(f"across {n_supplemental_docs:,d} supplemental documents")

150 meetings
across 7 years from 2018 to 2024
totaling 23,430 pages of documents
across 6,423 supplemental documents


In [6]:
# Show the per-meeting summary table as the cell's output.
meta_df2

Unnamed: 0,year,date,agenda_pages,supplemental_pages,minutes_pages,has_all,total_pages,total_supplemental_docs
0,2003,2003-01-09,6,0,0,False,6,0
1,2003,2003-01-23,6,0,0,False,6,0
2,2003,2003-02-13,5,0,0,False,5,0
3,2003,2003-02-27,1,0,0,False,1,0
4,2003,2003-03-13,5,0,0,False,5,0
...,...,...,...,...,...,...,...,...
569,2025,2025-02-13,8,27,0,False,35,17
570,2025,2025-02-27,6,91,0,False,97,41
571,2025,2025-03-13,7,845,0,False,852,27
572,2025,2025-03-27,8,93,0,False,101,18
