In [1]:
import os
import sys
import yaml
from pathlib import Path
import pandas as pd
import numpy as np

# Load machine-specific settings. The path is relative, so it is resolved
# against the kernel's current working directory. The encoding is given
# explicitly (as for the other text files read in this notebook) so the
# result does not depend on the platform's default locale encoding.
with open('../../config.local.yaml', 'r', encoding='utf-8') as f:
    local_config = yaml.safe_load(f)

# Root directory under which this notebook reads and writes its data files.
LOCAL_PATH = local_config['LOCAL_PATH']

# Make the project's Python sources importable. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate entries.
python_src_dir = os.path.join(LOCAL_PATH, "src/python")
if python_src_dir not in sys.path:
    sys.path.append(python_src_dir)

In [2]:
# Read the supplemental-docs splits table and collect the distinct values of
# its 'date' column in sorted order.
splits_csv_path = os.path.join(LOCAL_PATH, "raw_data/cpc/supplemental-docs-splits.csv")
splits_df = pd.read_csv(splits_csv_path)

# NOTE(review): presumably these values are strings in the same format as the
# per-date directory names they are later compared against -- confirm.
SPLITS_DATE_LIST = sorted(splits_df['date'].unique())

In [3]:
def _count_pages(path):
    """Return the number of "<PAGE BREAK>"-delimited pages in a UTF-8 text file.

    The count is the number of segments produced by splitting on the marker,
    so a file without any marker (including an empty file) counts as 1.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return len(f.read().split("<PAGE BREAK>"))


# Manifest column -> file expected inside each <year>/<date> directory.
PAGE_COUNT_SOURCES = {
    'agenda_pages': "agenda.txt",
    'minutes_pages': "minutes.txt",
    'supdocs_pages': "supplemental-docs.txt",
}
# Explicit column list so the frame has these columns even when no meeting
# qualifies; without it an empty result has no 'date' column at all.
MANIFEST_COLUMNS = ['year', 'date', *PAGE_COUNT_SOURCES]

ROOT_DIR = Path(os.path.join(LOCAL_PATH, "intermediate_data/cpc"))
YEARS = sorted([d.name for d in ROOT_DIR.iterdir() if d.is_dir()])

# Walk <ROOT_DIR>/<year>/<date>/ in sorted order, one record per meeting.
records = []
for year in YEARS:
    YEAR_DIR = ROOT_DIR / year
    DATES = sorted([d.name for d in YEAR_DIR.iterdir() if d.is_dir()])
    for date in DATES:
        date_path = os.path.join(YEAR_DIR, date)
        paths = {
            column: os.path.join(date_path, filename)
            for column, filename in PAGE_COUNT_SOURCES.items()
        }
        # Only meetings that have all three files go into the manifest.
        if not all(os.path.exists(p) for p in paths.values()):
            continue
        record = {'year': year, 'date': date}
        record.update({column: _count_pages(p) for column, p in paths.items()})
        records.append(record)

# Build the frame once from the accumulated records.
df = pd.DataFrame(records, columns=MANIFEST_COLUMNS)

In [4]:
# Distinct values of the manifest's 'date' column, in sorted order.
DATE_LIST = sorted(df['date'].unique())

In [5]:
# Dates present in the manifest that the splits CSV does not list.
set_diff = set(DATE_LIST) - set(SPLITS_DATE_LIST)
if set_diff:
    print("Warning: These dates have all three files but are not contained in supplemental-docs-splits.csv")
    print(sorted(set_diff))

In [6]:
# Write the manifest to CSV with a header row and without the index column.
manifest_path = os.path.join(LOCAL_PATH, "intermediate_data/cpc/meetings-manifest.csv")
df.to_csv(manifest_path, index=False, header=True)