In [1]:
import os
import sys
import time
import yaml
import pandas as pd
import numpy as np

# Load the local configuration file; its path is relative to the notebook's
# working directory.
with open('../../config.local.yaml', 'r') as config_file:
    local_config = yaml.safe_load(config_file)

# Base directory that the data and source paths in this notebook are joined onto.
LOCAL_PATH = local_config['LOCAL_PATH']

# Put the project's Python sources on the import search path.
sys.path.append(
    os.path.join(LOCAL_PATH, "src/python")
)

In [2]:
# Read the meetings manifest and keep its dates in ascending order; DATES
# drives the per-meeting extraction loop further down.
meetings_df = pd.read_csv(
    os.path.join(LOCAL_PATH, "intermediate_data/cpc/meetings-manifest.csv")
)
DATES = sorted(meetings_df['date'].tolist())

In [3]:
# Read the split table and blank out missing cells, so that an absent note
# becomes the empty string.
splits_df = pd.read_csv(
    os.path.join(LOCAL_PATH, "raw_data/cpc/supplemental-docs-splits.csv")
).fillna('')

# Map (date, start page) -> (end page, notes). When the same key appears on
# several rows, the last row wins.
splits_dict = {
    (record['date'], record['start']): (record['end'], record['notes'])
    for _, record in splits_df.iterrows()
}

In [4]:
def extract_contents(date):
    """Split one meeting's supplemental-docs text into one row per document.

    Reads ``intermediate_data/cpc/<year>/<date>/supplemental-docs.txt`` under
    ``LOCAL_PATH``, cuts it into pages on the ``<PAGE BREAK>`` marker and walks
    the pages in order. A page that is the start of an entry in the
    module-level ``splits_dict`` (keyed by ``(date, start_page)``) opens a
    document that runs through that entry's end page; every other page is a
    one-page document. Entries whose notes are non-empty are not extracted:
    their content is the placeholder ``SKIPPED (<notes>)``.

    Parameters
    ----------
    date : str
        Meeting date; its first four characters are used as the year folder.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``year``, ``date``, ``doc_id``
        (0-based, in page order), ``start_page`` and ``end_page`` (1-based,
        inclusive) and ``content``.

    Raises
    ------
    ValueError
        If a split entry's end page is not a whole number, or lies before the
        entry's start page.
    """
    year = date[0:4]
    filename = os.path.join(LOCAL_PATH, f"intermediate_data/cpc/{year}/{date}/supplemental-docs.txt")
    with open(filename, 'r', encoding='utf-8') as f:
        pages = f.read().split("<PAGE BREAK>")
    n_pages = len(pages)

    records = []
    curr_page = 1
    while curr_page <= n_pages:
        start_page = curr_page
        split = splits_dict.get((date, curr_page))
        if split is None:
            # No split entry starts here: the page is a document on its own.
            end_page = curr_page
            content = pages[start_page - 1]
        else:
            raw_end, notes = split
            # A CSV column with blanks is read as float (or '' after fillna),
            # so normalise to int before it is used as a slice bound.
            try:
                end_page = int(raw_end)
            except (TypeError, ValueError, OverflowError):
                raise ValueError(
                    f"{date}: split starting at page {start_page} has an "
                    f"invalid end page {raw_end!r}"
                ) from None
            if end_page != raw_end:
                raise ValueError(
                    f"{date}: split starting at page {start_page} has a "
                    f"non-integer end page {raw_end!r}"
                )
            if end_page < start_page:
                # Without this guard curr_page would stop advancing and the
                # loop would never terminate.
                raise ValueError(
                    f"{date}: split starting at page {start_page} ends "
                    f"before it starts (end page {end_page})"
                )
            if not notes:
                # Slicing silently truncates when end_page exceeds n_pages.
                content = "<PAGE BREAK>".join(pages[(start_page - 1):end_page])
            else:
                content = f"SKIPPED ({notes})"
        records.append({
            'year': year,
            'date': date,
            'doc_id': len(records),
            'start_page': start_page,
            'end_page': end_page,
            'content': content
        })
        curr_page = end_page + 1

    df = pd.DataFrame(records)
    print(f"{date}: {len(df)} documents extracted")
    return df

In [5]:
# Run the extraction for every meeting date and pickle each resulting frame
# into that meeting's folder.
# perf_counter is monotonic, so the elapsed figure is unaffected by system
# clock adjustments during the run.
t0 = time.perf_counter()
for date in DATES:
    year = date[0:4]
    df = extract_contents(date)
    df.to_pickle(os.path.join(
        LOCAL_PATH, f"intermediate_data/cpc/{year}/{date}/supplemental-docs.pkl"
    ))
t1 = time.perf_counter()
print(f"Elapsed time = {(t1-t0)/60} minutes")

2018-05-10: 26 documents extracted
2018-05-23: 10 documents extracted
2018-06-14: 23 documents extracted
2018-07-12: 26 documents extracted
2018-07-26: 42 documents extracted
2018-08-09: 39 documents extracted
2018-08-23: 6 documents extracted
2018-09-13: 79 documents extracted
2018-09-27: 13 documents extracted
2018-10-11: 67 documents extracted
2018-10-25: 53 documents extracted
2018-11-08: 10 documents extracted
2018-11-29: 16 documents extracted
2018-12-13: 21 documents extracted
2018-12-20: 10 documents extracted
2019-01-10: 18 documents extracted
2019-01-24: 40 documents extracted
2019-02-14: 12 documents extracted
2019-02-28: 5 documents extracted
2019-03-14: 16 documents extracted
2019-03-28: 22 documents extracted
2019-04-11: 25 documents extracted
2019-05-09: 25 documents extracted
2019-05-23: 112 documents extracted
2019-06-13: 5 documents extracted
2019-06-27: 15 documents extracted
2019-07-11: 70 documents extracted
2019-07-25: 58 documents extracted
2019-08-08: 79 documen