Order of operations:

- extract-cpc-docs.ipynb
- split-cpc-docs.ipynb

In [1]:
import os
import sys
import re
import time
from pathlib import Path
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import warnings
import logging
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Make modules that live in ../python importable from this notebook.
sys.path.append('../python')
# Apply the 'default' warning action to all warnings.
warnings.filterwarnings('default')
# Only let ERROR-and-above records through from the "pdfminer" logger
# (presumably to quiet PDF-parsing chatter from pdfplumber -- TODO confirm).
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# Root directory whose <year>/<date> sub-directories are walked by the driver cell.
DATA_DIR = Path('../../intermediate_data/cpc')

In [2]:
# Load the split table; blank cells (e.g. an empty `notes`) become '' so that
# `if not notes` downstream treats them as "no note".
splits_df = pd.read_csv("../../raw_data/cpc/cpc-supplemental-docs-splits.csv").fillna('')

# Lookup used by extract_contents: (date, start page) -> (end page, notes).
# Iterating the columns in parallel avoids DataFrame.iterrows(), which builds a
# Series per row (coercing the row to a common dtype) and is much slower.
# If a (date, start) pair appears more than once, the last row wins.
splits_dict = {
    (date, start): (end, notes)
    for date, start, end, notes in zip(
        splits_df['date'],
        splits_df['start'],
        splits_df['end'],
        splits_df['notes'],
    )
}

In [3]:
def extract_contents(date, splits=None, base_dir="../../intermediate_data/cpc"):
    """Split one date's ``supplemental-docs.txt`` into per-document rows.

    The text file is cut into pages on the ``<PAGE BREAK>`` marker (pages are
    numbered from 1). Pages are then walked in order:

    * if ``(date, page)`` is a key in the split table, the pages from that
      start page through the listed end page form one document; when the
      entry carries a non-empty note the text is replaced by
      ``"SKIPPED (<notes>)"``;
    * otherwise the single page is its own document.

    Parameters
    ----------
    date : str
        Date string whose first four characters are the year; it is also the
        name of the directory holding the text file.
    splits : dict, optional
        Mapping ``(date, start_page) -> (end_page, notes)``. Defaults to the
        notebook-level ``splits_dict``.
    base_dir : str or Path, optional
        Directory containing the ``<year>/<date>/supplemental-docs.txt`` tree.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``year``, ``date``, ``doc_id``
        (0-based, in page order), ``start_page``, ``end_page`` and ``content``.

    Raises
    ------
    ValueError
        If a split entry's end page is not a whole number or lies before its
        start page (the latter would otherwise make the page walk loop forever).

    Notes
    -----
    An end page beyond the last page is not an error: the slice simply stops
    at the final page while ``end_page`` keeps the value from the split table.
    """
    if splits is None:
        # Fall back to the lookup built earlier in the notebook.
        splits = splits_dict

    year = date[0:4]
    filename = Path(base_dir) / year / date / "supplemental-docs.txt"
    with open(filename, 'r') as f:
        text = f.read()
    pages = text.split("<PAGE BREAK>")
    n_pages = len(pages)

    records = []
    curr_page = 1
    while curr_page <= n_pages:
        start_page = curr_page
        split = splits.get((date, curr_page))
        if split is None:
            # No split entry: the page stands alone as a document.
            end_page = curr_page
            content = pages[start_page - 1]
        else:
            raw_end, notes = split
            # The end page may arrive as a float (a column containing NaN) or
            # as '' (a blank cell after fillna); normalise it or fail clearly.
            try:
                end_page = int(raw_end)
            except (TypeError, ValueError):
                end_page = None
            if end_page is None or end_page != raw_end or end_page < start_page:
                raise ValueError(
                    f"Invalid split for {date} starting at page {start_page}: "
                    f"end page {raw_end!r} must be a whole number >= start page"
                )
            if notes:
                content = f"SKIPPED ({notes})"
            else:
                content = "<PAGE BREAK>".join(pages[start_page - 1:end_page])
        records.append({
            'year': year,
            'date': date,
            'doc_id': len(records),
            'start_page': start_page,
            'end_page': end_page,
            'content': content
        })
        # Resume right after the document just emitted.
        curr_page = end_page + 1

    df = pd.DataFrame(
        records,
        columns=['year', 'date', 'doc_id', 'start_page', 'end_page', 'content'],
    )
    print(f"{len(df)} documents extracted")
    return df

In [4]:
# Walk DATA_DIR/<year>/<date>/ in sorted order; for every date directory that
# holds a supplemental-docs.txt, split it into documents and pickle the
# resulting DataFrame next to the text file.
t0 = time.time()
years = sorted(d.name for d in DATA_DIR.iterdir() if d.is_dir())
for year in years:
    year_dir = DATA_DIR / year
    dates = sorted(d.name for d in year_dir.iterdir() if d.is_dir())
    for date in dates:
        date_dir = year_dir / date
        # Check for the one file needed instead of listing the whole directory.
        if (date_dir / 'supplemental-docs.txt').exists():
            print(date)
            df = extract_contents(date)
            df.to_pickle(date_dir / "supplemental-docs.pkl")
t1 = time.time()
print(f"Elapsed time = {(t1-t0)/60} minutes")

2018-05-10
26 documents extracted
2018-05-23
10 documents extracted
2018-06-14
23 documents extracted
2018-07-12
26 documents extracted
2018-07-26
42 documents extracted
2018-08-09
39 documents extracted
2018-08-23
6 documents extracted
2018-09-13
79 documents extracted
2018-09-27
13 documents extracted
2018-10-11
67 documents extracted
2018-10-25
53 documents extracted
2018-11-08
10 documents extracted
2018-11-29
16 documents extracted
2018-12-13
21 documents extracted
2018-12-20
10 documents extracted
2019-01-10
18 documents extracted
2019-01-24
40 documents extracted
2019-02-14
12 documents extracted
2019-02-28
5 documents extracted
2019-03-14
16 documents extracted
2019-03-28
22 documents extracted
2019-04-11
25 documents extracted
2019-05-09
25 documents extracted
2019-05-23
112 documents extracted
2019-06-13
5 documents extracted
2019-06-27
15 documents extracted
2019-07-11
70 documents extracted
2019-07-25
58 documents extracted
2019-08-08
79 documents extracted
2019-08-22
7 doc