In [1]:
import os
import sys
import re
import time
import yaml
from pathlib import Path
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import warnings
import logging
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Load machine-specific settings (project root and tesseract executable) from a local YAML file.
# The relative path is resolved against the kernel's current working directory.
# An explicit encoding keeps the read independent of the platform's default locale encoding.
with open('../../config.local.yaml', 'r', encoding='utf-8') as f:
    local_config = yaml.safe_load(f)

LOCAL_PATH = local_config['LOCAL_PATH']
TESSERACT = local_config['TESSERACT']

# Extend the import path with <LOCAL_PATH>/src/python (presumably for project modules).
# Guarded so that re-running this cell does not append the same entry again.
_src_python_dir = os.path.join(LOCAL_PATH, "src/python")
if _src_python_dir not in sys.path:
    sys.path.append(_src_python_dir)
pytesseract.pytesseract.tesseract_cmd = TESSERACT

warnings.filterwarnings('default')
# Only messages at ERROR level or above from the "pdfminer" logger are emitted.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

DATA_DIR = Path(os.path.join(LOCAL_PATH, "raw_data/cpc"))

REPLACE = False  # whether to replace existing txt files

In [2]:
def extract_text_from_pdf(pdf_path, verbose=True, convert_images=True):
    """Extract the text of every page of a PDF, using OCR for pages without usable text.

    For each page, text is first extracted with pdfplumber. If that yields nothing
    but whitespace, the page is rendered to an image and passed to tesseract,
    provided ``convert_images`` is true and the page is within the size limits below.
    Every page contributes exactly one piece to the result; pages for which no
    text could be obtained contribute the literal placeholder ``'NO TEXT'``.

    Args:
        pdf_path: Path of the PDF file to read.
        verbose: If true, print a progress counter every 100 pages and a final
            summary line with the total page count.
        convert_images: If true, attempt OCR on pages with no extractable text.

    Returns:
        str: The per-page pieces joined with ``"\\n<PAGE BREAK>\\n"``.
    """
    # Page dimensions are divided by 72 and compared against these limits, i.e. the
    # limits are in inches if width/height are in PDF points (1/72 inch).
    # Presumably this avoids rasterizing very large sheets -- TODO confirm intent.
    points_per_inch = 72
    max_width_in = 17
    max_height_in = 22

    text_pieces = []
    pages = 0
    with pdfplumber.open(pdf_path) as pdf:
        for pages, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text(x_tolerance=1, layout=True)
            # A text layer made up only of whitespace is treated like a missing one,
            # so such pages still get a chance at OCR.
            if page_text and page_text.strip():
                text_pieces.append(page_text)
            elif (
                convert_images
                and (page.width / points_per_inch < max_width_in)
                and (page.height / points_per_inch < max_height_in)
            ):
                # Render only the current page (page numbers here are 1-based).
                images = convert_from_path(pdf_path, first_page=pages, last_page=pages)
                ocr_text = pytesseract.image_to_string(images[0])
                # Blank OCR output (whitespace / form feed only) counts as no text.
                if ocr_text and ocr_text.strip():
                    text_pieces.append(ocr_text.replace('|', 'I'))
                else:
                    text_pieces.append('NO TEXT')
            else:
                text_pieces.append('NO TEXT')
            if verbose and (pages % 100 == 0):
                print(f"{pages}... ", end='')
    if verbose:
        print(f"{pages} pages extracted.")
    return "\n<PAGE BREAK>\n".join(text_pieces)

In [3]:
# Walk DATA_DIR/<year>/<date>/, extract each PDF to
# LOCAL_PATH/intermediate_data/cpc/<year>/<date>/<name>.txt, and record one
# metadata entry (year, date, filename, page count) per PDF.
meta_df = []  # list of per-file record dicts

t0 = time.time()
years = sorted(d.name for d in DATA_DIR.iterdir() if d.is_dir())
for year in years:
    year_dir = Path(os.path.join(DATA_DIR, str(year)))
    dates = sorted(d.name for d in year_dir.iterdir() if d.is_dir())
    for date in dates:
        print(date)
        output_dir = Path(os.path.join(LOCAL_PATH, f"intermediate_data/cpc/{year}/{date}"))
        output_dir.mkdir(parents=True, exist_ok=True)
        pdf_dir = Path(os.path.join(year_dir, date))
        # Sorted so the processing order (and the metadata row order) is deterministic.
        files = sorted(entry.name for entry in pdf_dir.iterdir())
        for file in files:
            # splitext copes with names containing no dot or several dots,
            # where a two-way unpack of file.split('.') would raise ValueError.
            filename, ext = os.path.splitext(file)
            if ext.lower() != '.pdf':
                continue
            # Printed only for files that are actually processed, so skipped
            # entries do not leave an unterminated line in the output.
            print(f"  {file}: ", end='')
            # A sibling "<name>-override.pdf", when present, is read instead of the original.
            # NOTE(review): "<name>-override.pdf" is itself listed in `files` and therefore
            # also processed as its own entry -- confirm whether that is intended.
            override_file = os.path.join(pdf_dir, f"{filename}-override.pdf")
            if os.path.exists(override_file):
                pdf_file = override_file
            else:
                pdf_file = os.path.join(pdf_dir, file)
            output_file = os.path.join(output_dir, f"{filename}.txt")
            if (not REPLACE) and os.path.exists(output_file):
                # Reuse the earlier extraction; the text is still needed for the page count.
                with open(output_file, 'r', encoding='utf-8') as f:
                    text = f.read()
                print("already extracted")
            else:
                text = extract_text_from_pdf(pdf_file)
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(text)
            # Pages are delimited by the "<PAGE BREAK>" marker in the extracted text.
            pages = len(text.split("<PAGE BREAK>"))
            meta_df.append({
                'year': year,
                'date': date,
                'filename': filename,
                'pages': pages
            })
t1 = time.time()
print(f"Elapsed time = {(t1-t0)/60} minutes")

2003-01-09
  agenda.pdf: already extracted
2003-01-23
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-02-13
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-02-27
  agenda.pdf: already extracted
2003-03-13
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-03-27
  agenda.pdf: already extracted
2003-04-10
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-04-24
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-05-08
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-05-22
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-06-12
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-06-26
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-07-10
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-07-24
  agenda.pdf: already extracted
  audio.pdf: already extracted
2003-08-14
  agenda.pdf: already extracted
  audio.pdf: a

In [4]:
# Scratch check: display the raw-data directory for the current value of `year`.
# Relies on `year` still being set from the extraction loop in an earlier cell.
os.path.join(DATA_DIR, f"{year}")

'C:\\Users\\edwar\\projects\\lur-lm\\raw_data\\cpc\\2025'

In [6]:
# Turn the collected per-file records into a table and write it next to the extracted text.
# The plain DataFrame constructor is the documented entry point for a list of row dicts;
# naming the columns keeps the CSV header intact even if no records were collected.
meta_df = pd.DataFrame(meta_df, columns=['year', 'date', 'filename', 'pages'])
meta_df.to_csv(os.path.join(LOCAL_PATH, "intermediate_data/cpc/metadata.csv"), header=True, index=False)