In [5]:
!pip install pdfplumber
!pip install openai

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting Pillow>=9.1 (from pdfplumber)
  Downloading pillow-10.4.0-cp312-cp312-win_amd64.whl.metadata (9.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Collecting charset-normalizer>=2.0.0 (from pdfminer.six==20231228->pdfplumber)
  Downloading charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl.metadata (34 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20231228->pdfplumber)
  Downloading cryptography-43.0.1-cp39-abi3-win_amd64.whl.metadata (5.4 kB)
Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)
  Downloading cffi-1.17.1-cp312-cp312-win_amd64.whl.metadata (1.6 kB)
Collecting pycparser (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228->pdfplu

In [8]:
import pdfplumber
from pathlib import Path

def extract_text_from_pdf(pdf_path: Path, output_txt_path: Path) -> None:
    """
    Extract the text of every page of a PDF and write it to a UTF-8 text file.

    Each page that yields text is written as a ``--- Page N ---`` marker
    (N is the 1-based page number) followed by the page text. Pages for which
    ``extract_text()`` returns nothing are skipped, so marker numbers may
    have gaps.

    Errors are not raised: any exception is caught and reported with
    ``print`` so that a caller looping over many files keeps going.

    Args:
        pdf_path (Path): Path to the input PDF file.
        output_txt_path (Path): Path to the output text file. Missing parent
            directories are created.

    Returns:
        None
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Collect per-page chunks and join once at the end instead of
            # growing one string with `+=` on every page.
            page_chunks = []
            for page_number, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if text:
                    page_chunks.append(f"\n\n--- Page {page_number} ---\n\n{text}")
        full_text = "".join(page_chunks)
        # Make sure the destination folder exists so a missing directory
        # does not turn into a swallowed write error.
        output_txt_path.parent.mkdir(parents=True, exist_ok=True)
        output_txt_path.write_text(full_text, encoding='utf-8')
        print(f"Text successfully extracted to {output_txt_path}")
    except Exception as e:
        # Deliberate best-effort: report the failure and return normally.
        print(f"An error occurred while processing {pdf_path.name}: {e}")

def extract_text_from_pdfs_in_directory(directory_path: Path, output_directory: Path) -> None:
    """
    Extract text from every PDF file directly inside a directory.

    For each PDF ``<name>.pdf`` found in ``directory_path`` (not recursive),
    ``extract_text_from_pdf`` is called to write ``<name>.txt`` into
    ``output_directory``. Files are processed in sorted path order, and the
    ``.pdf`` extension is matched case-insensitively so ``.PDF`` files are
    picked up on case-sensitive filesystems as well.

    Args:
        directory_path (Path): Path to the directory containing PDF files.
        output_directory (Path): Path to the directory where text files will
            be saved. It is created (with parents) if it does not exist.

    Returns:
        None
    """
    # Ensure the output directory exists
    output_directory.mkdir(parents=True, exist_ok=True)

    # glob('*.pdf') is case-sensitive on POSIX and yields entries in arbitrary
    # order; filter on the lower-cased suffix and sort for reproducible runs.
    pdf_files = sorted(
        entry
        for entry in directory_path.glob('*')
        if entry.is_file() and entry.suffix.lower() == '.pdf'
    )

    for pdf_file in pdf_files:
        output_txt_path = output_directory / (pdf_file.stem + '.txt')
        extract_text_from_pdf(pdf_file, output_txt_path)

if __name__ == "__main__":
    # Demo run: PDFs are read from ../data/raw and the extracted text is
    # written to ../data/processed, both relative to the current working
    # directory.
    data_path = Path.cwd().parent.joinpath('data')
    input_pdf_directory = data_path.joinpath('raw')
    output_text_directory = data_path.joinpath('processed')
    extract_text_from_pdfs_in_directory(
        directory_path=input_pdf_directory,
        output_directory=output_text_directory,
    )


Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-adaptacao-climatica-nacional.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-agro.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-curitiba.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-federal.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-itabirito.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-joao-pessoa.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-sp-regiao.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-enfrentamento-mudanca-climatica-nacional.txt
