In [4]:
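# Dependencies: uncomment and run once if they are missing.
# (%pip installs into the environment of the running kernel.)
# %pip install pdfplumber
# %pip install openai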

In [5]:
import pdfplumber
from pathlib import Path

def extract_text_from_pdf(pdf_path: Path, output_txt_path: Path):
    """
    Dump the text of every page of a PDF into a single UTF-8 text file.

    Pages are read in order with pdfplumber. A page whose extracted text is
    empty or None is skipped; every other page is preceded by a
    "--- Page N ---" header, where N is the 1-based page position.

    Any exception raised while reading or writing is caught and reported
    with print() instead of being propagated.

    Args:
        pdf_path (Path): Location of the PDF to read.
        output_txt_path (Path): Location of the text file to write.
    """
    try:
        with pdfplumber.open(pdf_path) as document:
            chunks = []
            for number, page in enumerate(document.pages, start=1):
                page_text = page.extract_text()
                if not page_text:
                    # Nothing extractable on this page: no header, no body.
                    continue
                chunks.append(f"\n\n--- Page {number} ---\n\n")
                chunks.append(page_text)
            # The file is written once, after every page has been read.
            output_txt_path.write_text("".join(chunks), encoding='utf-8')
        print(f"Text successfully extracted to {output_txt_path}")
    except Exception as err:
        print(f"An error occurred while processing {pdf_path.name}: {err}")

def extract_text_from_pdfs_in_directory(directory_path: Path, output_directory: Path):
    """
    Convert every '*.pdf' file directly inside a directory to a text file.

    The output directory (including missing parents) is created first. Each
    PDF is then handed to extract_text_from_pdf together with a target path
    that reuses the PDF's stem with a '.txt' extension.

    Args:
        directory_path (Path): Directory that is searched (non-recursively)
            for files matching '*.pdf'.
        output_directory (Path): Directory that receives the text files.
    """
    output_directory.mkdir(parents=True, exist_ok=True)

    for source_pdf in directory_path.glob('*.pdf'):
        # Same base name as the PDF, only the extension changes.
        target_txt = output_directory / f"{source_pdf.stem}.txt"
        extract_text_from_pdf(source_pdf, target_txt)

if __name__ == "__main__":
    # Example run: convert every PDF in ../data/raw (relative to the current
    # working directory) into a text file under ../data/processed.
    data_path = Path.cwd().parent.joinpath('data')
    input_pdf_directory, output_text_directory = data_path / 'raw', data_path / 'processed'
    extract_text_from_pdfs_in_directory(input_pdf_directory, output_text_directory)


In [6]:

# Per-document extraction settings, keyed by a short document label.
# Each value holds the PDF's file name and the 1-based page at which
# extraction should start.
# NOTE(review): the key 'enfretamento' looks like a typo of 'enfrentamento';
# it is kept unchanged because other cells may look it up by this exact key.
metadata_pdfs = {
    label: {'filename': filename, 'start_page': first_page}
    for label, filename, first_page in (
        ('nacional',     'plano-acao-adaptacao-climatica-nacional.pdf',         6),
        ('agro',         'plano-acao-climatica-agro.pdf',                      20),
        ('curitiba',     'plano-acao-climatica-curitiba.pdf',                  16),
        ('federal',      'plano-acao-climatica-federal.pdf',                   31),
        ('itabirito',    'plano-acao-climatica-itabirito.pdf',                 19),
        ('joao_pessoa',  'plano-acao-climatica-joao-pessoa.pdf',               14),
        ('sao_paulo',    'plano-acao-climatica-sp-regiao.pdf',                 11),
        ('enfretamento', 'plano-enfrentamento-mudanca-climatica-nacional.pdf',  1),
    )
}

In [10]:
def extract_text_from_pdf(pdf_path: Path, output_txt_path: Path, start_page: int = 1):
    """
    Extract the text of a PDF, starting at a given page, into a UTF-8 text file.

    Pages before ``start_page`` are ignored. Every remaining page that yields
    text is preceded by a "--- Page N ---" header, where N is the page's
    1-based position in the PDF (so the first header for ``start_page=6`` reads
    "Page 6"). Pages whose extracted text is empty or None are skipped.

    Failures are reported with print() rather than raised; this includes an
    invalid ``start_page``, in which case no output file is written. If
    ``start_page`` is larger than the number of pages, an empty file is written.

    Args:
        pdf_path (Path): Path to the input PDF file.
        output_txt_path (Path): Path to the output text file.
        start_page (int): 1-based number of the first page to extract.
            Defaults to 1 (the whole document).
    """
    try:
        # A value below 1 would turn into a negative/odd slice start
        # (e.g. 0 -> pages[-1:], i.e. only the last page), so reject it and
        # let the common handler below report it.
        if start_page < 1:
            raise ValueError(f"start_page must be >= 1, got {start_page}")

        with pdfplumber.open(pdf_path) as pdf:
            parts = []
            # Number pages from start_page so headers match the PDF's pages.
            for page_number, page in enumerate(pdf.pages[start_page - 1:], start=start_page):
                text = page.extract_text()
                if text:
                    parts.append(f"\n\n--- Page {page_number} ---\n\n")
                    parts.append(text)
            # Write the extracted text to the output file in one go.
            output_txt_path.write_text("".join(parts), encoding='utf-8')
        print(f"Text successfully extracted to {output_txt_path}")
    except Exception as e:
        print(f"An error occurred while processing {pdf_path.name}: {e}")

In [11]:
# Resolve the data folders relative to the parent of the current working directory.
data_path = Path.cwd().parent / 'data'
input_pdf_directory = data_path / 'raw'
output_text_directory = data_path / 'processed'

output_text_directory.mkdir(parents=True, exist_ok=True)

# Only the per-document settings are needed here, not the dictionary keys.
for metadata in metadata_pdfs.values():
    pdf_filename = metadata['filename']
    # Path.stem drops the final suffix whatever its length, unlike slicing
    # off a fixed four characters.
    txt_filename = Path(pdf_filename).stem + '.txt'
    output_txt_path = output_text_directory / txt_filename
    extract_text_from_pdf(
        input_pdf_directory / pdf_filename,
        output_txt_path,
        start_page=metadata['start_page'],
    )




Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-adaptacao-climatica-nacional.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-agro.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-curitiba.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-federal.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-itabirito.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-joao-pessoa.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-acao-climatica-sp-regiao.txt
Text successfully extracted to c:\Users\esdra\Documents\BCG_Challenge\data\processed\plano-enfrentamento-mudanca-climatica-nacional.txt
