# PDF text extraction

**Description**: This notebook demonstrates how to extract text from a PDF file page by page using the `pypdfium2` library, dropping the lines detected as page headers and footers.

## Imports

In [None]:
import pypdfium2 as pdfium

from src.pdf_reader.helpers import detect_header_footer

## Load PDF document

In [None]:
# Location of the PDF to process. This is a relative path, so it is resolved
# against the kernel's current working directory.
# NOTE(review): presumably the notebook is meant to be run from the repository
# root -- confirm before running it from another directory.
pdf_path = "data/pdf_docs/a-practical-guide-to-building-agents.pdf"

In [None]:
# Open the document once; the cells below reuse this `pdf` handle.
pdf = pdfium.PdfDocument(pdf_path)

# Report how many pages were loaded as a quick sanity check.
num_pages = len(pdf)
print(f"Length of PDF: {num_pages} pages")

## Detect header and footer

In [None]:
# Collect the lines that the project helper classifies as header/footer text.
# NOTE(review): the helper's return type is not visible here; the extraction
# cell only relies on it supporting `in` membership tests.
header_footer_lines = detect_header_footer(document=pdf)

# Preview at most five of the detected entries (shown as the cell output).
[*header_footer_lines][:5]

## Extract text from document pages

In [None]:
# Maps the 0-based page index to that page's text with header/footer lines removed.
text_per_page = {}
num_pages = len(pdf)

for page_id in range(num_pages):
    print("---------------------------------------------")
    print(f"Page {page_id + 1} of {num_pages}")

    # pypdfium2 appears to separate lines with "\r\n" by default;
    # str.splitlines() below treats that sequence as a single line break.
    page_text = pdf[page_id].get_textpage().get_text_bounded()

    # Keep every line whose whitespace-stripped form is not a known
    # header/footer entry. Kept lines are stored unstripped.
    page_lines = []
    for raw_line in page_text.splitlines():
        if raw_line.strip() in header_footer_lines:
            continue
        page_lines.append(raw_line)

    page_text_without_header_footer = "\n".join(page_lines)
    print(page_text_without_header_footer)

    text_per_page[page_id] = page_text_without_header_footer