In [None]:
import pypdf

def ingest_pdf(file_path) -> str:
    """
    Extract the text of every page of a pdf file as a single string.

    Args:
        file_path: Path to the pdf file; anything the built-in ``open`` accepts.

    Returns:
        str: The text of all pages joined in page order, with no separator
            inserted between pages. Empty string if the file has no pages.
    """
    with open(file_path, 'rb') as file:
        reader = pypdf.PdfReader(file)
        # Pages are read while the stream is still open. A page that yields no
        # text (None or "") contributes nothing instead of breaking the
        # concatenation with a TypeError.
        return "".join(page.extract_text() or "" for page in reader.pages)

In [11]:
# Extract the raw text of the sample pdf and print it for inspection.
text = ingest_pdf("sample.pdf")
print(text)

Sample PDFThis is a simple PDF ﬁle. Fun fun fun.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Phasellus facilisis odio sed mi. 
Curabitur suscipit. Nullam vel nisi. Etiam semper ipsum ut lectus. Proin aliquam, erat eget 
pharetra commodo, eros mi condimentum quam, sed commodo justo quam ut velit. 
Integer a erat. Cras laoreet ligula cursus enim. Aenean scelerisque velit et tellus. 
Vestibulum dictum aliquet sem. Nulla facilisi. Vestibulum accumsan ante vitae elit. Nulla 
erat dolor, blandit in, rutrum quis, semper pulvinar, enim. Nullam varius congue risus. 
Vivamus sollicitudin, metus ut interdum eleifend, nisi tellus pellentesque elit, tristique 
accumsan eros quam et risus. Suspendisse libero odio, mattis sit amet, aliquet eget, 
hendrerit vel, nulla. Sed vitae augue. Aliquam erat volutpat. Aliquam feugiat vulputate nisl. 
Suspendisse quis nulla pretium ante pretium mollis. Proin velit ligula, sagittis at, egestas a, 
pulvinar quis, nisl.
Pellentesque sit amet lectus. P

In [8]:
import os
from pathlib import Path
from typing import List

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document

def load_document(
    file_path: Path,
    chunk_size: int = 3000,
    chunk_overlap: int = 300,
) -> List[Document]:
    """
    Extract and chunk text from pdf file at the given path.

    Args:
        file_path (Path): Path to the pdf file. A plain string path is accepted
            as well and converted to a Path.
        chunk_size (int): Maximum chunk size passed to the text splitter.
            Defaults to 3000.
        chunk_overlap (int): Overlap between consecutive chunks passed to the
            text splitter. Defaults to 300.

    Raises:
        FileNotFoundError: if the file does not exist.
        NotImplementedError: if the file extension is not pdf.

    Returns:
        List[Document]: A list of Document objects containing chunks of the extracted text.
            Each document has the following attributes: metadata (dict), page_content (str).
    """
    file_path = Path(file_path)

    if not file_path.is_file():
        raise FileNotFoundError(f"File not found: {file_path}")

    # Compare the extension case-insensitively so "report.PDF" is handled
    # the same way as "report.pdf".
    if file_path.suffix.lower() != ".pdf":
        raise NotImplementedError(f"File type not supported: {file_path}")

    # Hand the loader a string path.
    # NOTE(review): some langchain_community releases appear to expect a str
    # here rather than a Path; str() works with either — confirm against the
    # installed version.
    loader = PyPDFLoader(str(file_path))
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return loader.load_and_split(text_splitter=splitter)

In [9]:
# Load and chunk the sample pdf; the cell displays how many chunks came back.
pages = load_document(Path("sample.pdf"))
len(pages)

1

In [10]:
# Show the text of the first chunk.
pages[0].page_content

'Sample PDFThis is a simple PDF ﬁle. Fun fun fun.\nLorem ipsum dolor sit amet, consectetuer adipiscing elit. Phasellus facilisis odio sed mi. \nCurabitur suscipit. Nullam vel nisi. Etiam semper ipsum ut lectus. Proin aliquam, erat eget \npharetra commodo, eros mi condimentum quam, sed commodo justo quam ut velit. \nInteger a erat. Cras laoreet ligula cursus enim. Aenean scelerisque velit et tellus. \nVestibulum dictum aliquet sem. Nulla facilisi. Vestibulum accumsan ante vitae elit. Nulla \nerat dolor, blandit in, rutrum quis, semper pulvinar, enim. Nullam varius congue risus. \nVivamus sollicitudin, metus ut interdum eleifend, nisi tellus pellentesque elit, tristique \naccumsan eros quam et risus. Suspendisse libero odio, mattis sit amet, aliquet eget, \nhendrerit vel, nulla. Sed vitae augue. Aliquam erat volutpat. Aliquam feugiat vulputate nisl. \nSuspendisse quis nulla pretium ante pretium mollis. Proin velit ligula, sagittis at, egestas a, \npulvinar quis, nisl.\nPellentesque sit a