In [None]:
# Langchain dependencies
from langchain.document_loaders.pdf import PyPDFDirectoryLoader # Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
from langchain.embeddings import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.schema import Document # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from dotenv import load_dotenv # Importing dotenv to get API key from .env file
from langchain.chat_models import ChatOpenAI # Import OpenAI LLM
import os # Importing os module for operating system functionalities
import shutil # Importing shutil module for high-level file operations

In [None]:
# Directory containing the PDF files; it is passed to PyPDFDirectoryLoader below.
# NOTE(review): this is an absolute path at the filesystem root — confirm that
# is intended rather than a project-relative 'data/' directory.
DATA_PATH = '/data/'
def load_documents():
    """
    Load the PDF files located in the DATA_PATH directory
    return:
    Whatever PyPDFDirectoryLoader.load() yields for that directory
    (the loaded PDFs as Langchain Document objects)
    """

    # Build the directory loader and load in one chained expression
    loaded_pdfs = PyPDFDirectoryLoader(DATA_PATH).load()
    return loaded_pdfs

# Load the PDFs when this cell runs and keep the result in `documents`
documents = load_documents()
# Inspect contents of the first document as well as the metadata.
# Guard the lookup: indexing an empty result would raise IndexError.
if documents:
    print(documents[0])
else:
    print("No PDF documents were loaded; nothing to inspect.")

In [None]:
def split_text_into_chunks(documents: list[Document]):
    """
    Split text content of given list of Documents into smaller chunks
    args:
    documents (list[Document]): List of Document objects containing text content
    return:
    list[Document]: List of Document objects representing the split chunks
    (empty when the splitter produces no chunks)
    """

    # Initialize text splitter with following parameters
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300, # Target size of each chunk, measured with length_function
        chunk_overlap=100, # Overlap between consecutive chunks
        length_function=len, # Length is measured in characters
        add_start_index=True # Flag to add start index to each chunk
    )

    # Split documents into smaller chunks using text splitter
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks")

    # Print example of page content and metadata for a chunk, but only when
    # one exists: chunks[0] on an empty list would raise IndexError.
    if chunks:
        example = chunks[0]
        print(f"Example of chunk: \n {example.page_content} \n \n {example.metadata}")

    return chunks