In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
import openai
import os
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
import pprint

# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Fail fast with a clear message instead of silently assigning None here and
# surfacing an opaque authentication/validation error in a later cell.
_api_key = os.getenv("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )
openai.api_key = _api_key

In [None]:
# Load the PDF document.
# `load()` returns the documents as produced by the loader (one per PDF page
# for PyPDFLoader). The earlier `load_and_split()` call additionally ran a
# default text splitter, which meant the text was split twice once the
# RecursiveCharacterTextSplitter step ran on `pages`.
loader = PyPDFLoader("example_data/sample_pdf.pdf")
pages = loader.load()

In [None]:
# Break the loaded documents into overlapping chunks for embedding.
# chunk_size / chunk_overlap are expressed in the splitter's length units
# (characters by default): chunks target 2000 units with 100 shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=2000)
data = text_splitter.split_documents(pages)

In [None]:
# Display the second chunk (index 1) as a quick sanity check of the split output.
# NOTE(review): assumes the split produced at least two chunks; a very short PDF would raise IndexError.
data[1]

In [None]:
# Initialize the OpenAI embeddings client that is later handed to the vector store
# to embed the document chunks. No arguments are passed, so the model and
# credentials come from the library defaults / environment.
embeddings = OpenAIEmbeddings()

In [None]:
# Create a Chroma vector store from the chunks and persist it to disk.
# The `embeddings` client from the previous cell is reused; constructing a
# second, identical OpenAIEmbeddings instance here was redundant.
store = Chroma.from_documents(
    data,
    embeddings,
    # Explicit ids of the form "<source path>-<chunk index>".
    # NOTE(review): presumably chosen so re-running the cell targets the same
    # records instead of adding duplicates — confirm against Chroma's id handling.
    ids=[f"{item.metadata['source']}-{index}" for index, item in enumerate(data)],
    collection_name="pdf-sample",
    persist_directory='db',
)
# NOTE(review): newer Chroma releases are reported to persist automatically and
# deprecate this call — confirm against the installed version before removing it.
store.persist()

In [None]:
# Prompt used by the question-answering chain: the model is told to answer
# strictly from the supplied context and to say so when that context is insufficient.
template = """
You are an AI assistant that answers questions about loaded PDF files, using only the provided context.
If you don't know the answer based on the given context, simply state that you don't have enough information to answer.

Context:
{context}

Question: {question}

Answer:"""

# The two placeholders above are the template's only inputs.
PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

# Chat model that writes the answers; temperature is pinned to 0 and the
# model is fixed to a dated snapshot name.
llm = ChatOpenAI(model="gpt-4o-2024-05-13", temperature=0)

# Wire the vector store and the chat model into a retrieval QA chain.
# The store is exposed through its default retriever settings.
retriever = store.as_retriever()

# chain_type="stuff" with the custom PROMPT; the source documents are returned
# alongside the answer so results can be traced back to the PDF.
qa_with_source = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

In [None]:
# Ask a question and print the chain's full result (answer plus source documents).
question = "What is the main topic of the document?"
# Use `invoke` with the chain's explicit "query" input key; calling the chain
# object directly (`qa_with_source(question)`) is deprecated in LangChain
# releases that ship `langchain_community`, which this notebook imports.
result = qa_with_source.invoke({"query": question})
pprint.pprint(result)