In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
import openai
import os
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
import pprint

# Load variables from a local .env file (if any) into the process environment,
# then hand the OpenAI key to the `openai` client module.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# os.getenv returns None when the variable is absent; stop here with a clear
# message rather than letting a later cell fail for a less obvious reason.
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a .env file."
    )

In [2]:
import warnings

# Hide DeprecationWarning messages (and subclasses of it) for the rest of the
# session so they do not clutter the cell outputs below.
warnings.filterwarnings(
    action="ignore",
    category=DeprecationWarning,
)

In [3]:
# Load the PDF document.
# `load()` is used instead of `load_and_split()`: the latter runs its own
# default text splitter, and the chunking is already done explicitly by the
# RecursiveCharacterTextSplitter cell that follows, so the text would
# otherwise be split twice with two different configurations.
loader = PyPDFLoader("example_data/sample_pdf.pdf")
pages = loader.load()

In [4]:
# Chunk the loaded pages: at most 2000 characters per chunk, with 100
# characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=2000)

# `data` holds the resulting chunk Documents used by the cells below.
data = text_splitter.split_documents(pages)

In [5]:
# Display the second chunk as a quick sanity check of the split.
data[1]

Document(page_content='2 UNDP Global Project Contribution \nPartners  \n \nIn 2022, the Global Project on Managing Development Co-operation  Effectively received support \nfrom Canada, the European Commission, Germany, the Republic of Korea, Sweden and Switzerland \nto implement the priorities outlined in the Global Project document and in line with the 2020 -2022 \nWork Programme of the Global Partn ership for Effective Development.  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nCopyright © 2023 United Nations Development Programme  \nAll rights reserved.  \nUNDP is the leading United Nations organization fighting to end the injustice of poverty, inequality, and climate \nchange. Working with our broad network of experts and partners in 170 countries, we help nations to build \nintegrated, lasting solutions for pe ople and planet. Learn more at undp.org or follow at @UNDP.  \nThis publication or parts of it may not be reproduced; stored by means of an

In [6]:
# Initialize OpenAI embeddings
# No model or key is passed here, so the client is built with library defaults.
embeddings = OpenAIEmbeddings()

In [7]:
# Create a Chroma vector store from the chunks and persist it under ./db.
# The `embeddings` client from the previous cell is reused; it does not need
# to be instantiated a second time here.
store = Chroma.from_documents(
    data,
    embeddings,
    # One id per chunk, "<source path>-<chunk index>".
    # NOTE(review): presumably chosen so re-running this cell overwrites the
    # same entries instead of adding duplicates -- confirm against Chroma.
    ids=[f"{item.metadata['source']}-{index}" for index, item in enumerate(data)],
    collection_name="pdf-sample",
    persist_directory='db',
)
# NOTE(review): newer Chroma releases reportedly persist automatically and
# deprecate this call -- confirm for the installed version before removing it.
store.persist()

In [10]:
# Prompt for the question-answering chain: the model is told to answer only
# from the supplied context and to say so when the context is insufficient.
template = """
You are an AI assistant that answers questions about loaded PDF files, using only the provided context.
If you don't know the answer based on the given context, simply state that you don't have enough information to answer.

Context:
{context}

Question: {question}

Answer:"""

# The two placeholders above are the only variables the prompt expects.
PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

# Chat model used to generate the answer (temperature fixed at 0).
llm = ChatOpenAI(model="gpt-4o-2024-05-13", temperature=0)

# Retrieval QA chain backed by the Chroma store. With
# return_source_documents=True the result also carries the retrieved
# documents under the 'source_documents' key.
qa_with_source = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=store.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

{'query': 'What is the main topic of the document?',
 'result': 'The main topic of the document is the Global Partnership for '
           'Effective Development Cooperation, focusing on key results '
           'achieved in 2022, challenges, and the way forward, as well as '
           'financial execution and donor information for that year.',
 'source_documents': [Document(page_content='1', metadata={'page': 0, 'source': 'example_data/sample_pdf.pdf'}),
                      Document(page_content='4 Table of Content  \n \n \n \nAcronyms   5 \nAbout the G lobal Partnership  \nfor Effective Development Cooperation   6 \nKey Results Achieved in 2022  8 \n▪ Output 1  8 \n▪ Output 2  14 \n▪ Output 3  21 \n▪ Output 4 and 5  27 \nChallenges and Way forward  34 \nAnnex I: Financial Project  Execution in 2022 (Provisional)  35 \nAnnex II: List of Donors in 2022  36 \nAnnex III: Result Framework and Financing report  37', metadata={'page': 3, 'source': 'example_data/sample_pdf.pdf'}),
       

In [9]:
# Ask a question and print the answer.
question = "What is the main topic of the document?"

# Use `.invoke()` with the chain's "query" input key; calling the chain object
# directly is deprecated in LangChain. The result is a dict with 'query',
# 'result' and 'source_documents' keys.
result = qa_with_source.invoke({"query": question})
pprint.pprint(result)

{'query': 'What is the main topic of the document?',
 'result': 'The main topic of the document is the Global Partnership for '
           'Effective Development Cooperation, focusing on key results '
           'achieved in 2022, challenges, and the way forward, as well as '
           'financial execution and donor information for that year.',
 'source_documents': [Document(page_content='1', metadata={'page': 0, 'source': 'example_data/sample_pdf.pdf'}),
                      Document(page_content='4 Table of Content  \n \n \n \nAcronyms   5 \nAbout the G lobal Partnership  \nfor Effective Development Cooperation   6 \nKey Results Achieved in 2022  8 \n▪ Output 1  8 \n▪ Output 2  14 \n▪ Output 3  21 \n▪ Output 4 and 5  27 \nChallenges and Way forward  34 \nAnnex I: Financial Project  Execution in 2022 (Provisional)  35 \nAnnex II: List of Donors in 2022  36 \nAnnex III: Result Framework and Financing report  37', metadata={'page': 3, 'source': 'example_data/sample_pdf.pdf'}),
       