# 랭체인을 활용한 PDF 삽입과 질의응답 시스템

## 문서 올리기

In [9]:
!pip install langchain
!pip install -qU pypdf langchain_community



In [1]:
from langchain_community.document_loaders import PyPDFLoader

# Path to the example PDF, relative to the notebook's working directory.
file_path = "../project/example_data/2408.00714v1.pdf"

# Read the PDF into a list of Document objects.
loader = PyPDFLoader(file_path)
docs = loader.load()

# Report how many Document objects were produced.
num_docs = len(docs)
print(num_docs)

41


In [2]:
# Sanity-check the load: show the first 100 characters of the first document
# and the metadata attached to it.
first_doc = docs[0]
print(first_doc.page_content[:100])
print(first_doc.metadata)

SAM 2: Segment Anything in Images and Videos
Nikhila Ravi∗,†,Valentin Gabeur∗,Yuan-Ting Hu∗,Ronghang
{'source': '../project/example_data/2408.00714v1.pdf', 'page': 0}


## RAG로 질의응답

In [12]:
!pip install -qU langchain-openai

In [3]:
import getpass
import os

from langchain_openai import ChatOpenAI

# Only prompt when the key is not already present in the environment, so that
# re-running this cell (or running the notebook with the key exported
# beforehand) neither blocks on input nor overwrites an existing key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Chat model handed to the question-answering chain later in the notebook.
llm = ChatOpenAI(model="gpt-4o")

In [15]:
!pip install langchain_chroma langchain_openai

Collecting langchain_chroma
  Using cached langchain_chroma-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Using cached langchain_chroma-0.1.2-py3-none-any.whl (9.3 kB)
Installing collected packages: langchain_chroma
Successfully installed langchain_chroma-0.1.2


In [4]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks before indexing them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI embeddings and index the result in Chroma.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

# Retriever view over the vector store, used by the RAG chain.
retriever = vectorstore.as_retriever()

In [5]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the QA chain. The "{context}" placeholder is left
# unformatted here; it is a template variable filled in by the chain.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions (with the "{context}"
# slot) followed by the user's question in the "{input}" slot.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)


# Chain that passes documents plus the question to the LLM via `prompt`.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: the retriever supplies documents to the QA chain.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

# Ask a single question; the result is displayed as the cell's output.
query = {"input": "Where is the code of the paper?"}
results = rag_chain.invoke(query)

results

{'input': 'Where is the code of the paper?',
 'context': [Document(metadata={'page': 14, 'source': '../project/example_data/2408.00714v1.pdf'}, page_content='Martinez Doehner and Baishan Guo for data support, and to our annotation engineering and management\npartners: Robert Kuo, Rishi Godugu, Bob Kamma, Ida Cheng, Claudette Ward, Kai Brown, Jake Kinney,\nJenny Truong, and Karen Bergan. Thanks to Vispi Cassod, Parth Malani, Shiva Koduvayur, Alexander\nMiller, and Caleb Ho for their support with compute and infra. Finally, we thank Azita Shokrpour, Mallika\nMalhotra, Rodrick Shepard, Jonathan Torres, Luc Dahlin, David Soofian, Alex Bosenberg, and Amanda\nKallet for project-level support.\n15'),
  Document(metadata={'page': 33, 'source': '../project/example_data/2408.00714v1.pdf'}, page_content='(DOI)? The dataset is available at https://ai.meta.com/datasets/segment-anything-video/.\n3.When will the dataset be distributed? The dataset will be distributed in July 2024.\n4.Will the dataset

In [6]:
# Show the text of the first document in the returned context list.
top_context_doc = results["context"][0]
print(top_context_doc.page_content)

Martinez Doehner and Baishan Guo for data support, and to our annotation engineering and management
partners: Robert Kuo, Rishi Godugu, Bob Kamma, Ida Cheng, Claudette Ward, Kai Brown, Jake Kinney,
Jenny Truong, and Karen Bergan. Thanks to Vispi Cassod, Parth Malani, Shiva Koduvayur, Alexander
Miller, and Caleb Ho for their support with compute and infra. Finally, we thank Azita Shokrpour, Mallika
Malhotra, Rodrick Shepard, Jonathan Torres, Luc Dahlin, David Soofian, Alex Bosenberg, and Amanda
Kallet for project-level support.
15


In [7]:
# Show the metadata (page and source) of the first document in the context list.
top_context_metadata = results["context"][0].metadata
print(top_context_metadata)

{'page': 14, 'source': '../project/example_data/2408.00714v1.pdf'}
