## 논문 삽입과 질의응답 시스템

문서 올리기

In [3]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "../project/example_data/2408.00714v1.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()

# print(len(docs))

RAG로 질의응답

In [2]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

from langchain_openai import ChatOpenAI


# llm = ChatOpenAI(model="gpt-4o")
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")


In [4]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

In [27]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

results = rag_chain.invoke({"input": "Give me some key points of the paper."})

results

In [28]:
results['answer']

'The paper acknowledges the contributions of various individuals involved in data support, annotation engineering, management, compute support, and project-level support. There is no plan to validate or verify further annotations for SA-V, but annotations were reviewed for quality assurance during training and production stages. Sociodemographic characteristics were not used to select annotators for masklet annotations, and diversity among crowdworkers was encouraged during video collection. The final labels in the dataset are a result of data cleaning and post-processing from individual annotator responses.'

## LLM앱으로 만들기

In [62]:
print("Welcome to the paperbot. If you want to quit, please enter 'exit'.")
while True:
    # Input
    user_input = input("User: ")

    # 종료 입력하면 대화 종료
    if user_input.lower() == "exit":
        print("Thank you.")
        break

    # 응답 생성 및 출력
    results = rag_chain.invoke({"input": user_input})
    print(f"User: {user_input}")
    print(f"Assitant: {results['answer']}")

Welcome to the paperbot. If you want to quit, please enter 'exit'.
User: 
Assitant: I'm sorry, but I can't see a question in your message. How can I assist you today?
Thank you.


## 서버 실행

In [67]:
!python ./serve.py

[32mINFO[0m:     Started server process [[36m59339[0m]
[32mINFO[0m:     Waiting for application startup.

 __          ___      .__   __.   _______      _______. _______ .______     ____    ____  _______
|  |        /   \     |  \ |  |  /  _____|    /       ||   ____||   _  \    \   \  /   / |   ____|
|  |       /  ^  \    |   \|  | |  |  __     |   (----`|  |__   |  |_)  |    \   \/   /  |  |__
|  |      /  /_\  \   |  . `  | |  | |_ |     \   \    |   __|  |      /      \      /   |   __|
|  `----./  _____  \  |  |\   | |  |__| | .----)   |   |  |____ |  |\  \----.  \    /    |  |____
|_______/__/     \__\ |__| \__|  \______| |_______/    |_______|| _| `._____|   \__/     |_______|

[1;32;40mLANGSERVE:[0m Playground for chain "/chat/" is live at:
[1;32;40mLANGSERVE:[0m  │
[1;32;40mLANGSERVE:[0m  └──> /chat/playground/
[1;32;40mLANGSERVE:[0m
[1;32;40mLANGSERVE:[0m See all available routes at /docs/

[1;31;40mLANGSERVE:[0m ⚠️ Using pydantic 2.0.2. OpenAPI docs for inv