# Overview
1. Creating a dataset with questions and their expected answers (omitted)
2. Running your RAG application on those questions
3. Using evaluators to measure how well your application performed, looking at factors like:
    - Answer relevance
    - Answer accuracy
    - Retrieval quality

# Setup

### Environment

In [1]:
import os
from getpass import getpass

# Turn on the LangSmith tracing flag for everything run from this notebook.
os.environ["LANGSMITH_TRACING"] = "true"

# Prompt for each API key only when it is not already set, so re-running the
# cell (or running with the keys exported in the shell) neither asks again nor
# overwrites an existing value.
for _key_name in ("LANGSMITH_API_KEY", "OPENAI_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(_key_name)

### Application

In [2]:
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.tools.retriever import create_retriever_tool
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.agents import create_tool_calling_agent, AgentExecutor
from langchain_community.document_loaders import PyPDFLoader
from langchain import hub

In [3]:
# Load the PDF at file_path; the loaded documents are split and indexed below.
file_path = "./temp.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()



### Large language model setup ###
# Chat model passed to create_tool_calling_agent in a later cell.
llm = ChatOpenAI(model="gpt-4o")

### Build the retriever ###
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200),
# then build a FAISS vector store from those chunks using OpenAIEmbeddings.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = FAISS.from_documents(documents=splits, embedding=OpenAIEmbeddings())


# Retriever view over the vector store; wrapped as an agent tool in a later cell.
retriever = vectorstore.as_retriever()


In [4]:
### Build the retriever tool ###
# Wrap the retriever as a named tool; the description is the text that tells
# the agent to use it for any question about the paper.
tool = create_retriever_tool(
    retriever,
    name="paper_retriever",
    description=(
        "Search for information about a paper. "
        "For any questions about the paper, you must use this tool!"
    ),
)
tools = [tool]

# Prompt pulled from the hub - replace it with your own if needed.
prompt = hub.pull("hwchase17/openai-functions-agent")

# Combine the model, tools and prompt into an agent, then wrap the agent in an
# executor that the rest of the notebook invokes.
agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
)


In [12]:
from langsmith import traceable

# Add decorator so this function is traced in LangSmith
@traceable()
def rag_bot(question: str) -> dict:
    """Answer ``question`` with the agent executor.

    Args:
        question: The question text, passed to the agent as its ``"input"``.

    Returns:
        A dict with a single ``"answer"`` key holding the agent's ``"output"``.
    """
    # The LangChain chat model call inside the executor is traced automatically.
    agent_result = agent_executor.invoke({"input": question})
    answer_text = agent_result["output"]
    return {"answer": answer_text}

# Dataset

In [None]:
from langsmith import Client

# LangSmith client; used below to create the dataset and to run the evaluation.
client = Client()

# Define the examples for the dataset

# (question, reference answer) pairs about the SAM 2 paper, in dataset order.
_QA_PAIRS = (
    (
        "What is SAM 2's primary capability?",
        "Promptable visual segmentation in both images and videos",
    ),
    (
        "How much faster is SAM 2 than SAM for image segmentation?",
        "6× faster while being more accurate",
    ),
    (
        "What types of prompts does SAM 2 accept?",
        "Points, boxes, and masks on individual frames",
    ),
    (
        "What is the key architectural innovation for video processing?",
        "Transformer with streaming memory and memory bank",
    ),
    (
        "How many videos are in the SA-V dataset?",
        "50.9K videos with 642.6K masklets",
    ),
    (
        "What is SAM 2's performance on DAVIS 2017 val set?",
        "90.9 𝒥&ℱ with Hiera-B+ encoder",
    ),
    (
        "How does SAM 2 handle multiple objects in video?",
        "Processes each object separately without inter-object communication",
    ),
    (
        "What's the main limitation mentioned?",
        "Difficulty with shot changes and crowded scenes",
    ),
    (
        "What future improvement is suggested?",
        "Incorporating explicit motion modeling",
    ),
    (
        "What's the CO2 emission equivalent of training?",
        "~3.89 metric tons (equivalent to 10k miles driven)",
    ),
)

# Each example maps an input question to its expected (reference) answer.
examples = [
    {"inputs": {"question": question}, "outputs": {"answer": answer}}
    for question, answer in _QA_PAIRS
]

# Create the dataset and examples in LangSmith.
# Creating a dataset under a name that already exists fails, so when this cell
# is re-run the existing dataset is reused and the examples are not uploaded a
# second time (which would duplicate them).
dataset_name = "SAM2 Q&A"
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(
        dataset_id=dataset.id,
        examples=examples,
    )

# Evaluator
### Correctness

In [10]:
from typing_extensions import Annotated, TypedDict

# Grade output schema: the structure the grader model's reply is parsed into
# (passed to with_structured_output below; correctness() reads its "correct" field).
class CorrectnessGrade(TypedDict):
    # Note that the order in which the fields are defined is the order in which the model will generate them.
    # It is useful to put explanations before responses because it forces the model to think through
    # its final response before generating it:
    # NOTE(review): the last Annotated argument looks like the per-field description
    # given to the model - confirm against the structured-output docs.
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    correct: Annotated[bool, ..., "True if the answer is correct, False otherwise."]

# Grade prompt: sent as the system message to the grader model by correctness().
# The text is runtime input to the model, so it is kept exactly as written.
correctness_instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. 
(2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader LLM: a temperature-0 chat model whose output is constrained to the
# CorrectnessGrade schema.
_grader_chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)
grader_llm = _grader_chat_model.with_structured_output(
    CorrectnessGrade,
    method="json_schema",
    strict=True,
)

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Grade a RAG answer against the reference answer using the grader LLM.

    Args:
        inputs: Example inputs; must contain ``"question"``.
        outputs: Application outputs; must contain ``"answer"``.
        reference_outputs: Reference outputs; must contain ``"answer"``.

    Returns:
        The ``"correct"`` field of the grader's structured response.
    """
    question = inputs["question"]
    ground_truth = reference_outputs["answer"]
    student_answer = outputs["answer"]

    # User message for the grader; the leading indentation on each line is
    # part of the text sent to the model.
    user_message = (
        f"    QUESTION: {question}\n"
        f"    GROUND TRUTH ANSWER: {ground_truth}\n"
        f"    STUDENT ANSWER: {student_answer}"
    )

    # Ask the grader for a structured verdict and return only the boolean.
    messages = [
        {"role": "system", "content": correctness_instructions},
        {"role": "user", "content": user_message},
    ]
    verdict = grader_llm.invoke(messages)
    return verdict["correct"]

In [13]:
def target(inputs: dict) -> dict:
    """Evaluation target: run the RAG bot on the example's ``"question"``."""
    question = inputs["question"]
    return rag_bot(question)


# Run the evaluation: `target` is the application under test, `data` names the
# dataset to evaluate on, and `correctness` is the only evaluator applied.
# The prefix and metadata label the resulting experiment.
experiment_results = client.evaluate(
    target,
    data=dataset_name,
    evaluators=[correctness],
    experiment_prefix="rag-correctedness",
    metadata={"version": "LCEL context, gpt-4o"},
)


View the evaluation results for experiment: 'rag-correctedness-7c3179a6' at:
https://smith.langchain.com/o/61d90813-dbd8-5ea4-8a7a-14aa80a5a455/datasets/ab8ab590-4128-4f5d-8a8d-31b1097c313e/compare?selectedSessions=f904ed95-5a0e-4bc9-ac3e-dd22fa8e93d0




0it [00:00, ?it/s]