# Evaluating a RAG pipeline with LangSmith metrics

This tutorial shows how to evaluate a RAG pipeline using custom metrics (LLM-as-judge evaluators) in LangSmith.

In [1]:
import getpass
import os

def _set_env(var: str):
    """Make sure the environment variable ``var`` holds a non-empty value.

    A value that is already present (and non-empty) is left untouched.
    Otherwise the user is prompted through ``getpass`` with the prompt
    ``"<var>: "`` and the reply is stored in ``os.environ``.

    Args:
        var: Name of the environment variable to populate.
    """
    already_set = bool(os.environ.get(var))
    if already_set:
        return
    os.environ[var] = getpass.getpass(f"{var}: ")

# Credentials are prompted for only when they are not already in the environment.
_set_env("OPENAI_API_KEY")

# Fixed LangSmith settings (presumably read by the LangSmith client/tracer — confirm).
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        # Update appropriately for self-hosted installations or the EU region
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)

_set_env("LANGCHAIN_API_KEY")

# Clone a test dataset from the LangSmith public hub

In [3]:
### Dataset name
from langsmith import Client

# Share link of the public dataset to copy.
PUBLIC_DATASET_URL = (
    "https://smith.langchain.com/public/730d833b-74da-43e2-a614-4e2ca2502606/d"
)

# Client() is built with no arguments here; presumably it picks up the
# LANGCHAIN_* environment variables — confirm.
client = Client()
dataset = client.clone_public_dataset(PUBLIC_DATASET_URL)

# Name later passed as `data=` to the evaluation run.
# NOTE(review): assumed to match the name of the cloned dataset — confirm.
dataset_name = "LCEL-QA"

# Cohere RAG pipeline

In [None]:
import requests
import cohere
import os
# Make sure the Cohere key is available. Writing os.getenv(...) straight back
# into os.environ raises TypeError when the variable is unset, so prompt for
# the key instead (same approach as the other API keys in this notebook).
if not os.environ.get("COHERE_API_KEY"):
    import getpass

    os.environ["COHERE_API_KEY"] = getpass.getpass("COHERE_API_KEY: ")

# Define the API endpoint for streaming
url = "http://localhost:8000/v1/chat"
# NOTE: if BEARER_SECRET_KEY is unset, the header below literally reads "Bearer None".
bearer = os.getenv('BEARER_SECRET_KEY')

# Set headers
headers = {
    "User-Id": "me",
    "Content-Type": "application/json",
    "Authorization": f"Bearer {bearer}",
    "Cohere-Stream": "true",  # Enable streaming for chatbot responses
}

# Set the message to send
message = "What is Tech Innovators Inc.'s approach to workplace harassment?"

# Create the payload as a JSON dictionary
data = {"message": message}

# Send the POST request.
# - stream=True lets iter_lines() yield lines as they arrive instead of after
#   the whole body has been downloaded.
# - timeout=(connect, read) keeps the cell from hanging forever if the local
#   backend is not running.
# - the `with` block releases the connection once the stream has been consumed.
with requests.post(
    url, headers=headers, json=data, stream=True, timeout=(5, 60)
) as response:
    # Check for successful response
    if response.status_code == 200:
        # Handle streaming response
        for line in response.iter_lines():
            if not line:
                # Skip empty keep-alive lines
                continue
            # iter_lines() yields bytes; decode before printing
            decoded_line = line.decode("utf-8")
            # Process the received data from the stream (print it here)
            print(decoded_line)
    else:
        print(f"Error: {response.status_code}")

# Create a simple RAG pipeline (for testing purposes only!)

This RAG pipeline was created only to test the LangSmith evaluation tools. Do not use it if you have access to the Cohere toolkit backend; use the Cohere RAG pipeline above instead.

## Create a sample vector store and retriever

In [4]:
### INDEX

from bs4 import BeautifulSoup as Soup
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def _html_to_text(html):
    """Strip markup from a fetched page and return its plain text."""
    return Soup(html, "html.parser").text


# Load docs: crawl the LCEL documentation starting from this page.
# NOTE(review): the outputs recorded further down warn that the index holds a
# single element, so the crawl may be returning only one page — confirm.
url = "https://python.langchain.com/v0.1/docs/expression_language/"
loader = RecursiveUrlLoader(
    url=url,
    max_depth=20,
    extractor=_html_to_text,
)
docs = loader.load()

# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4500,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed and store in Chroma
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

# Retriever used by the RAG bot below
retriever = vectorstore.as_retriever()

## RAG pipeline

In [5]:
### RAG bot

import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai

class RagBot:
    """Small retrieve-then-generate bot used to exercise the evaluators.

    Each public step is decorated with ``@traceable()`` and the OpenAI client
    is wrapped with ``wrap_openai``.
    """

    def __init__(self, retriever, model: str = "gpt-4-0125-preview"):
        """Store the retriever, the model name and a wrapped OpenAI client.

        Args:
            retriever: Object exposing ``invoke(question)`` that returns documents.
            model: Chat model name passed to the completions API.
        """
        self._model = model
        self._retriever = retriever
        # Wrapping the client instruments the LLM
        self._client = wrap_openai(openai.Client())

    @traceable()
    def retrieve_docs(self, question):
        """Return whatever the retriever yields for ``question``."""
        return self._retriever.invoke(question)

    @traceable()
    def invoke_llm(self, question, docs):
        """Ask the chat model to answer ``question`` using ``docs``.

        Returns:
            A dict with ``"answer"`` (content of the first completion choice)
            and ``"contexts"`` (``str()`` of every document).
        """
        # `docs` is interpolated as-is, i.e. via its default string form.
        system_prompt = (
            "You are a helpful AI code assistant with expertise in LCEL."
            " Use the following docs to produce a concise code solution to the user question.\n\n"
            f"## Docs\n\n{docs}"
        )
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ]
        completion = self._client.chat.completions.create(
            model=self._model,
            messages=chat_messages,
        )
        answer_text = completion.choices[0].message.content

        # Evaluators will expect "answer" and "contexts"
        return {
            "answer": answer_text,
            "contexts": list(map(str, docs)),
        }

    @traceable()
    def get_answer(self, question: str):
        """Run retrieval followed by generation for ``question``."""
        retrieved = self.retrieve_docs(question)
        return self.invoke_llm(question, retrieved)

rag_bot = RagBot(retriever)

In [6]:
# Smoke test: ask the RAG bot one question and show the start of its answer.
sample_question = "How to build a RAG chain in LCEL?"
response = rag_bot.get_answer(sample_question)
response["answer"][:150]

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


'To build a RAG (Retrieve and Generate) chain in LCEL (Language Chain Expression Language), you would typically follow these steps, assuming I provide '

# Create functions that use the RAG pipeline to generate responses

In [7]:
def predict_rag_answer(example: dict):
    """Use this for answer evaluation.

    Args:
        example: Mapping that contains the key ``"input_question"``.

    Returns:
        ``{"answer": ...}`` holding only the bot's answer text.
    """
    question = example["input_question"]
    bot_output = rag_bot.get_answer(question)
    return {"answer": bot_output["answer"]}

def predict_rag_answer_with_context(example: dict):
    """Use this for evaluation of retrieved documents and hallucinations.

    Args:
        example: Mapping that contains the key ``"input_question"``.

    Returns:
        A dict with the bot's ``"answer"`` and the ``"contexts"`` it was given.
    """
    question = example["input_question"]
    bot_output = rag_bot.get_answer(question)
    return {field: bot_output[field] for field in ("answer", "contexts")}

# Create a custom RAG evaluator

Ref: https://docs.smith.langchain.com/tutorials/Developers/rag

## Evaluator
There are at least 4 types of RAG eval that users are typically interested in.

**Response vs reference answer** (RAGAS metric: answer_correctness)
* Goal: Measure "how similar/correct is the RAG chain answer, relative to a ground-truth answer"
* Mode: Uses ground truth (reference) answer supplied through a dataset
* Judge: Use LLM-as-judge to assess answer correctness.

**Response vs input** (RAGAS metric: answer_relevancy)
* Goal: Measure "how well does the generated response address the initial user input"
* Mode: Reference-free, because it will compare the answer to the input question
* Judge: Use LLM-as-judge to assess answer relevance, helpfulness, etc.

**Response vs retrieved docs** (RAGAS metric: faithfulness)
* Goal: Measure "to what extent does the generated response agree with the retrieved context"
* Mode: Reference-free, because it will compare the answer to the retrieved context
* Judge: Use LLM-as-judge to assess faithfulness, hallucinations, etc.

**Retrieved docs vs input** (RAGAS metric: context_utilization)
* Goal: Measure "how good are my retrieved results for this query"
* Mode: Reference-free, because it will compare the question to the retrieved context
* Judge: Use LLM-as-judge to assess relevance

## Response vs reference answer: Answer Accuracy

In [8]:
from langchain import hub
from langchain_openai import ChatOpenAI

# Grading prompt pulled from the hub (the extra `prompt` alias is kept as-is)
grade_prompt_answer_accuracy = prompt = hub.pull("langchain-ai/rag-answer-vs-reference")

def answer_evaluator(run, example) -> dict:
    """
    Grade the RAG answer against the dataset's reference answer.

    Args:
        run: Evaluation run; ``run.outputs["answer"]`` is the RAG chain answer.
        example: Dataset example; ``inputs["input_question"]`` is the question
            and ``outputs["output_answer"]`` the ground-truth answer.

    Returns:
        ``{"key": "answer_v_reference_score", "score": ...}`` where the score
        is the ``"Score"`` entry of the grader's result.
    """

    # Question, ground-truth answer and RAG chain answer, keyed as the prompt expects
    grader_inputs = {
        "question": example.inputs["input_question"],
        "correct_answer": example.outputs["output_answer"],
        "student_answer": run.outputs["answer"],
    }

    # LLM judge
    judge = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Prompt piped into the judge, then run
    grading = (grade_prompt_answer_accuracy | judge).invoke(grader_inputs)

    return {"key": "answer_v_reference_score", "score": grading["Score"]}

  prompt = loads(json.dumps(prompt_object.manifest))


## Response vs Input: Answer Helpfulness

In [None]:
# Grading prompt pulled from the hub (the extra `prompt` alias is kept as-is)
grade_prompt_answer_helpfulness = prompt = hub.pull("langchain-ai/rag-answer-helpfulness")

def answer_helpfulness_evaluator(run, example) -> dict:
    """
    Grade how helpful the RAG answer is for the input question.

    This check is reference-free: only the question and the generated answer
    are passed to the grader.

    Args:
        run: Evaluation run; ``run.outputs["answer"]`` is the RAG chain answer.
        example: Dataset example; ``inputs["input_question"]`` is the question.

    Returns:
        ``{"key": "answer_helpfulness_score", "score": ...}`` where the score
        is the ``"Score"`` entry of the grader's result.
    """

    # Question and RAG chain answer, keyed as the prompt expects
    grader_inputs = {
        "question": example.inputs["input_question"],
        "student_answer": run.outputs["answer"],
    }

    # LLM judge
    judge = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Prompt piped into the judge, then run
    grading = (grade_prompt_answer_helpfulness | judge).invoke(grader_inputs)

    return {"key": "answer_helpfulness_score", "score": grading["Score"]}

## Response vs retrieved docs: Answer Hallucination

In [None]:
# Grading prompt pulled from the hub (the extra `prompt` alias is kept as-is)
grade_prompt_hallucinations = prompt = hub.pull("langchain-ai/rag-answer-hallucination")

def answer_hallucination_evaluator(run, example) -> dict:
    """
    A simple evaluator for generation hallucination.

    Compares the generated answer with the retrieved contexts only. The input
    question is not part of the grader inputs, so it is no longer read from
    ``example`` (the previous unused lookup needlessly required that key).

    Args:
        run: Evaluation run; ``run.outputs["contexts"]`` holds the retrieved
            documents and ``run.outputs["answer"]`` the RAG chain answer.
        example: Dataset example; unused, kept so the evaluator signature
            stays the same.

    Returns:
        ``{"key": "answer_hallucination", "score": ...}`` where the score is
        the ``"Score"`` entry of the grader's result.
    """

    # Retrieved documents
    contexts = run.outputs["contexts"]

    # RAG answer
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Structured prompt
    answer_grader = grade_prompt_hallucinations | llm

    # Get score
    score = answer_grader.invoke({"documents": contexts,
                                  "student_answer": prediction})
    score = score["Score"]

    return {"key": "answer_hallucination", "score": score}

## Retrieved docs vs input: Retrieval Relevance

In [None]:
# Grading prompt pulled from the hub
grade_prompt_doc_relevance = hub.pull("langchain-ai/rag-document-relevance")

def docs_relevance_evaluator(run, example) -> dict:
    """
    Grade how relevant the retrieved documents are to the input question.

    Args:
        run: Evaluation run; ``run.outputs["contexts"]`` holds the retrieved
            documents.
        example: Dataset example; ``inputs["input_question"]`` is the question.

    Returns:
        ``{"key": "document_relevance", "score": ...}`` where the score is the
        ``"Score"`` entry of the grader's result.
    """

    # Question and retrieved documents, keyed as the prompt expects
    grader_inputs = {
        "question": example.inputs["input_question"],
        "documents": run.outputs["contexts"],
    }

    # LLM judge
    judge = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Prompt piped into the judge, then run
    grading = (grade_prompt_doc_relevance | judge).invoke(grader_inputs)

    return {"key": "document_relevance", "score": grading["Score"]}

# Evaluate RAG pipeline on LangSmith

In [9]:
from langsmith.evaluation import evaluate

# Free-form metadata attached to the experiment
experiment_metadata = {"version": "LCEL context, gpt-4-0125-preview"}

# Run the answer-vs-reference evaluator over the dataset named above.
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_evaluator],
    experiment_prefix="rag-answer-v-reference",
    metadata=experiment_metadata,
)

View the evaluation results for experiment: 'rag-answer-v-reference-7e86de62' at:
https://smith.langchain.com/o/08bc9556-81b3-56d7-98aa-4f87d6cdfca5/datasets/f6d5b279-948b-4574-b755-48a103001f08/compare?selectedSessions=201c2c91-1fd9-4b6c-8f48-1c1fcf928be8




0it [00:00, ?it/s]

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
