## Agent Evaluation

Evaluating an agent requires a thorough check at the agent's entire lifecycle, from development to deployment. Some important questions need to be addressed: 

* **Final output:** Is the agent responding in a factual and genuine manner? 
* **Reasoning Process:** Is the agent using correct tool and following proper logic with efficiency to reach the solution? 
* **Structural Integrity:** Are the generated responses precise, structured (eg. JSON), and reliable with regards to the tools and APIs? 
* **Conversational Skill:** Is the generated response realistic, multi-turn dialog without losing the context or getting confused? 
* **Live Feedback:** Is the quality of agent holding up over time with real, unpredictable user traffic, and can we monitor it to catch errors? 

In [7]:
# Required libraries
# pip install langchain langsmith

### Initialise the APIs

In [1]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
import langsmith
import os

load_dotenv()

True

In [2]:
# Initialise the LangSmith client
client = langsmith.Client()

### 1. Exact-Match based Evaluation

It is one of the fundamental evaluation methods, where we check if the model's output is identical to a predefined correct answer. 

In [None]:
# We need to create a dataset which will serve as a container for our Q-A examples. 
ds = client.create_dataset(
    dataset_name = "exact_match_dataset", 
    description = "A dataset for simple exact match evaluation"
)

# After creating the dataset, we need to put the examples which is the input/output dictionary 
# The inputs and outputs are provided in separate lists, maintaining the same order. 
client.create_examples(
    # List of inputs where each input is a dictionary
    inputs = [
        {
            "prompt_template": "When was artificial intelligence word coined?"
        },
        {
            "prompt_template": "When did ChatGPT launch?"
        }
    ],
    outputs = [
        {"output": "1956"},
        {"output": "2022"}
    ],
    dataset_id = ds.id
)

In [4]:
# Define the model that you need to test
model = "gpt-4o"

def predict_results(input_: dict) -> dict:
    # The input dictionary will have the key prompt_template which matches the key in dataset's input
    prompt = input_["prompt_template"]
    # Initialise and call the OpenAI model
    response = ChatOpenAI(model = model, temperature = 0).invoke(prompt)
    # The output key "output" needs to match with the dataset's output
    return {"output": response.content}


#### Building our custom evaluator

In [5]:
from langsmith.evaluation import EvaluationResult, run_evaluator

@run_evaluator
def compare_label(run, example) -> EvaluationResult: 
    """
    A custom evaluator that checks for an exact match
    
    Args: 
        run: The LangSmith run object, which contains the model's outputs.
        example: The LangSmith example object, which contains the reference data. 
    
    Returns: 
        An EvaluationResult object with a key and a score. 
    """
    
    # Get the model's prediction from run's output dictionary
    prediction = run.outputs.get("output") or ""
    # Get the reference answer from the example's output dictionary
    target = example.outputs.get("output") or ""
    # Perform the comparison
    match = prediction == target
    # Return the result 
    return EvaluationResult(key = "matches_label", score = int(match))

In [6]:
from langchain.smith import RunEvalConfig

eval_config = RunEvalConfig(
    evaluators = ["exact_match"],
    custom_evaluators = [compare_label]
)

client.run_on_dataset(
    dataset_name = "exact_match_dataset",
    llm_or_chain_factory = predict_results, 
    evaluation = eval_config, 
    verbose = True
)

View the evaluation results for project 'puzzled-train-95' at:
https://smith.langchain.com/datasets/592d850b-827d-444f-8586-4fb434f6dcc8/compare?selectedSessions=7eebe4d3-c3d5-4c16-a0d2-32cff635b9f1

View all tests for Dataset exact_match_dataset at:
https://smith.langchain.com/datasets/592d850b-827d-444f-8586-4fb434f6dcc8
[------------------------------------------------->] 2/2

Unnamed: 0,feedback.exact_match,feedback.matches_label,error,execution_time,run_id
count,2.0,2.0,0.0,2.0,2
unique,,,0.0,,2
top,,,,,c595abad-b204-4248-aa22-c8eb706378f8
freq,,,,,1
mean,0.0,0.0,,2.040328,
std,0.0,0.0,,0.366433,
min,0.0,0.0,,1.781221,
25%,0.0,0.0,,1.910775,
50%,0.0,0.0,,2.040328,
75%,0.0,0.0,,2.169881,


{'project_name': 'puzzled-train-95',
 'results': {'c6ff06c9-fe9e-4ef6-9678-602f3361ace6': {'input': {'prompt_template': 'When did ChatGPT launch?'},
   'feedback': [EvaluationResult(key='exact_match', score=0, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('6682dedd-aee8-4e8c-92dc-582d18ecbab0'))}, feedback_config=None, source_run_id=None, target_run_id=None, extra=None),
    EvaluationResult(key='matches_label', score=0, value=None, comment=None, correction=None, evaluator_info={}, feedback_config=None, source_run_id=UUID('651cda88-f5c6-4fc4-a46d-7dc9ed67c051'), target_run_id=None, extra=None)],
   'execution_time': 1.781221,
   'run_id': 'c595abad-b204-4248-aa22-c8eb706378f8',
   'output': {'output': 'ChatGPT was launched by OpenAI in November 2022.'},
   'reference': {'output': '2022'}},
  'd9905c96-54be-4f88-943f-16f03fe6ea75': {'input': {'prompt_template': 'When was artificial intelligence word coined?'},
   'feedback': [EvaluationResult(ke

### Unstructured QA Evaluation

In [33]:
# Create the dataset in LangSmith
ds1 = client.create_dataset(
    dataset_name = "unstruct_qa_evals",
    description = "Q&A dataset about LangSmith documentation."
)

# These are our question-and-answer examples. The answers serve as 'ground truth'.
qa_examples = [
    (
        "What is LangChain?",
        "LangChain is an open-source framework for building applications using large language models. It is also the name of the company building LangSmith.",
    ),
    (
        "What's a langsmith dataset?",
        "A LangSmith dataset is a collection of examples. Each example contains inputs and optional expected outputs or references for that data point.",
    ),
]

# Add the examples to our dataset
# The input key is 'question' and the output key is 'answer'.
# These keys must match what our RAG chain expects and produces.
for question, answer in qa_examples:
    client.create_example(
        inputs = {"question": question},
        outputs = {"answer": answer},
        dataset_id = ds1.id,
    )

In [34]:
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import TextSplitter, TokenTextSplitter
from langchain_openai import OpenAIEmbeddings

# Load the document from web
api_loader = RecursiveUrlLoader("https://docs.langchain.com/langsmith")
raw_documents = api_loader.load()

# Transform HTML raw data to clean text and split into chunks
doc_transformer = Html2TextTransformer()
transformed = doc_transformer.transform_documents(raw_documents)
text_splitter = TokenTextSplitter(model_name = "gpt-4o", chunk_size = 2000, chunk_overlap = 200)
documents = text_splitter.split_documents(transformed)

# Create a vector store retriever
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents, embeddings)
retriever = vectorstore.as_retriever(search_kwargs = {"k": 4})

In [42]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from datetime import datetime

# Define the prompt template that will be sent to the LLM.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful documentation Q&A assistant, trained to answer"
            " questions from LangSmith's documentation."
            " LangChain is a framework for building applications using large language models."
            "\nThe current time is {time}.\n\nRelevant documents will be retrieved in the following messages.",
        ),
        ("system", "{{context}}"), 
        ("human", "{question}"),  
    ]
).partial(time=str(datetime.now()))

# Initialize the LLM. We use a model with a large context window and low temperature for more factual responses.
model = ChatOpenAI(model="gpt-4o", temperature=0)

# Define the generation chain. It pipes the prompt to the model and then to an output parser.
rag_chain = prompt | model | StrOutputParser()

In [43]:
# Configure the evaluation to use the qa evaluator for grading correctness
eval_config = RunEvalConfig(
    evaluators = ["qa"]
)

client.run_on_dataset(
    dataset_name = "unstruct_qa_evals", 
    llm_or_chain_factory = rag_chain, 
    evaluation = eval_config, 
    verbose = True
)

View the evaluation results for project 'whispered-taste-64' at:
https://smith.langchain.com/o/bed0d9b9-d8f5-5181-ba80-dac89a730fec/datasets/a4b58ebc-7b6c-42af-9baf-a2d975e890c8/compare?selectedSessions=9b3060c0-5bfc-4702-a99b-68c11c7edcc0

View all tests for Dataset unstruct_qa_evals at:
https://smith.langchain.com/o/bed0d9b9-d8f5-5181-ba80-dac89a730fec/datasets/a4b58ebc-7b6c-42af-9baf-a2d975e890c8
[------------------------------------------------->] 2/2

Unnamed: 0,feedback.correctness,error,execution_time,run_id
count,2.0,0.0,2.0,2
unique,,0.0,,2
top,,,,b7d559b8-b746-4d5e-a901-cc00c1de73d9
freq,,,,1
mean,1.0,,4.035957,
std,0.0,,3.035365,
min,1.0,,1.88963,
25%,1.0,,2.962794,
50%,1.0,,4.035957,
75%,1.0,,5.10912,


{'project_name': 'whispered-taste-64',
 'results': {'c1a04913-6e28-439f-9795-d03cbb4f3fc6': {'input': {'question': "What's a langsmith dataset?"},
   'feedback': [EvaluationResult(key='correctness', score=1, value='CORRECT', comment='CORRECT', correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('4c81e3ac-94a9-41c7-9f32-ee44e2ab839f'))}, feedback_config=None, source_run_id=None, target_run_id=None, extra=None)],
   'execution_time': 6.182284,
   'run_id': 'b7d559b8-b746-4d5e-a901-cc00c1de73d9',
   'output': 'A LangSmith dataset is a collection of data used within the LangChain framework to facilitate the training, evaluation, and testing of language models. These datasets can include various types of data, such as text, images, or other media, and are used to improve the performance and accuracy of language models by providing them with relevant and diverse examples. LangSmith datasets are typically organized and managed to ensure they are easily accessible and usable for diff

#### Modifying the prompt and re-running the evaluation

In [44]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from datetime import datetime

# Define the prompt template that will be sent to the LLM.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful documentation Q&A assistant, trained to answer"
            " questions from LangSmith's documentation."
            " LangChain is a framework for building applications using large language models."
            "\nThe current time is {time}.\n\nRelevant documents will be retrieved in the following messages.",
        ),
        ("system", "{{context}}"), 
        ("human", "{question}"), 
        (
            "system",
            "Respond as best as you can. if no documents are retrieved or if you cannot find an answer in the retrieved documents,"
            "admit you do not know or that you can't find the supported results."
        ) 
    ]
).partial(time=str(datetime.now()))

# Initialize the LLM. We use a model with a large context window and low temperature for more factual responses.
model = ChatOpenAI(model="gpt-4o", temperature=0)

# Define the generation chain. It pipes the prompt to the model and then to an output parser.
rag_chain = prompt | model | StrOutputParser()

In [45]:
# Configure the evaluation to use the qa evaluator for grading correctness
eval_config = RunEvalConfig(
    evaluators = ["qa"]
)

client.run_on_dataset(
    dataset_name = "unstruct_qa_evals", 
    llm_or_chain_factory = rag_chain, 
    evaluation = eval_config, 
    verbose = True
)

View the evaluation results for project 'flowery-laugh-57' at:
https://smith.langchain.com/o/bed0d9b9-d8f5-5181-ba80-dac89a730fec/datasets/a4b58ebc-7b6c-42af-9baf-a2d975e890c8/compare?selectedSessions=76959c59-be19-47e8-9b32-e6d788f8940c

View all tests for Dataset unstruct_qa_evals at:
https://smith.langchain.com/o/bed0d9b9-d8f5-5181-ba80-dac89a730fec/datasets/a4b58ebc-7b6c-42af-9baf-a2d975e890c8
[------------------------------------------------->] 2/2

Unnamed: 0,feedback.correctness,error,execution_time,run_id
count,2.0,0.0,2.0,2
unique,,0.0,,2
top,,,,b778a328-691b-4a34-9ba7-81838d4cdd4d
freq,,,,1
mean,0.5,,2.671546,
std,0.707107,,0.156374,
min,0.0,,2.560973,
25%,0.25,,2.616259,
50%,0.5,,2.671546,
75%,0.75,,2.726832,


{'project_name': 'flowery-laugh-57',
 'results': {'c1a04913-6e28-439f-9795-d03cbb4f3fc6': {'input': {'question': "What's a langsmith dataset?"},
   'feedback': [EvaluationResult(key='correctness', score=0, value='INCORRECT', comment='INCORRECT', correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('402c9aba-d8d0-4288-b14b-d8e906a73e6e'))}, feedback_config=None, source_run_id=None, target_run_id=None, extra=None)],
   'execution_time': 2.782119,
   'run_id': 'b778a328-691b-4a34-9ba7-81838d4cdd4d',
   'output': 'I can\'t find the specific details about a "langsmith dataset" in the provided documents. However, in the context of LangChain and similar frameworks, a dataset typically refers to a collection of data that can be used for training, testing, or evaluating language models. If "langsmith dataset" refers to a specific feature or concept within LangChain, I would need more detailed documentation to provide an accurate description.',
   'reference': {'answer': 'A LangSmith da

### Structured Data Comparison Evaluation

In [47]:
# Download a public dataset on LangSmith
dataset_url = "https:/smith.langchain.com/public/08ab7912-006e-4c00-a973-0f833e74907b/d"
dataset_name = "Contract Extraction Eval Data"

# Clone the downloaded dataset
client.clone_public_dataset(dataset_url, dataset_name = dataset_name)

Dataset(name='Contract Extraction Eval Data', description=None, data_type=<DataType.kv: 'kv'>, id=UUID('c8f2934d-a5a9-4900-bf59-7cc0f9bed759'), created_at=datetime.datetime(2025, 10, 16, 21, 6, 1, 771546, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2025, 10, 16, 21, 6, 1, 771546, tzinfo=datetime.timezone.utc), example_count=0, session_count=0, last_session_start_time=None, inputs_schema=None, outputs_schema=None)

In [51]:
from typing import List, Optional
from pydantic import BaseModel

# Define the schema of party's address
class Address(BaseModel):
    street: str
    city: str
    state: str
    
# Define the schema of party in the contract
class Party(BaseModel): 
    name: str
    address: Address

# Top-level schema for the entire contract
class Contract(BaseModel):     
    document_title: str
    effective_date: str
    parties: List[Party]


In [60]:
from langchain.chains import create_extraction_chain
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model = "gpt-4o", max_tokens = 3000)

extraction_chain = create_extraction_chain(Contract.model_json_schema(),llm)

In [None]:
from langchain.evaluation import EvaluatorType

eval_config = RunEvalConfig(
        evaluators = ["json_edit_distance"]
)

client.run_on_dataset(
    dataset_name = "Contract Extraction Eval Data", 
    llm_or_chain_factory = extraction_chain,
    evaluation = eval_config,
    input_mapper=lambda x: {"input": x["context"]},
    output_mapper = lambda x: x["text"],  
    verbose = True
)