# Using the LLM as a judge: agent evaluations

### imports

In [54]:
%load_ext autoreload
%autoreload 2

# Standard libs
import os
from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm

import sys
sys.path.append("..")

# Project modules
from read import read_repo_data
from chunks import chunk_text
from utils import load_chunks_jsonl, save_chunks_jsonl, load_log_data_from_file
from search import create_text_index, create_vector_index, load_embedding_model
from agent import create_agent, run_agent, run_agent_async
from agent_tools import make_agent_tools
from log import log_interaction_to_file
from eval import generate_questions_from_chunks, evaluate_log_record

from schemas.evaluation import EvaluationChecklist, QuestionsList
from prompts.templates import user_prompt_template, qg_prompt_template

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### params

In [39]:
REPO_OWNER = "langchain-ai"
REPO_NAME = "langchain"
BRANCH = "master"

CHUNKS_FILE = "../langchain_chunks.jsonl"
SYSTEM_PROMPT = "../prompts/system_prompt.yml"
EVAL_PROMPT = "../prompts/eval_prompt.yml"
QG_PROMPT = "../prompts/quest_gen_prompt.yml"
LOG_DIR = Path('logs')
MODEL_NAME_RUN = "gpt-4o-mini"
MODEL_NAME_EVAL = "gpt-5-nano"

# 1. Create chunks

Create chunks if the file does not yet exist

In [8]:
if not Path(CHUNKS_FILE).exists():
    docs = read_repo_data(REPO_OWNER, REPO_NAME, branch=BRANCH)
    chunks = []
    for doc in docs:
        chunks.extend(chunk_text(doc["content"], method="section", level=2))
    save_chunks_jsonl(chunks, CHUNKS_FILE)
else:
    print("Chunks file already exists. Loading existing chunks...")
    chunks = load_chunks_jsonl(CHUNKS_FILE)

print(f"Loaded {len(chunks)} chunks from {CHUNKS_FILE}")


Chunks file already exists. Loading existing chunks...
Loaded 1451 chunks from ../langchain_chunks.jsonl


In [10]:
print(chunks[0].keys())

dict_keys(['start', 'end', 'chunk', 'filename'])


# 2. Build indexes and search tools

In [12]:
# Text index
text_index = create_text_index(chunks)

# Vector index
embedding_model = load_embedding_model()
vector_index = create_vector_index(chunks, embedding_model)

# Create agent search Tools
tools = make_agent_tools(text_index, vector_index, embedding_model)


In [13]:
tools[1]

<function agent_tools.make_agent_tools.<locals>.vector_search_tool(query: str, num_results: int = 5) -> List[Dict[str, Any]]>

# 3. Create the agents



In [14]:
# 0 text search, 1 vector search, 2 hybrid search
search_tool = tools[1]
# Create QA agent with search tools
qa_agent = create_agent(
    model_name=MODEL_NAME_RUN,
    prompt_file_path=SYSTEM_PROMPT,
    tools=[search_tool],
    agent_name="QA_Agent"
)
# Create evaluation agent
eval_agent = create_agent(
    model_name=MODEL_NAME_EVAL,
    prompt_file_path=EVAL_PROMPT,
    tools=[],  # eval agent doesn’t need search tools
    agent_name="Eval_Agent",
    output_type=EvaluationChecklist
)
# Create the question generation agent
qg_agent = create_agent(
    agent_name="question_generator",
    prompt_file_path=QG_PROMPT,
    model_name=MODEL_NAME_RUN,
    tools=[],  # questions agent doesn’t need search tools
    output_type=QuestionsList
)


# 4. Generate Eval questions

In [33]:
questions = await generate_questions_from_chunks(
    chunks,
    qg_agent,
    qg_prompt_template,
    num_questions=10
)

for q in questions:
    print(f"{q['filename']}: {q['question']}")

langchain-master/docs/docs/integrations/providers/zeusdb.mdx: How do I check the quantization status of the index in ZeusDB?
langchain-master/docs/docs/concepts/vectorstores.mdx: What are the key methods provided by LangChain for managing documents in a vector store?
langchain-master/docs/docs/additional_resources/arxiv_references.mdx: What is the AlphaCodium approach and how does it differ from traditional methods of code generation using LLMs?
langchain-master/docs/docs/integrations/providers/ctranslate2.mdx: What command do I need to run to install the ctranslate2 Python package?
langchain-master/docs/docs/integrations/providers/cassandra.mdx: How do I integrate Cassandra as a vector store in my Langchain project?
langchain-master/docs/docs/integrations/providers/baidu.mdx: What is the purpose of the `QianfanLLMEndpoint` in Langchain Community?
langchain-master/docs/docs/contributing/reference/repo_structure.mdx: What are the steps to contribute code to the LangChain codebase, and w

In [34]:
# Ensure output folder exists
out_dir = Path("../questions")
out_dir.mkdir(exist_ok=True)

# Create unique filename with timestamp
questions_file = out_dir / f"questions_{datetime.now().strftime('%Y%m%d%H%M%S')}.jsonl"

# Save the list of question dicts
save_chunks_jsonl(questions, str(questions_file))

print(f"✅ Saved {len(questions)} questions to {questions_file}")

✅ Saved 10 questions to ../questions/questions_20250930010308.jsonl


In [35]:
questions[0]['question']

'How do I check the quantization status of the index in ZeusDB?'

# 5. Generation of answers

In [36]:
# Iterate over generated questions
for q in tqdm(questions, desc="Evaluating questions"):
    print(f"❓ {q['question']}")

    # Run evaluation agent (sync version)
    result = await run_agent_async(qa_agent, q["question"])

    # Print model's structured output or text
    print("✅ Response Result:")
    print(result.output)  # or pprint(result.output) if it's a dict/list

    # Save log of interaction
    log_interaction_to_file(
        eval_agent,
        result.new_messages(),
        source="ai-generated"
    )

    print("-" * 60)

Evaluating questions:   0%|          | 0/10 [00:00<?, ?it/s]

❓ How do I check the quantization status of the index in ZeusDB?
✅ Response Result:
To check the quantization status of the index in ZeusDB, you can use the following code snippet in your application:

```python
# Check quantization status
if vector_store.is_quantized():
    progress = vector_store.get_training_progress()
    print(f"Quantization training: {progress:.1f}% complete")
else:
    print("Index is not quantized")
```

### Steps:
- Call `vector_store.is_quantized()` to determine if the index is quantized.
- If it is quantized, retrieve and print the training progress using `vector_store.get_training_progress()`.
- If it's not quantized, it will print that the index is not quantized.

This method allows you to easily monitor the quantization status and its progress.

### Example Output:
- If the index is not quantized: "Index is not quantized"
- If it is quantized: "Quantization training: X% complete"

Sources:
- langchain-master/docs/docs/integrations/providers/zeusdb.mdx (Mo

# 6. Evaluate responses

In [43]:
# Build eval dataset
eval_set = []

for log_file in LOG_DIR.glob('*.json'):
    # Only use ai-generated question logs
    log_record = load_log_data_from_file(log_file)
    if log_record['source'] != 'ai-generated':
        continue

    eval_set.append(log_record)

In [52]:
eval_set[0]

{'agent_name': 'Eval_Agent',
 'system_prompt': "Use this checklist to evaluate the quality of an AI agent's answer (<ANSWER>) to a user question (<QUESTION>).\nWe also include the entire log (<LOG>) for analysis.\n\nFor each item, check if the condition is met.\n\nChecklist:\n\n- instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)\n- instructions_avoid: The agent avoided doing things it was told not to do\n- answer_relevant: The response directly addresses the user's question\n- answer_clear: The answer is clear and correct\n- answer_citations: The response includes proper citations or sources when required\n- completeness: The response is complete and covers all key aspects of the request\n- tool_call_search: Is the search tool invoked?\n\nOutput true/false for each check and provide a short explanation for your judgment.",
 'provider': 'openai',
 'model': 'gpt-5-nano',
 'tools': [],
 'messages': [{'parts': [{'content': 'What are the key methods provide

In [57]:
eval_results = []

for log_record in tqdm(eval_set):
    eval_result = await evaluate_log_record(eval_agent, log_record)
    eval_results.append((log_record, eval_result))

  0%|          | 0/10 [00:00<?, ?it/s]

In [67]:
eval_results[0][1]

EvaluationChecklist(checklist=[EvaluationCheck(check_name='instructions_follow', justification="The user asked to use this checklist to evaluate the answer; the assistant's answer did not itself perform an evaluation per the checklist and instead provided content answering the question. This does not follow the meta-instruction to produce an evaluation.", check_pass=False), EvaluationCheck(check_name='instructions_avoid', justification='No disallowed instructions or content present; the answer is safe. True.', check_pass=True), EvaluationCheck(check_name='answer_relevant', justification="The answer directly addresses the user's question by listing key LangChain vector store management methods (add_documents, delete, similarity_search). True.", check_pass=True), EvaluationCheck(check_name='answer_clear', justification="The answer is clearly structured with bullet points and examples; it's easy to understand. True.", check_pass=True), EvaluationCheck(check_name='answer_citations', justif

In [71]:
rows = []

for log_record, eval_result in eval_results:
    messages = log_record['messages']

    row = {
        'file': log_record['log_file_path'].name,
        'question': messages[0]['parts'][0]['content'],
        'answer': messages[-1]['parts'][0]['content'],
    }
    # Add checks to row dict
    checks = {c.check_name: c.check_pass for c in eval_result.checklist}
    row.update(checks)
    # Add justifications to row dict
    justifications = {f"{c.check_name}_justification": c.justification for c in eval_result.checklist}
    row.update(justifications)

    rows.append(row)

In [74]:
import pandas as pd

df = pd.DataFrame(rows)
df

Unnamed: 0,file,question,answer,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations,completeness,tool_call_search,instructions_follow_justification,instructions_avoid_justification,answer_relevant_justification,answer_clear_justification,answer_citations_justification,completeness_justification,tool_call_search_justification
0,Eval_Agent_20250930_000339_a3a8e6.json,What are the key methods provided by LangChain...,# Key Methods for Managing Documents in LangCh...,False,True,True,True,True,True,False,The user asked to use this checklist to evalua...,No disallowed instructions or content present;...,The answer directly addresses the user's quest...,The answer is clearly structured with bullet p...,Citations/sources are provided in a 'Sources' ...,"Covers core methods; while not exhaustive, it ...",No search tool was invoked in the answer conte...
1,Eval_Agent_20250930_000359_8a0a2c.json,What command do I need to run to install the c...,"To install the `ctranslate2` Python package, y...",True,True,True,True,True,True,False,The answer provided the correct install comman...,No disallowed content or actions were present ...,The answer directly answers: what command to r...,The command is clear and presented in a code b...,The answer included a sources section citing t...,The answer provides the exact command and a so...,No search tool was invoked in the answer; tool...
2,Eval_Agent_20250930_000418_d9fda7.json,What is the purpose of the `QianfanLLMEndpoint...,## Purpose of `QianfanLLMEndpoint`\n\nThe `Qia...,False,True,False,False,True,False,True,The user asked to evaluate the agent's answer ...,No disallowed instruction was violated; there ...,The answer addressed the question about the Qi...,Because the response did not address the evalu...,The answer included a sources section and cita...,The evaluation task was not completed; therefo...,A vector_search_tool was invoked earlier in th...
3,Eval_Agent_20250930_000348_61a5ed.json,What is the AlphaCodium approach and how does ...,## AlphaCodium Approach vs. Traditional Method...,False,True,False,False,True,False,True,The answer did not perform the task of evaluat...,No disallowed content; no irrelevant or unsafe...,The answer did not address the user's question...,"Although the content is clear, it does not add...",The answer includes a citation to a source.,The answer does not cover the evaluation; inco...,The log shows a vector_search_tool call occurr...
4,Eval_Agent_20250930_000406_b41d5d.json,How do I integrate Cassandra as a vector store...,## Integrating Cassandra as a Vector Store in ...,False,True,True,True,True,True,False,The user asked to use the checklist to evaluat...,There were no prohibitions in the user instruc...,The answer content directly addresses how to i...,The steps are clearly structured with code blo...,Includes references and links to documentation...,"Covers importing the Cassandra vector store, o...",No search tool was invoked within the answer c...
5,Eval_Agent_20250930_000426_f1ba33.json,What are the steps to contribute code to the L...,"To contribute code to the LangChain codebase, ...",True,True,True,True,True,True,True,Invoked a tool to comply with the meta-instruc...,No disallowed content; answer will be provided...,The answer will address steps to contribute co...,The forthcoming answer will be clear and struc...,Citations/sources will be provided as links or...,Will include both steps and locations for guid...,A tool was invoked to comply with execution po...
6,Eval_Agent_20250930_000329_089c82.json,How do I check the quantization status of the ...,To check the quantization status of the index ...,True,True,True,True,True,True,True,The evaluation process followed the user's ins...,No restricted actions were specified by the us...,The evaluation assesses whether the provided a...,The evaluation notes that the answer is clear ...,The answer includes a source citation to langc...,"The evaluation deems the answer complete, cove...",The log shows a tool call (vector_search_tool)...
7,Eval_Agent_20250930_000434_a540a0.json,How can I use the .bind_tools() method to conn...,To connect multiple tools to your LangChain mo...,,,,,,,,,,,,,,
8,Eval_Agent_20250930_000448_c5dfde.json,What is the command to install the langchain-s...,I don't know based on the repository documenta...,False,True,False,False,True,False,False,The user asked for an evaluation checklist; th...,No disallowed content; nothing inappropriate i...,The answer did not address the user's specific...,The answer was a simple negation with no actio...,No citations are required for this non-citatio...,The response missing the actual command; not c...,No search tool was invoked in the answer.
9,Eval_Agent_20250930_000442_0cb851.json,How do you start the LangServe application usi...,To start the LangServe application using the c...,True,True,True,True,True,True,False,The evaluation used the provided checklist to ...,The evaluation does not introduce extraneous c...,The evaluation directly assesses whether the g...,The evaluation clearly states which aspects we...,The evaluation notes that the given answer inc...,All key aspects of the evaluation checklist ar...,The evaluation itself did not perform a search...


In [75]:
# one row has no assessment need to check drop for now
df = df.dropna()
df

Unnamed: 0,file,question,answer,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations,completeness,tool_call_search,instructions_follow_justification,instructions_avoid_justification,answer_relevant_justification,answer_clear_justification,answer_citations_justification,completeness_justification,tool_call_search_justification
0,Eval_Agent_20250930_000339_a3a8e6.json,What are the key methods provided by LangChain...,# Key Methods for Managing Documents in LangCh...,False,True,True,True,True,True,False,The user asked to use this checklist to evalua...,No disallowed instructions or content present;...,The answer directly addresses the user's quest...,The answer is clearly structured with bullet p...,Citations/sources are provided in a 'Sources' ...,"Covers core methods; while not exhaustive, it ...",No search tool was invoked in the answer conte...
1,Eval_Agent_20250930_000359_8a0a2c.json,What command do I need to run to install the c...,"To install the `ctranslate2` Python package, y...",True,True,True,True,True,True,False,The answer provided the correct install comman...,No disallowed content or actions were present ...,The answer directly answers: what command to r...,The command is clear and presented in a code b...,The answer included a sources section citing t...,The answer provides the exact command and a so...,No search tool was invoked in the answer; tool...
2,Eval_Agent_20250930_000418_d9fda7.json,What is the purpose of the `QianfanLLMEndpoint...,## Purpose of `QianfanLLMEndpoint`\n\nThe `Qia...,False,True,False,False,True,False,True,The user asked to evaluate the agent's answer ...,No disallowed instruction was violated; there ...,The answer addressed the question about the Qi...,Because the response did not address the evalu...,The answer included a sources section and cita...,The evaluation task was not completed; therefo...,A vector_search_tool was invoked earlier in th...
3,Eval_Agent_20250930_000348_61a5ed.json,What is the AlphaCodium approach and how does ...,## AlphaCodium Approach vs. Traditional Method...,False,True,False,False,True,False,True,The answer did not perform the task of evaluat...,No disallowed content; no irrelevant or unsafe...,The answer did not address the user's question...,"Although the content is clear, it does not add...",The answer includes a citation to a source.,The answer does not cover the evaluation; inco...,The log shows a vector_search_tool call occurr...
4,Eval_Agent_20250930_000406_b41d5d.json,How do I integrate Cassandra as a vector store...,## Integrating Cassandra as a Vector Store in ...,False,True,True,True,True,True,False,The user asked to use the checklist to evaluat...,There were no prohibitions in the user instruc...,The answer content directly addresses how to i...,The steps are clearly structured with code blo...,Includes references and links to documentation...,"Covers importing the Cassandra vector store, o...",No search tool was invoked within the answer c...
5,Eval_Agent_20250930_000426_f1ba33.json,What are the steps to contribute code to the L...,"To contribute code to the LangChain codebase, ...",True,True,True,True,True,True,True,Invoked a tool to comply with the meta-instruc...,No disallowed content; answer will be provided...,The answer will address steps to contribute co...,The forthcoming answer will be clear and struc...,Citations/sources will be provided as links or...,Will include both steps and locations for guid...,A tool was invoked to comply with execution po...
6,Eval_Agent_20250930_000329_089c82.json,How do I check the quantization status of the ...,To check the quantization status of the index ...,True,True,True,True,True,True,True,The evaluation process followed the user's ins...,No restricted actions were specified by the us...,The evaluation assesses whether the provided a...,The evaluation notes that the answer is clear ...,The answer includes a source citation to langc...,"The evaluation deems the answer complete, cove...",The log shows a tool call (vector_search_tool)...
8,Eval_Agent_20250930_000448_c5dfde.json,What is the command to install the langchain-s...,I don't know based on the repository documenta...,False,True,False,False,True,False,False,The user asked for an evaluation checklist; th...,No disallowed content; nothing inappropriate i...,The answer did not address the user's specific...,The answer was a simple negation with no actio...,No citations are required for this non-citatio...,The response missing the actual command; not c...,No search tool was invoked in the answer.
9,Eval_Agent_20250930_000442_0cb851.json,How do you start the LangServe application usi...,To start the LangServe application using the c...,True,True,True,True,True,True,False,The evaluation used the provided checklist to ...,The evaluation does not introduce extraneous c...,The evaluation directly assesses whether the g...,The evaluation clearly states which aspects we...,The evaluation notes that the given answer inc...,All key aspects of the evaluation checklist ar...,The evaluation itself did not perform a search...


In [86]:
checks = ['instructions_follow',
  'instructions_avoid',
  'answer_relevant',
  'answer_clear',
  'answer_citations']

df[checks].astype(int).describe()

Unnamed: 0,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations
count,9.0,9.0,9.0,9.0,9.0
mean,0.444444,1.0,0.666667,0.666667,1.0
std,0.527046,0.0,0.5,0.5,0.0
min,0.0,1.0,0.0,0.0,1.0
25%,0.0,1.0,0.0,0.0,1.0
50%,0.0,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0


In [None]:
df.to_csv('eval_results.csv', index=False)