# Experiments

### Setup

In [None]:
# Or you can use a .env file
from dotenv import load_dotenv
load_dotenv(dotenv_path="../../.env", override=True)

True

Here is the RAG Application that we've been working with throughout this course

In [None]:
import os
import tempfile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_openai import OpenAIEmbeddings
from langsmith import traceable
from openai import OpenAI
from typing import List
import nest_asyncio

# TODO: Configure this model!
MODEL_NAME = "gpt-4o"
MODEL_PROVIDER = "openai"
APP_VERSION = 1.0
RAG_SYSTEM_PROMPT = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the latest question in the conversation. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
"""

openai_client = OpenAI()

def get_vector_db_retriever():
    persist_path = os.path.join(tempfile.gettempdir(), "union.parquet")
    embd = OpenAIEmbeddings()

    # If vector store exists, then load it
    if os.path.exists(persist_path):
        vectorstore = SKLearnVectorStore(
            embedding=embd,
            persist_path=persist_path,
            serializer="parquet"
        )
        return vectorstore.as_retriever(lambda_mult=0)

    # Otherwise, index LangSmith documents and create new vector store
    ls_docs_sitemap_loader = SitemapLoader(web_path="https://docs.smith.langchain.com/sitemap.xml", continue_on_failure=True)
    ls_docs = ls_docs_sitemap_loader.load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500, chunk_overlap=0
    )
    doc_splits = text_splitter.split_documents(ls_docs)

    vectorstore = SKLearnVectorStore.from_documents(
        documents=doc_splits,
        embedding=embd,
        persist_path=persist_path,
        serializer="parquet"
    )
    vectorstore.persist()
    return vectorstore.as_retriever(lambda_mult=0)

nest_asyncio.apply()
retriever = get_vector_db_retriever()

"""
retrieve_documents
- Returns documents fetched from a vectorstore based on the user's question
"""
@traceable(run_type="chain")
def retrieve_documents(question: str):
    return retriever.invoke(question)

"""
generate_response
- Calls `call_openai` to generate a model response after formatting inputs
"""
@traceable(run_type="chain")
def generate_response(question: str, documents):
    formatted_docs = "\n\n".join(doc.page_content for doc in documents)
    messages = [
        {
            "role": "system",
            "content": RAG_SYSTEM_PROMPT
        },
        {
            "role": "user",
            "content": f"Context: {formatted_docs} \n\n Question: {question}"
        }
    ]
    return call_openai(messages)

"""
call_openai
- Returns the chat completion output from OpenAI
"""
@traceable(
    run_type="llm",
    metadata={
        "ls_provider": MODEL_PROVIDER,
        "ls_model_name": MODEL_NAME
    }
)
def call_openai(messages: List[dict]) -> str:
    return openai_client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
    )

"""
langsmith_rag
- Calls `retrieve_documents` to fetch documents
- Calls `generate_response` to generate a response based on the fetched documents
- Returns the model response
"""
@traceable(run_type="chain")
def langsmith_rag(question: str):
    documents = retrieve_documents(question)
    response = generate_response(question, documents)
    return response.choices[0].message.content


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [43]:
# Example usage
answer = langsmith_rag("WHAT IS THE DIFFERENCE BETWEEN LANGSMITH AND LANGCHAIN?")
print(answer)

LangSmith and LangChain serve different purposes within the AI and LLM ecosystems. LangSmith is a platform focused on LLM observability and evaluation, designed for scalability and primarily used for managing and optimizing AI applications. In contrast, LangChain is a framework for building applications powered by language models, with functionalities for various methods such as retrieval and prompt engineering.


### Experiment

Here is a code snippet that should look similar to what you see from the starter code!

There are a few important components here.

1. We have defined an Evaluator
2. We pipe our dataset examples (dict) to the shape of input that our function `langsmith_rag` takes (str) using a target function

In [44]:
from langsmith import evaluate, Client

client = Client()
dataset_name = "RAG Application Golden Dataset"

# Step 1: Check if dataset exists, if not create it
try:
    dataset = client.read_dataset(dataset_name=dataset_name)
    print(f" Dataset '{dataset_name}' found with {dataset.example_count} examples")
except Exception:
    print(f" Dataset '{dataset_name}' not found. Creating it now...")
    
    # Create the dataset
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Golden dataset for RAG application evaluation"
    )
    
    # Add example data
    example_inputs = [
        ("How do I set up tracing to LangSmith if I'm using LangChain?", "To set up tracing to LangSmith while using LangChain, you need to set the environment variable `LANGSMITH_TRACING` to 'true'. Additionally, you must set the `LANGSMITH_API_KEY` environment variable to your API key."),
        ("What is LangSmith used for?", "LangSmith is a platform designed for the development, monitoring, and testing of LLM applications."),
        # Add more examples as needed
    ]
    
    inputs = [{"question": q} for q, _ in example_inputs]
    outputs = [{"output": a} for _, a in example_inputs]
    
    client.create_examples(
        inputs=inputs,
        outputs=outputs,
        dataset_id=dataset.id,
    )
    print(f" Dataset created with {len(example_inputs)} examples")

# Step 2: Define evaluators
def is_concise_enough(reference_outputs: dict, outputs: dict) -> dict:
    score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
    return {"key": "is_concise", "score": int(score)}

# Step 3: Define target function
def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

# Step 4: Run evaluation
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gpt-4o"
)

 Dataset 'RAG Application Golden Dataset' found with 2 examples
View the evaluation results for experiment: 'gpt-4o-4200cf06' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=ddbff85a-6090-4864-b796-c40443b7181b




2it [00:05,  2.86s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,What is LangSmith used for?,LangSmith is used for building production-grad...,,LangSmith is a platform designed for the devel...,0,2.022388,b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97,c326fa82-da8b-477d-81d8-202fda290707
1,How do I set up tracing to LangSmith if I'm us...,To set up tracing to LangSmith using LangChain...,,To set up tracing to LangSmith while using Lan...,0,3.207884,9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7,f4dfef81-ad14-41ec-b208-37de1540f989


In [None]:
from langsmith import evaluate, Client

client = Client()
dataset_name = "RAG Application Golden Dataset"

def is_concise_enough(reference_outputs: dict, outputs: dict) -> dict:
    score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
    return {"key": "is_concise", "score": int(score)}

def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gpt-4o"
)

View the evaluation results for experiment: 'gpt-4o-51d34e6f' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=7fbc4bf1-717f-413f-8e9b-73a46a2d1549




2it [00:07,  3.52s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,How do I set up tracing to LangSmith if I'm us...,To set up tracing with LangSmith using LangCha...,,To set up tracing to LangSmith while using Lan...,0,3.533369,9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7,4e035484-da9c-436c-987b-32d75a5eb715
1,What is LangSmith used for?,LangSmith is used for building production-grad...,,LangSmith is a platform designed for the devel...,0,2.836299,b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97,f3945f05-e855-4838-9a72-604e0af1f148


### Modifying your Application

Now, let's change our model to gpt-35-turbo and see how it performs!

Make this change, and then run this code snippet!

In [None]:
from langsmith import evaluate, Client
from langsmith.schemas import Example, Run

def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gpt-3.5-turbo"
)

View the evaluation results for experiment: 'gpt-3.5-turbo-d8642752' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=87db9346-4dc6-4bc8-9fe2-5e121bda34a5




2it [00:05,  2.61s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,How do I set up tracing to LangSmith if I'm us...,To set up tracing with LangSmith while using L...,,To set up tracing to LangSmith while using Lan...,0,2.325508,9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7,4e5dc2a2-e031-46ae-91f7-b0d6cdea961a
1,What is LangSmith used for?,LangSmith is a platform used for building prod...,,LangSmith is a platform designed for the devel...,0,2.4064,b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97,7291544c-b801-463d-a71b-d178cb593575


### Running over Different pieces of Data

##### Dataset Version

You can execute an experiment on a specific version of a dataset in the sdk by using the `as_of` parameter in `list_examples`

Let's try running on just our initial dataset.

In [41]:
from langsmith import evaluate, Client

client = Client()
dataset_name = "RAG Application Golden Dataset"

# Step 1: Create or get dataset
try:
    dataset = client.read_dataset(dataset_name=dataset_name)
    print(f" Found existing dataset: {dataset_name}")
except:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Golden dataset for RAG application evaluation"
    )
    print(f" Created new dataset: {dataset_name}")

# Step 2: Add examples if dataset is empty
existing_examples = list(client.list_examples(dataset_name=dataset_name))
print(f" Current examples: {len(existing_examples)}")

if len(existing_examples) == 0:
    print("➕ Adding examples to dataset...")
    
    ### examples are added to the file 
    examples_data = [
        {
            "inputs": {"question": "What is machine learning?"},
            "outputs": {"output": "Machine learning is a subset of AI that enables computers to learn from data without explicit programming."}
        },
        {
            "inputs": {"question": "What is deep learning?"},
            "outputs": {"output": "Deep learning uses neural networks with multiple layers to process complex patterns in data."}
        },
        {
            "inputs": {"question": "What is natural language processing?"},
            "outputs": {"output": "NLP is a field that helps computers understand and process human language."}
        },
        {
            "inputs": {"question": "What is reinforcement learning?"},
            "outputs": {"output": "Reinforcement learning trains models through rewards and penalties based on their actions."}
        },
        {
            "inputs": {"question": "What is a neural network?"},
            "outputs": {"output": "A neural network is a computational model inspired by biological brains, consisting of interconnected nodes."}
        },
        {
            "inputs": {"question": "What is supervised learning?"},
            "outputs": {"output": "Supervised learning trains models on labeled data to predict outcomes for new inputs."}
        },
        {
            "inputs": {"question": "What is unsupervised learning?"},
            "outputs": {"output": "Unsupervised learning finds patterns in data without pre-existing labels."}
        },
        {
            "inputs": {"question": "What is transfer learning?"},
            "outputs": {"output": "Transfer learning reuses a pre-trained model as a starting point for a new task."}
        }
    ]
    
    # Add each example to the dataset
    for example in examples_data:
        client.create_example(
            inputs=example["inputs"],
            outputs=example["outputs"],
            dataset_id=dataset.id
        )
    
    print(f" Added {len(examples_data)} examples to dataset")
else:
    print(" Dataset already contains examples")

# Step 3: Verify examples were added
examples = list(client.list_examples(dataset_name=dataset_name))
print(f" Total examples in dataset: {len(examples)}")

# Step 4: Define evaluator
def is_concise_enough(run, example) -> dict:
    """Check if output is concise compared to reference"""
    outputs = run.outputs if hasattr(run, 'outputs') else {}
    reference_outputs = example.outputs if hasattr(example, 'outputs') else {}
    
    if not outputs or "output" not in outputs:
        return {"key": "is_concise", "score": 0}
    if not reference_outputs or "output" not in reference_outputs:
        return {"key": "is_concise", "score": 1}
    
    score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
    return {"key": "is_concise", "score": int(score)}

# Step 5: Define target function
def target_function(inputs: dict):
    """RAG function that processes questions"""
    question = inputs["question"]
    
    # TODO: Replace with your actual langsmith_rag implementation
    # For now, using a placeholder
    try:
        result = langsmith_rag(question)
        if isinstance(result, str):
            return {"output": result}
        return result
    except NameError:
        # If langsmith_rag is not defined, use placeholder
        return {"output": f"This is a response about {question}"}

# Step 6: Run evaluation
if len(examples) == 0:
    print(" Dataset is empty! Cannot run evaluation.")
else:
    print("\n Starting evaluation...")
    try:
        results = evaluate(
            target_function,
            data=dataset_name,
            evaluators=[is_concise_enough],
            experiment_prefix="initial dataset version"
        )
        print(" Evaluation complete!")
        print(f" Results: {results}")
    except Exception as e:
        print(f" Evaluation error: {e}")

 Found existing dataset: RAG Application Golden Dataset
 Current examples: 2
 Dataset already contains examples
 Total examples in dataset: 2

 Starting evaluation...
View the evaluation results for experiment: 'initial dataset version-f7ba2be9' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=30d061a4-e99c-4654-a109-7259da12d156




2it [01:56, 58.16s/it] 

 Evaluation complete!
 Results: <ExperimentResults initial dataset version-f7ba2be9>





In [11]:
evaluate(
    target_function,
    data=dataset_name,  # Simplest and most reliable
    evaluators=[is_concise_enough],
    experiment_prefix="initial dataset version"
)

View the evaluation results for experiment: 'initial dataset version-b6c17146' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=93690be0-63b3-477c-8add-ae4786d4e012




2it [00:07,  3.66s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,How do I set up tracing to LangSmith if I'm us...,To set up tracing to LangSmith using LangChain...,,To set up tracing to LangSmith while using Lan...,0,4.061921,9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7,90940da2-3760-4fe0-b1a2-b81859a583b2
1,What is LangSmith used for?,LangSmith is used for building production-grad...,,LangSmith is a platform designed for the devel...,0,2.774645,b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97,ddabc1b6-10c3-43ad-bd3f-8ce612f3928a


In [48]:
from langsmith import Client, evaluate

client = Client()
dataset_name = "RAG Application Golden Dataset"

###examples are added 


try:
    dataset = client.read_dataset(dataset_name=dataset_name)
    print(f" Found existing dataset: {dataset_name}")
except:
    dataset = client.create_dataset(dataset_name=dataset_name)
    print(f" Created new dataset: {dataset_name}")


examples_to_add = [
    {
        "question": "What is machine learning?",
        "output": "Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed."
    },
    {
        "question": "What is deep learning?",
        "output": "Deep learning is a subset of machine learning that uses neural networks with multiple layers to analyze data."
    },
    {
        "question": "What is natural language processing?",
        "output": "Natural language processing (NLP) is a field of AI that focuses on the interaction between computers and human language."
    },
    {
        "question": "What is a neural network?",
        "output": "A neural network is a computing system inspired by biological neural networks that processes information through interconnected nodes."
    },
    {
        "question": "What is supervised learning?",
        "output": "Supervised learning is a machine learning approach where models are trained on labeled data to make predictions."
    }
]

# Check if examples already exist
existing_examples = list(client.list_examples(dataset_name=dataset_name))
print(f" Current examples in dataset: {len(existing_examples)}")

if len(existing_examples) == 0:
    print(" Adding examples to dataset...")
    for example in examples_to_add:
        client.create_example(
            inputs={"question": example["question"]},
            outputs={"output": example["output"]},
            dataset_id=dataset.id
        )
    print(f" Added {len(examples_to_add)} examples to dataset")
else:
    print(" Dataset already has examples")

# Step 3: Verify dataset has examples
examples = list(client.list_examples(dataset_name=dataset_name))
print(f" Total examples in dataset: {len(examples)}")

if len(examples) == 0:
    print(" Dataset has no examples! Cannot run evaluation.")
else:
    # Step 4: Define evaluator
    def is_concise_enough(run, example) -> dict:
        """Evaluator that checks if output is concise enough"""
        prediction = run.outputs.get("output", "")
        reference = example.outputs.get("output", "")
        
        if not reference:
            return {"key": "is_concise", "score": 0}
        
        score = len(prediction) < 1.5 * len(reference)
        return {"key": "is_concise", "score": int(score)}
    
    # Step 5: Define target function
    def target_function(inputs: dict):
        """Your RAG function - replace with actual implementation"""
        question = inputs["question"]
        
        # TODO: Replace this with your actual RAG implementation
        # For now, using a simple response
        response = f"This is a response to: {question}"
        
        # Uncomment below when you have langsmith_rag function
        # result = langsmith_rag(question)
        # if isinstance(result, str):
        #     return {"output": result}
        # return result
        
        return {"output": response}
    
    # Step 6: Run evaluation
    print("\n Starting evaluation...")
    results = evaluate(
        target_function,
        data=dataset_name,
        evaluators=[is_concise_enough],
        experiment_prefix="initial dataset version"
    )
    print(" Evaluation complete!")
    print(f" Results: {results}")

 Found existing dataset: RAG Application Golden Dataset
 Current examples in dataset: 2
 Dataset already has examples
 Total examples in dataset: 2

 Starting evaluation...
View the evaluation results for experiment: 'initial dataset version-d3e75f81' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=7de41579-ff82-4c8d-83b2-8c6c3dad5633




2it [00:00,  3.39it/s]

 Evaluation complete!
 Results: <ExperimentResults initial dataset version-d3e75f81>





In [19]:
evaluate(
    target_function,
    data=dataset_name,  # <-- CHANGED THIS LINE
    evaluators=[is_concise_enough],
    experiment_prefix="initial dataset version"
)

View the evaluation results for experiment: 'initial dataset version-9e35c3dc' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=40530d2c-637b-4442-8e94-f2ae0234fabd




2it [00:06,  3.35s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,How do I set up tracing to LangSmith if I'm us...,To set up tracing to LangSmith using LangChain...,,To set up tracing to LangSmith while using Lan...,0,3.633917,9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7,2bd939bd-1f62-491a-a7ed-2a50a4231e3d
1,What is LangSmith used for?,LangSmith is used for building production-grad...,,LangSmith is a platform designed for the devel...,0,2.555561,b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97,201653de-98fb-412b-8f15-9f8f348281dc


##### Dataset Split

You can run an experiment on a specific split of your dataset, let's try running on the Crucial Examples split.

In [24]:
evaluate(
    target_function,
    data=dataset_name,  # Simplest approach
    evaluators=[is_concise_enough],
    experiment_prefix="Crucial Examples split"
)

View the evaluation results for experiment: 'Crucial Examples split-07aada19' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=b342e997-1a19-476c-8545-8ed92643400b




2it [00:05,  2.96s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,How do I set up tracing to LangSmith if I'm us...,To set up tracing to LangSmith when using Lang...,,To set up tracing to LangSmith while using Lan...,0,3.283367,9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7,8aca2243-5cfc-4a80-9f14-89953b162239
1,What is LangSmith used for?,LangSmith is used for building production-grad...,,LangSmith is a platform designed for the devel...,0,2.145844,b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97,966b9e5b-80d6-443a-9cdf-32890c76e99f


In [78]:
from langsmith import Client
from langsmith.evaluation import evaluate

client = Client()
dataset_name = "RAG Application Golden Dataset"

# Step 1: Get all examples
examples = list(client.list_examples(dataset_name=dataset_name))
print(f"Total examples: {len(examples)}")

if len(examples) == 0:
    print("⚠️ No examples in dataset!")
else:
    # Step 2: Mark some examples as "Crucial Examples"
    # Let's mark the first 3 examples as crucial
    crucial_example_ids = [ex.id for ex in examples[:3]]
    
    # Step 3: Update examples with the split
    for example_id in crucial_example_ids:
        client.update_example(
            example_id=example_id,
            splits=["Crucial Examples"]  # Add to the split
        )
    
    print(f" Added {len(crucial_example_ids)} examples to 'Crucial Examples' split")
    
    # Step 4: Now evaluate with the split
    # Convert generator to list for the data parameter
    crucial_examples = list(
        client.list_examples(
            dataset_name=dataset_name, 
            splits=["Crucial Examples"]
        )
    )
    
    evaluate(
        langsmith_rag,  # Your target function
        data=crucial_examples,  # Pass as a list
        evaluators=[is_concise_enough],  # Your evaluator
        experiment_prefix="Crucial Examples split"
    )

Total examples: 2


TypeError: Client.update_example() got an unexpected keyword argument 'splits'

In [77]:
# Alternative: Update each example individually
for example in examples[:2]:
    # Get current metadata
    current_metadata = example.metadata or {}
    
    # Update with split info in metadata
    client.update_example(
        example_id=example.id,
        metadata={
            **current_metadata,
            "split": "Crucial Examples"
        }
    )

In [65]:
from langsmith import Client

client = Client()
dataset_name = "RAG Application Golden Dataset"

# List all examples and their splits
examples = list(client.list_examples(dataset_name=dataset_name))

print("Examples and their splits:")
for ex in examples:
    splits = ex.splits if hasattr(ex, 'splits') else []
    print(f"  Example ID: {ex.id}, Splits: {splits}")

Examples and their splits:
  Example ID: b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97, Splits: []
  Example ID: 9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7, Splits: []


In [66]:
from langsmith import Client, evaluate

client = Client()
dataset_name = "RAG Application Golden Dataset"

# Check if any examples exist
examples = list(client.list_examples(dataset_name=dataset_name))
print(f"Found {len(examples)} examples")

if len(examples) == 0:
    print(" No examples in dataset! Add examples first.")
else:
    # Define evaluator and target function
    def is_concise_enough(reference_outputs: dict, outputs: dict) -> dict:
        score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
        return {"key": "is_concise", "score": int(score)}
    
    def target_function(inputs: dict):
        result = langsmith_rag(inputs["question"])
        if isinstance(result, str):
            return {"output": result}
        return result
    
    # Evaluate WITHOUT splits (will use all examples)
    results = evaluate(
        target_function,
        data=dataset_name,  # Use all examples
        evaluators=[is_concise_enough],
        experiment_prefix="all examples"
    )
    print(" Evaluation complete!")

Found 2 examples
View the evaluation results for experiment: 'all examples-2b8dd322' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=25e45a1e-7cbd-45c4-9abd-9c2409e9426a




1it [00:03,  3.02s/it]Error running target function: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************************************************************************************************************************cFEA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Traceback (most recent call last):
  File "/opt/anaconda3/envs/myenv/lib/python3.11/site-packages/langsmith/evaluation/_runner.py", line 1923, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "/var/folders/0k/l8w_v7614tb84g7thk3tbbhm0000gn/T/ipykernel_10571/117198349.py", line 19, in target_function
    result = langsmith_rag(inputs["question"])
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0k/l8w_v7614tb84g7thk3tbbhm0000gn/T/ipykernel_10571/252468420.py", line 110, in langsmith_rag
    documents = retrieve_documen

 Evaluation complete!





In [71]:
from langsmith import Client

client = Client()
dataset_name = "RAG Application Golden Dataset"

examples = list(client.list_examples(dataset_name=dataset_name))
print(f"Total examples: {len(examples)}")

if len(examples) > 0:
    crucial_example_ids = [ex.id for ex in examples[:3]]
    
    # Use metadata to mark crucial examples
    for example_id in crucial_example_ids:
        client.update_example(
            example_id=example_id,
            metadata={"is_crucial": True}
        )
    
    print(f" Marked {len(crucial_example_ids)} examples as crucial")
    
    # Evaluate using metadata filter
    evaluate(
        target_function,
        data=client.list_examples(
            dataset_name=dataset_name,
            metadata={"is_crucial": True}
        ),
        evaluators=[is_concise_enough],
        experiment_prefix="Crucial Examples"
    )

Total examples: 2
 Marked 2 examples as crucial
View the evaluation results for experiment: 'Crucial Examples-f9eff15b' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=d3553440-dc58-46f2-93f0-ab6ec3b8f1cb




0it [00:00, ?it/s]Error running target function: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************************************************************************************************************************cFEA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Traceback (most recent call last):
  File "/opt/anaconda3/envs/myenv/lib/python3.11/site-packages/langsmith/evaluation/_runner.py", line 1923, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "/var/folders/0k/l8w_v7614tb84g7thk3tbbhm0000gn/T/ipykernel_10571/117198349.py", line 19, in target_function
    result = langsmith_rag(inputs["question"])
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0k/l8w_v7614tb84g7thk3tbbhm0000gn/T/ipykernel_10571/252468420.py", line 110, in langsmith_rag
    documents = retrieve_documents(q

In [70]:
evaluate(
    target_function,
    data=client.list_examples(dataset_name=dataset_name, splits=["Crucial Examples"]),  # We pass in a list of Splits
    evaluators=[is_concise_enough],
    experiment_prefix="Crucial Examples split"
)

StopIteration: 

##### Specific Data Points

You can specify individual data points to run an experiment over as well

In [61]:
from langsmith import Client
from langsmith.evaluation import evaluate

client = Client()
dataset_name = "RAG Application Golden Dataset"

# Step 1: List all examples and get their IDs
print("Fetching examples from dataset...")
examples = list(client.list_examples(dataset_name=dataset_name))

print(f"\nTotal examples in dataset: {len(examples)}")
print("\n" + "=" * 70)
print("AVAILABLE EXAMPLES")
print("=" * 70)

# Display all examples with their IDs
for i, ex in enumerate(examples, 1):
    print(f"\n{i}. Example ID: {ex.id}")
    print(f"   Question: {ex.inputs.get('question', 'N/A')}")
    if hasattr(ex, 'outputs') and ex.outputs:
        print(f"   Answer: {ex.outputs.get('answer', ex.outputs.get('output', 'N/A'))}")

# Step 2: Select specific example IDs (first two)
if len(examples) >= 2:
    example_id_1 = examples[0].id
    example_id_2 = examples[1].id
    
    print("\n" + "=" * 70)
    print("SELECTED EXAMPLES FOR EVALUATION")
    print("=" * 70)
    print(f"Example 1: {example_id_1}")
    print(f"Example 2: {example_id_2}")
    
    # Step 3: Define evaluator
    def is_concise_enough(run, example) -> dict:
        """Check if answer is concise (under 150 characters)"""
        prediction = run.outputs.get("output", run.outputs.get("answer", ""))
        is_concise = len(prediction) < 150
        
        return {
            "key": "is_concise",
            "score": 1 if is_concise else 0,
            "comment": f"Length: {len(prediction)} chars"
        }
    
    # Step 4: Run evaluation on specific examples
    print("\n" + "=" * 70)
    print("RUNNING EVALUATION")
    print("=" * 70)
    
    results = evaluate(
        langsmith_rag,  # Your RAG function
        data=client.list_examples(
            dataset_name=dataset_name, 
            example_ids=[
                example_id_1,
                example_id_2
            ]
        ),
        evaluators=[is_concise_enough],
        experiment_prefix="two specific example ids"
    )
    
    print("\nEvaluation completed!")
    
else:
    print("\nNeed at least 2 examples in the dataset.")

Fetching examples from dataset...

Total examples in dataset: 2

AVAILABLE EXAMPLES

1. Example ID: 9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7
   Question: How do I set up tracing to LangSmith if I'm using LangChain?
   Answer: To set up tracing to LangSmith while using LangChain, you need to set the environment variable `LANGSMITH_TRACING` to 'true'. Additionally, you must set the `LANGSMITH_API_KEY` environment variable to your API key.

2. Example ID: b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97
   Question: What is LangSmith used for?
   Answer: LangSmith is a platform designed for the development, monitoring, and testing of LLM applications.

SELECTED EXAMPLES FOR EVALUATION
Example 1: 9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7
Example 2: b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97

RUNNING EVALUATION
View the evaluation results for experiment: 'two specific example ids-cd3d2c4c' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?sele

0it [00:00, ?it/s]Error running target function: argument 'text': 'dict' object cannot be converted to 'PyString'
Traceback (most recent call last):
  File "/opt/anaconda3/envs/myenv/lib/python3.11/site-packages/langsmith/evaluation/_runner.py", line 1923, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "/var/folders/0k/l8w_v7614tb84g7thk3tbbhm0000gn/T/ipykernel_10571/252468420.py", line 110, in langsmith_rag
    documents = retrieve_documents(question)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/0k/l8w_v7614tb84g7thk3tbbhm0000gn/T/ipykernel_10571/252468420.py", line 64, in retrieve_documents
    return retriever.invoke(question)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/myenv/lib/python3.11/site-packages/langchain_core/retrievers.py", line 263, in invoke
    result = self._get_relevant_documents(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/myenv/lib/python3.11/site-packages/langchain_core/vector


Evaluation completed!





### Other Parameters

##### Repetitions

You can run an experiment several times to make sure you have consistent results

In [62]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="two repetitions",
    num_repetitions=2   # This field defaults to 1
)

View the evaluation results for experiment: 'two repetitions-c5c98699' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=9f139f89-0ad8-44c7-afed-f6afcc76d599




4it [00:11,  2.86s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,How do I set up tracing to LangSmith if I'm us...,To set up tracing to LangSmith using LangChain...,,To set up tracing to LangSmith while using Lan...,0,3.031042,9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7,1e6a1667-ddd3-47c5-b897-29fdaf7fc958
1,What is LangSmith used for?,LangSmith is used for building production-grad...,,LangSmith is a platform designed for the devel...,0,2.230294,b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97,208c53d1-e21c-4ecb-bbfd-26689a2f8404
2,How do I set up tracing to LangSmith if I'm us...,"To set up tracing to LangSmith with LangChain,...",,To set up tracing to LangSmith while using Lan...,0,3.167418,9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7,c4cac408-61c2-4a04-8f8d-2826213ffbd2
3,What is LangSmith used for?,LangSmith is used for building production-grad...,,LangSmith is a platform designed for the devel...,0,2.311113,b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97,56faeafb-cc4b-43ae-93ac-8e80707047ca


##### Concurrency
You can also kick off concurrent threads of execution to make your experiments finish faster!

In [52]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="concurrency",
    max_concurrency=3,  # This defaults to None, so this is an improvement!
)

View the evaluation results for experiment: 'concurrency-caed46f8' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=f31ee50c-a62e-4b8e-bde1-a6ccf969bd51




2it [00:00,  3.56it/s]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,What is LangSmith used for?,This is a response to: What is LangSmith used ...,,LangSmith is a platform designed for the devel...,1,0.002934,b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97,c25854f2-a8f1-4938-bcf5-c297d24bf3c7
1,How do I set up tracing to LangSmith if I'm us...,This is a response to: How do I set up tracing...,,To set up tracing to LangSmith while using Lan...,1,0.000893,9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7,4b265411-0bf9-42d0-8929-5b440af19ef7


##### Metadata 

You can (and should) add metadata to your experiments, to make them easier to find in the UI

In [53]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="metadata added",
    metadata={  # We can pass custom metadata for the experiment, such as the model name
        "model_name": MODEL_NAME
    }
)

View the evaluation results for experiment: 'metadata added-3d119826' at:
https://smith.langchain.com/o/679b2151-9692-435f-9a40-bc673cc68d61/datasets/d97e38c5-b6d8-4686-95c7-dcd88d379157/compare?selectedSessions=50902942-1696-4a36-ad32-b8703a43487d




2it [00:00,  3.24it/s]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,What is LangSmith used for?,This is a response to: What is LangSmith used ...,,LangSmith is a platform designed for the devel...,1,0.000826,b6cb2dbe-8d8b-4458-bf94-c37bbabe1d97,1fd4bc12-33f2-4912-b4c8-b23fa3221ae6
1,How do I set up tracing to LangSmith if I'm us...,This is a response to: How do I set up tracing...,,To set up tracing to LangSmith while using Lan...,1,0.000368,9e85c6d6-adb7-4c3f-ae2c-9b5203b9cbc7,eff8a6e9-dac6-498e-a65c-2867df57dc44
