In [4]:
# Create dataset
# We will create 5 datapoints to evaluate on. We will be evaluating a question-answering
# application. The input will be a question, and the output will be an answer.
# Since this is a question-answering application, we can define the expected answer.
# Let's see how to create and upload this dataset to LangSmith.

In [3]:
from langsmith import Client

client = Client()

# Define dataset: these are your test cases
dataset_name = "QA Example Dataset"

# (question, expected answer) pairs; keeping each pair on one line makes it
# impossible for the inputs and outputs lists to drift out of alignment.
qa_pairs = [
    ("What is LangChain?", "A framework for building LLM applications"),
    ("What is LangSmith?", "A platform for observing and evaluating LLM applications"),
    ("What is OpenAI?", "A company that creates Large Language Models"),
    ("What is Google?", "A technology company known for search"),
    ("What is Mistral?", "A company that creates Large Language Models"),
]

# Make the cell safe to re-run (e.g. Restart Kernel -> Run All): if a dataset
# with this name already exists, reuse it instead of trying to create it again
# and uploading the same five examples a second time.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name)
    client.create_examples(
        inputs=[{"question": question} for question, _ in qa_pairs],
        outputs=[{"answer": answer} for _, answer in qa_pairs],
        dataset_id=dataset.id,
    )

In [5]:
# Now, if we go to the LangSmith UI and look for QA Example Dataset in the Datasets & Testing page,
# when we click into it we should see that
# we have five new examples.

Define Metrics <br>
For the first, we will use an LLM to judge whether the output is correct <br> (with respect to the expected output). This LLM-as-a-judge approach is relatively common for cases that are too <br>complex to measure with a simple function. We can define our own prompt and LLM to use for evaluation here:

In [6]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts.prompt import PromptTemplate
from langsmith.evaluation import LangChainStringEvaluator

# Grading prompt: the judge sees the question ({query}), the reference answer
# ({answer}) and the application's answer ({result}).
_PROMPT_TEMPLATE = """You are an expert professor specialized in grading students' answers to questions.
You are grading the following question:
{query}
Here is the real answer:
{answer}
You are grading the following predicted answer:
{result}
Respond with CORRECT or INCORRECT:
Grade:
"""

PROMPT = PromptTemplate(
    template=_PROMPT_TEMPLATE,
    input_variables=["query", "answer", "result"],
)

# LLM used as the judge; temperature is set to 0.0.
eval_llm = ChatOpenAI(temperature=0.0)

# "qa" string evaluator, configured with the judge LLM and the prompt above.
qa_evaluator = LangChainStringEvaluator(
    "qa",
    config={"llm": eval_llm, "prompt": PROMPT},
)

For evaluating the length of the response, this is a lot easier! We can just define a simple function that checks whether the actual output is less than 2x the length of the expected result.

In [7]:
from langsmith.schemas import Run, Example

def evaluate_length(run: Run, example: Example) -> dict:
    """Score whether the prediction is shorter than twice the reference answer.

    Args:
        run: The application run; its ``outputs`` mapping is read for the
            ``"output"`` key.
        example: The dataset example; its ``outputs`` mapping is read for the
            ``"answer"`` key.

    Returns:
        A dict ``{"key": "length", "score": s}`` where ``s`` is 1 when
        ``len(prediction) < 2 * len(reference)`` and 0 otherwise.
    """
    # Either outputs mapping may be None (for instance when the run produced
    # no outputs); treat that as "no text" instead of raising AttributeError.
    run_outputs = run.outputs or {}
    reference_outputs = example.outputs or {}

    prediction = run_outputs.get("output") or ""
    required = reference_outputs.get("answer") or ""

    score = int(len(prediction) < 2 * len(required))
    return {"key": "length", "score": score}

<h1>Run Evaluations</h1>
Great! So now how do we run evaluations? Now that we have a dataset and evaluators, all that we need is our application! We will build a simple application that just has a system message with instructions on how to respond and then passes it to the LLM. We will build this using the OpenAI SDK directly.

In [8]:
import openai

openai_client = openai.Client()

# System instruction used when the caller does not supply one.
_DEFAULT_SYSTEM_PROMPT = "Respond to the users question in a short, concise manner (one short sentence)."


def my_app(question, model="gpt-3.5-turbo", system_prompt=_DEFAULT_SYSTEM_PROMPT):
    """Answer ``question`` with a single chat-completion call.

    Args:
        question: The user's question, sent as the user message.
        model: Name of the chat model to call. Defaults to "gpt-3.5-turbo".
        system_prompt: Instruction sent as the system message. Defaults to
            the short-answer instruction defined above.

    Returns:
        The message content of the first choice in the response.
    """
    response = openai_client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ],
    )
    return response.choices[0].message.content

Before running this through LangSmith evaluations, we need to define a simple wrapper that maps the input keys from our dataset to the function we want to call, and then also maps the output of the function to the output key we expect.

In [9]:
def langsmith_app(inputs):
    """Adapt ``my_app`` to the dataset's key layout.

    Reads the question from ``inputs["question"]`` and returns the answer
    under the ``"output"`` key.
    """
    question = inputs["question"]
    return {"output": my_app(question)}

In [12]:
## Great! Now we're ready to run evaluation. Let's do it! 
# Click on link to see evaluation results

In [11]:
from langsmith.evaluation import evaluate

# Run the gpt-3.5-turbo application over the dataset and score every result.
experiment_results = evaluate(
    # Your AI system
    langsmith_app,
    # The data to predict and grade over
    data=dataset_name,
    # The evaluators to score the results
    evaluators=[evaluate_length, qa_evaluator],
    # A prefix for your experiment names to easily identify them
    experiment_prefix="openai-3.5",
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'openai-3.5-5c2d8981' at:
https://smith.langchain.com/o/36dd4cf9-88ca-5121-8019-46bec09db125/datasets/c9d53d0c-81f3-4e08-b803-d3f5ab1f8326/compare?selectedSessions=1c9f2e6f-d2ce-4130-8211-284c0f53d125




5it [00:08,  1.68s/it]


In [13]:
# Let's now try it out with a different model! Let's try gpt-4-turbo

import openai

openai_client = openai.Client()

def my_app_1(question):
    """Answer ``question`` with gpt-4-turbo using the short-answer system prompt.

    Returns the message content of the first choice in the response.
    """
    system_message = {
        "role": "system",
        "content": "Respond to the users question in a short, concise manner (one short sentence).",
    }
    user_message = {"role": "user", "content": question}

    completion = openai_client.chat.completions.create(
        model="gpt-4-turbo",
        temperature=0,
        messages=[system_message, user_message],
    )
    return completion.choices[0].message.content


def langsmith_app_1(inputs):
    """Adapt ``my_app_1`` to the dataset's key layout.

    Reads the question from ``inputs["question"]`` and returns the answer
    under the ``"output"`` key.
    """
    question = inputs["question"]
    return {"output": my_app_1(question)}

from langsmith.evaluation import evaluate

# Run the gpt-4-turbo application over the dataset and score every result.
experiment_results = evaluate(
    # Your AI system
    langsmith_app_1,
    # The data to predict and grade over
    data=dataset_name,
    # The evaluators to score the results
    evaluators=[evaluate_length, qa_evaluator],
    # A prefix for your experiment names to easily identify them
    experiment_prefix="openai-4",
)

View the evaluation results for experiment: 'openai-4-5e0210d8' at:
https://smith.langchain.com/o/36dd4cf9-88ca-5121-8019-46bec09db125/datasets/c9d53d0c-81f3-4e08-b803-d3f5ab1f8326/compare?selectedSessions=2e871dff-768e-4cac-a1a1-3f24c4f5bd03




5it [00:11,  2.28s/it]


In [14]:
## And now let's use GPT-4, but
# also update the prompt to be a bit more strict in requiring the answer to be short.

In [15]:
import openai

openai_client = openai.Client()

def my_app_2(question):
    """Answer ``question`` with gpt-4-turbo using the stricter ten-word system prompt.

    Returns the message content of the first choice in the response.
    """
    system_message = {
        "role": "system",
        "content": "Respond to the users question in a short, concise manner (one short sentence). Do NOT use more than ten words.",
    }
    user_message = {"role": "user", "content": question}

    completion = openai_client.chat.completions.create(
        model="gpt-4-turbo",
        temperature=0,
        messages=[system_message, user_message],
    )
    return completion.choices[0].message.content


def langsmith_app_2(inputs):
    """Adapt ``my_app_2`` to the dataset's key layout.

    Reads the question from ``inputs["question"]`` and returns the answer
    under the ``"output"`` key.
    """
    question = inputs["question"]
    return {"output": my_app_2(question)}

from langsmith.evaluation import evaluate

# Run the stricter-prompt gpt-4-turbo application over the dataset and score every result.
experiment_results = evaluate(
    # Your AI system
    langsmith_app_2,
    # The data to predict and grade over
    data=dataset_name,
    # The evaluators to score the results
    evaluators=[evaluate_length, qa_evaluator],
    # A prefix for your experiment names to easily identify them
    experiment_prefix="strict-openai-4",
)

View the evaluation results for experiment: 'strict-openai-4-ff0f0388' at:
https://smith.langchain.com/o/36dd4cf9-88ca-5121-8019-46bec09db125/datasets/c9d53d0c-81f3-4e08-b803-d3f5ab1f8326/compare?selectedSessions=e6266846-9da6-4b68-94e4-f3551423c07b




5it [00:06,  1.39s/it]


<h1>Set up automated testing to run in CI/CD</h1>
Now that we've run this in a one-off manner, we can set it to run in an automated fashion.
We can do this pretty easily by just including it as a pytest file that we run in CI/CD.
As part of this, we can either just log the results OR set up some criteria to determine 
if it passes or not. For example, if I wanted to ensure that we always got at least 80% of 
generated responses passing the length check, we could set that up with a test like:

In [16]:

def test_length_score() -> None:
    """Test that the length score is at least 80%.

    Runs the evaluation over the dataset, collects the "length" feedback
    attached to the experiment's runs, and asserts that the mean score is at
    least 0.8. Fails with an explicit message (rather than a
    ZeroDivisionError or TypeError) when no usable scores were recorded.
    """
    experiment_results = evaluate(
        langsmith_app, # Your AI system
        data=dataset_name, # The data to predict and grade over
        evaluators=[evaluate_length, qa_evaluator], # The evaluators to score the results
    )
    # This will be cleaned up in the next release:
    run_ids = [
        r.id for r in client.list_runs(project_name=experiment_results.experiment_name)
    ]
    # Guard against an empty id list so the feedback query below is always
    # scoped to this experiment's runs.
    assert run_ids, "No runs were recorded for the experiment"

    feedback = client.list_feedback(run_ids=run_ids, feedback_key="length")
    # Skip feedback entries without a numeric score so sum() cannot hit None.
    scores = [f.score for f in feedback if f.score is not None]
    assert scores, "No 'length' feedback scores were recorded for the experiment"
    assert sum(scores) / len(scores) >= 0.8, "Aggregate score should be at least .8"

