In [13]:
import os

# LangSmith tracing configuration. These values are not secrets, so they are
# set unconditionally for this notebook session.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = "https://api.smith.langchain.com"
os.environ['LANGCHAIN_PROJECT'] = "agent-test"

# API keys are never hardcoded here; they must already be present in the
# environment that launched the kernel. The examples below use the OpenAI API,
# so OPENAI_API_KEY is needed in addition to LANGCHAIN_API_KEY.
# Re-assigning `os.environ[k] = os.getenv(k)` is a no-op when the key is set and
# raises TypeError (environment values must be str, not None) when it is not,
# so missing keys are reported explicitly instead.
for _required_key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY'):
    if not os.environ.get(_required_key):
        print(
            f"WARNING: {_required_key} is not set; "
            "export it before running the cells below."
        )

In [14]:
import uuid

from langsmith import Client

# Client is constructed with no arguments, so credentials and endpoint come
# from its defaults — presumably the LANGCHAIN_* environment variables
# configured in the first cell (TODO confirm against the langsmith docs).
client = Client()

In [15]:
# Evaluation cases, one row each:
#   (question, reference answer, tool names the agent is expected to call, in order)
# NOTE(review): question strings are dataset inputs and are kept verbatim,
# including the "was was" / "wresting" typos — fix deliberately if desired.
_cases = (
    (
        "Why was was a $10 calculator app one of the best-rated Nintendo Switch games?",
        "It became an internet meme due to its high price point.",
        ["duck_duck_go"],
    ),
    (
        "hi",
        "Hello, how can I assist you?",
        [],  # Expect a direct response
    ),
    (
        "Who is Dejan Trajkov?",
        "Macedonian Professor, Immunologist and Physician",
        ["duck_duck_go"],
    ),
    (
        "Who won the 2023 U23 world wresting champs (men's freestyle 92 kg)",
        "Muhammed Gimri from turkey",
        ["duck_duck_go"],
    ),
    (
        "What's my first meeting on Friday?",
        'Your first meeting is 8:30 AM for "Team Standup"',
        ["check_calendar"],  # Only expect calendar tool
    ),
)

# `questions` is a list of (question, outputs) pairs; each outputs dict carries
# the "reference" answer and the "expected_steps" tool trajectory.
questions = [
    (prompt_text, {"reference": answer, "expected_steps": steps})
    for prompt_text, answer, steps in _cases
]

In [16]:
# A fresh UUID suffix gives every execution of this cell its own dataset name.
uid = uuid.uuid4()
dataset_name = f"Agent Eval Example {uid}"
ds = client.create_dataset(
    description="An example agent evals dataset using search and calendar checks.",
    dataset_name=dataset_name,
)

# Split the (question, outputs) pairs into the two parallel lists that
# create_examples takes: one of input dicts, one of output dicts.
example_inputs = [{"question": question} for question, _ in questions]
example_outputs = [outputs for _, outputs in questions]
client.create_examples(
    inputs=example_inputs,
    outputs=example_outputs,
    dataset_id=ds.id,
)

In [23]:
from dateutil.parser import parse

from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import DuckDuckGoSearchResults, tool
from langchain.tools.render import format_tool_to_openai_function


@tool
def check_calendar(date: str) -> list:
    """Check the user's calendar for meetings on the specified datetime (in iso format)."""
    # NOTE(review): the docstring is deliberately a single line — @tool
    # presumably exposes it to the model as the tool description (confirm
    # against the langchain docs), so the details live in comments instead.
    #
    # date: date/datetime string supplied by the model; anything
    #       dateutil.parser.parse accepts, ideally ISO 8601.
    # Returns a list of strings: the mocked Friday agenda, a single
    # "Focus time" entry for any other weekday, or a single error message
    # when the date string cannot be parsed.
    try:
        date_time = parse(date)
    except (ValueError, OverflowError):
        # dateutil signals bad input with ParserError (a ValueError subclass)
        # or OverflowError. Hand the problem back to the model as a tool
        # result instead of letting the exception abort the whole agent run.
        return [
            f"Could not parse date {date!r}; "
            "please provide an ISO 8601 date such as 2023-11-17."
        ]
    # A placeholder to demonstrate with multiple tools.
    # It's easy to mock tools when testing.
    friday = 4  # datetime.weekday(): Monday is 0, so Friday is 4
    if date_time.weekday() == friday:
        return [
            "8:30 : Team Standup",
            "9:00 : 1 on 1",
            "9:45 design review",
        ]
    return ["Focus time"]  # If only...


def agent_factory():
    """Build a new OpenAI-functions agent executor with search and calendar tools.

    Everything (model, tools, prompt, executor) is constructed inside the
    function, so each call returns an independent executor.

    The agent reads the user's text from the ``"question"`` input key and the
    executor is configured to include ``intermediate_steps`` in its output.

    Returns:
        AgentExecutor: executor wrapping the runnable agent and its tools.
    """
    llm = ChatOpenAI(
        model="gpt-3.5-turbo-16k",
        temperature=0,
    )
    tools = [
        DuckDuckGoSearchResults(
            name="duck_duck_go"
        ),  # General internet search using DuckDuckGo
        check_calendar,
    ]
    # Advertise the tools to the model as OpenAI function definitions.
    llm_with_tools = llm.bind(
        functions=[format_tool_to_openai_function(t) for t in tools]
    )
    # Message order matters: the scratchpad holds the function-call /
    # function-result messages from earlier steps, so it must come AFTER the
    # user message those steps respond to. Placing it before the question
    # makes the question the last message on every iteration, with the tool
    # results appearing to precede it.
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a helpful assistant."),
            ("user", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ]
    )

    runnable_agent = (
        {
            # Map the executor's input dict onto the prompt variables.
            "input": lambda x: x["question"],
            "agent_scratchpad": lambda x: format_to_openai_functions(
                x["intermediate_steps"]
            ),
        }
        | prompt
        | llm_with_tools
        | OpenAIFunctionsAgentOutputParser()
    )

    return AgentExecutor(
        agent=runnable_agent,
        tools=tools,
        handle_parsing_errors=True,
        # Expose the (action, observation) steps in the run output.
        return_intermediate_steps=True,
    )

In [18]:
from typing import Optional

from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run

class AgentTrajectoryEvaluator(RunEvaluator):
    """Scores whether the agent called exactly the expected tools, in order."""

    def evaluate_run(
            self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """Compare the run's tool trajectory with the example's expected steps.

        Args:
            run: The traced run; its outputs must contain "intermediate_steps",
                a sequence of (action, observation) pairs where each action
                has a ``tool`` attribute.
            example: The dataset example; its outputs must contain
                "expected_steps", the expected list of tool names.

        Returns:
            EvaluationResult keyed "Intermediate steps correctness" with
            score 1 when the two lists are equal, otherwise 0.

        Raises:
            ValueError: If the run has no outputs, or if no example (or an
                example without outputs) is supplied.
        """
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None")
        # `example` is Optional in the signature, so guard before dereferencing
        # it rather than failing with an opaque AttributeError/TypeError.
        if example is None or example.outputs is None:
            raise ValueError(
                "An example with outputs is required to evaluate the trajectory"
            )
        intermediate_steps = run.outputs["intermediate_steps"]
        # Keep only the tool name of each step; observations are ignored.
        trajectory = [action.tool for action, _ in intermediate_steps]
        expected_trajectory = example.outputs["expected_steps"]
        # Exact list equality: same tools, same order, same count.
        score = int(trajectory == expected_trajectory)
        return EvaluationResult(key="Intermediate steps correctness", score=score)

In [25]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

# Evaluation setup: one built-in evaluator plus the custom trajectory check.
evaluation_config = RunEvalConfig(
    # The dataset outputs hold more than one key ("reference" and
    # "expected_steps"), so the key holding the reference answer must be named.
    reference_key="reference",
    # Built-in evaluators: each entry is either an evaluator type
    # (e.g., "qa", "criteria", "embedding_distance") or a configuration
    # object for that evaluator.
    evaluators=[
        # Grades whether the response is "Correct" relative to the reference
        # answer; the raw string "qa" selects the same evaluator.
        EvaluatorType.QA,
    ],
    # Custom StringEvaluator / RunEvaluator instances are applied to every
    # prediction automatically; see the docs for more examples.
    custom_evaluators=[AgentTrajectoryEvaluator()],
)

# Run the agent over every example in the dataset; agent_factory is passed
# uncalled so the runner can construct the agent itself.
chain_results = client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=agent_factory,
    evaluation=evaluation_config,
    tags=["agent-eval-example"],
    verbose=True,
)


View the evaluation results for project 'new-sign-82' at:
https://smith.langchain.com/o/ca43bddc-1333-5cd7-840e-ed9a34e1ed28/datasets/4f3248c5-fb75-460e-9ebd-2ca2a07ddd93/compare?selectedSessions=e93eda5b-c69e-4b21-9c0e-d54cee2e11eb

View all tests for Dataset Agent Eval Example 7f387639-f6a0-498a-aa8c-b1ab11b9f9e9 at:
https://smith.langchain.com/o/ca43bddc-1333-5cd7-840e-ed9a34e1ed28/datasets/4f3248c5-fb75-460e-9ebd-2ca2a07ddd93
[>                                                 ] 0/5

Task was destroyed but it is pending!
task: <Task pending name='Task-24' coro=<AsyncDDGS.__aexit__() running at C:\Users\THE BARON\AppData\Roaming\Python\Python311\site-packages\duckduckgo_search\duckduckgo_search_async.py:46>>
  self._ready.clear()
Task was destroyed but it is pending!
task: <Task pending name='Task-6' coro=<AsyncCurl._force_timeout() running at C:\Users\THE BARON\AppData\Roaming\Python\Python311\site-packages\curl_cffi\aio.py:168> wait_for=<Future pending cb=[Task.__wakeup()]>>
Task was destroyed but it is pending!
task: <Task pending name='Task-19' coro=<AsyncDDGS.__aexit__() running at C:\Users\THE BARON\AppData\Roaming\Python\Python311\site-packages\duckduckgo_search\duckduckgo_search_async.py:46>>
  self._ready.clear()
Task was destroyed but it is pending!
task: <Task pending name='Task-29' coro=<AsyncDDGS.__aexit__() running at C:\Users\THE BARON\AppData\Roaming\Python\Python311\site-packages\duckduckgo_search\duckduckgo_search_async.py:46>>
Task was destroyed b

[------------------------------------------------->] 5/5

Unnamed: 0,feedback.correctness,feedback.Intermediate steps correctness,error,execution_time,run_id
count,5.0,5.0,0.0,5.0,5
unique,,,0.0,,5
top,,,,,20b4af5f-37b4-4b0f-950f-e5e47e698dc0
freq,,,,,1
mean,0.0,1.0,,2.67882,
std,0.0,0.0,,2.077773,
min,0.0,1.0,,0.67466,
25%,0.0,1.0,,1.204033,
50%,0.0,1.0,,2.688272,
75%,0.0,1.0,,2.823264,
