# Agent Quality with RAGAS

In [1]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
import couchbase.auth
import couchbase.cluster
import couchbase.options
import datasets
import dotenv
import os

# Pull settings from a local .env file (if any) into the process environment.
dotenv.load_dotenv()

# Fail fast, naming every missing setting. os.getenv() returns None for an unset
# variable, and passing None on to the authenticator / connect call would only surface
# later as a much less helpful SDK error.
missing_settings = [
    name
    for name in (
        "AGENT_CATALOG_CONN_STRING",
        "AGENT_CATALOG_USERNAME",
        "AGENT_CATALOG_PASSWORD",
    )
    if os.getenv(name) is None
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_settings)
        + ". Set them in the environment or in a .env file."
    )

# Credentials and connection string come exclusively from the environment.
conn_opts = couchbase.options.ClusterOptions(
    authenticator=couchbase.auth.PasswordAuthenticator(
        os.environ["AGENT_CATALOG_USERNAME"], os.environ["AGENT_CATALOG_PASSWORD"]
    )
)
cluster = couchbase.cluster.Cluster.connect(os.environ["AGENT_CATALOG_CONN_STRING"], conn_opts)

# Build the RAGAS evaluation set directly from the agent activity logs.
# We will treat the results of our tools as RAGAS context (we need to transpose our generations).
#
# Only raw_logs rows with a non-NULL `grouping` are considered. For every
# (grouping, session) group the statement below derives:
#   - question: content of the earliest (by timestamp) message whose kind is "human";
#   - answer:   content of the latest (by timestamp) message, falling back to its
#               tool_calls when content is absent (COALESCE);
#   - contexts: content (or tool_calls) of every other message in the group, i.e. all
#               messages whose row_num differs from both of the above.
# The outer SELECT then transposes those per-group rows into a single object holding three
# parallel arrays (question / answer / contexts), each ordered by the CTE's row_num.
#
# NOTE(review): both ROW_NUMBER() OVER () calls and the `context` subquery have no ORDER BY,
# so the message numbering and the order of entries inside `contexts` are not pinned by
# this statement -- confirm this is acceptable for any metric that is sensitive to
# context order.
query = cluster.analytics_query("""
    WITH LatestGenerations AS (
        FROM
            `travel-sample`.agent_activity.raw_logs AS s
        WHERE 
            s.`grouping` IS NOT NULL
        GROUP BY 
            s.`grouping`,
            s.session
            GROUP AS g
        LETTING
            msgs = (
                FROM 
                    g AS gi
                SELECT
                    gi.s.content.content,
                    gi.s.content.tool_calls,
                    gi.s.kind,
                    gi.s.timestamp,
                    ROW_NUMBER() OVER () AS row_num
            ),
            first_human = (
                FROM 
                    msgs AS msgsi
                WHERE 
                    msgsi.kind = "human"
                SELECT
                    msgsi.content,
                    msgsi.row_num
                ORDER BY
                    msgsi.timestamp ASC
                LIMIT 1
            )[0],
            last_message = (
                FROM
                    msgs AS msgsi
                SELECT 
                    COALESCE(msgsi.content, msgsi.tool_calls) AS content,
                    msgsi.row_num
                ORDER BY 
                    msgsi.timestamp DESC
                LIMIT 1
            )[0],
            context = (
                FROM 
                    msgs AS msgsi
                WHERE
                    msgsi.row_num != first_human.row_num AND 
                    msgsi.row_num != last_message.row_num
                SELECT VALUE
                   COALESCE(msgsi.content, msgsi.tool_calls)
            )
        SELECT
            first_human.content AS question,
            last_message.content AS answer,
            context AS contexts,
            ROW_NUMBER() OVER () AS row_num
    )
    SELECT
        (FROM LatestGenerations e SELECT VALUE e.question ORDER BY e.row_num) AS question,
        (FROM LatestGenerations e SELECT VALUE e.answer ORDER BY e.row_num) AS answer,
        (FROM LatestGenerations e SELECT VALUE e.contexts ORDER BY e.row_num) AS contexts
""")
# Take the first result row (the outer SELECT has no FROM clause, so a single row is
# expected) and hand its column arrays to Hugging Face `datasets`.
ragas_dataset = datasets.Dataset.from_dict(list(query)[0])
# Bare last expression: display the dataset summary as the cell output.
ragas_dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['answer', 'contexts', 'question'],
    num_rows: 19
})

In [3]:
import langchain_openai
import ragas.llms
import ragas.metrics

# Judge model: "gpt-4o" through LangChain, wrapped so that RAGAS can use it as its LLM.
chat_model = langchain_openai.ChatOpenAI(model="gpt-4o")
evaluator_llm = ragas.llms.LangchainLLMWrapper(chat_model)

# Score every row of the dataset on answer relevancy and context utilization,
# both judged by the evaluator LLM above.
ragas_metrics = [
    ragas.metrics.AnswerRelevancy(llm=evaluator_llm),
    ragas.metrics.ContextUtilization(llm=evaluator_llm),
]
score = ragas.evaluate(ragas_dataset, metrics=ragas_metrics)

# Show the per-row scores as a DataFrame.
score.to_pandas()



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness

For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._context_entities_recall import (
Evaluating: 100%|██████████| 38/38 [01:19<00:00,  2.10s/it]


Unnamed: 0,question,contexts,answer,answer_relevancy,context_utilization
0,(The following message is from the orchestrato...,[# Agent\n\nI am a workflow orchestrator. You ...,mark_task_5c238284_successful({'result': 'LAX'}),0.667974,0.0
1,(The following message is from the orchestrato...,[# Agent\n\nI am a workflow orchestrator. You ...,"talk_to_user({'message': ""Great! To help you p...",0.742294,0.833333
2,(The following message is from the orchestrato...,[# Agent\n\nI am a workflow orchestrator. You ...,find_direct_routes_between_airports({'argument...,0.700808,1.0
3,(The following message is from the orchestrato...,[# Agent\n\nI am a workflow orchestrator. You ...,"talk_to_user({'message': ""Would you like to co...",0.738183,1.0
4,(The following message is from the orchestrato...,[# Agent\n\nI am a workflow orchestrator. You ...,talk_to_user({'message': '# Your Travel Itiner...,0.711702,0.380482
5,(The following message is from the orchestrato...,[# Agent\n\nI am a workflow orchestrator. You ...,talk_to_user({'message': 'Based on your intere...,0.666787,0.1
6,(The following message is from the orchestrato...,[# Agent\n\nI am a workflow orchestrator. You ...,talk_to_user({'message': 'Hello! How can I ass...,0.0,1.0
7,(The following message is from the orchestrato...,[# Agent\n\nI am a workflow orchestrator. You ...,mark_task_cc3b0421_successful({'result': 'The ...,0.708468,0.0
8,(The following message is from the orchestrato...,[# Agent\n\nI am a workflow orchestrator. You ...,mark_task_cdd351a0_successful({'result': 'The ...,0.755455,0.026316
9,(The following message is from the orchestrato...,[# Agent\n\nI am a workflow orchestrator. You ...,check_if_airport_exists({'aita_code': 'LAX'}),0.688922,0.0


# Agent Quality around Walks

In [4]:
# Routing check: we want to confirm that our agent is sent to the right tasks,
# which we can do by inspecting the agent's walks returned by agent_activity.Walks().
walks_statement = """
    FROM
        `travel-sample`.agent_activity.Walks() t
    SELECT
        t;
"""
query = cluster.analytics_query(walks_statement)

# Materialize every result row so the cell displays them.
list(query)

[{'t': {'msgs': [{'content': {'extra': {'status': 'RUNNING'},
      'to_node': 'get_user_intent'},
     'kind': 'transition',
     'msg_num': 1,
     'timestamp': '2024-11-21T14:03:40.722499-08:00'},
    {'content': {'extra': {'status': 'SUCCESSFUL'},
      'from_node': 'get_user_intent'},
     'kind': 'transition',
     'msg_num': 10,
     'timestamp': '2024-11-21T14:03:54.443396-08:00'},
    {'content': {'extra': {'status': 'RUNNING'},
      'to_node': 'suggest_destination'},
     'kind': 'transition',
     'msg_num': 11,
     'timestamp': '2024-11-21T14:03:54.873974-08:00'},
    {'content': {'extra': {'status': 'SUCCESSFUL'},
      'from_node': 'suggest_destination'},
     'kind': 'transition',
     'msg_num': 30,
     'timestamp': '2024-11-21T14:07:07.620923-08:00'},
    {'content': {'extra': {'status': 'RUNNING'},
      'to_node': 'get_closest_airport'},
     'kind': 'transition',
     'msg_num': 31,
     'timestamp': '2024-11-21T14:07:08.285556-08:00'},
    {'content': {'extra': 

# Agent Quality around Tool Selection

In [5]:
# Tool-selection check: we want to assert that our agent picks the right tools for the
# right tasks. agent_activity.ToolCalls() is provided to make this inspection easier.
tool_calls_statement = """
    FROM
        `travel-sample`.agent_activity.ToolCalls() t
    SELECT
        t;
"""
query = cluster.analytics_query(tool_calls_statement)

# Materialize every result row so the cell displays them.
list(query)

[{'t': {'sid': 'df7895de19f746188ad44b685dd3110a',
   'tool_calls': [{'tool_args': {'message': 'Hello! How can I assist you today? Are you looking for help with travel rewards, trip planning, or do you have questions about our agency? If none of these apply, feel free to let me know what you need.'},
     'tool_call_id': 'call_lQ8u4ZzMEMsSjjXwWnfVvEGi',
     'tool_name': 'talk_to_user',
     'tool_result': '{"role":"human","content":"hi there, i want to plan a trip"}',
     'tool_status': 'success'},
    {'tool_args': {'message': 'Hello! How can I assist you today? Are you looking for help with travel rewards, trip planning, or do you have questions about our agency? If none of these apply, feel free to let me know what you need.'},
     'tool_call_id': 'call_lQ8u4ZzMEMsSjjXwWnfVvEGi',
     'tool_name': 'talk_to_user',
     'tool_result': '{"role":"human","content":"hi there, i want to plan a trip"}',
     'tool_status': 'success'},
    {'tool_args': {'result': 1},
     'tool_call_id':