# Agent Quality with RAGAS

In [2]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

In [4]:
import couchbase.auth
import couchbase.cluster
import couchbase.options
import datasets
import dotenv
import os

# Let dotenv populate the process environment before the settings below are read.
dotenv.load_dotenv()

# Fail fast with a readable message when a setting is absent. os.getenv() would hand back None,
# which otherwise only surfaces later as an opaque connection error (or as a bucket literally
# named "None" inside the analytics query).
_required_settings = (
    "AGENT_CATALOG_CONN_STRING",
    "AGENT_CATALOG_USERNAME",
    "AGENT_CATALOG_PASSWORD",
    "AGENT_CATALOG_BUCKET",
)
_missing_settings = [name for name in _required_settings if os.getenv(name) is None]
if _missing_settings:
    raise RuntimeError("Missing required environment variable(s): " + ", ".join(_missing_settings))

# Authenticate with username/password and open the cluster connection used by the query cell.
conn_opts = couchbase.options.ClusterOptions(
    authenticator=couchbase.auth.PasswordAuthenticator(
        os.environ["AGENT_CATALOG_USERNAME"], os.environ["AGENT_CATALOG_PASSWORD"]
    )
)
cluster = couchbase.cluster.Cluster.connect(os.environ["AGENT_CATALOG_CONN_STRING"], conn_opts)

# Bucket whose agent_activity scope is queried below.
bucket_name = os.environ["AGENT_CATALOG_BUCKET"]

# The tool results recorded in each exchange's walk act as the RAGAS contexts. The query first
# collects question / answer / contexts per exchange, then transposes those per-exchange records
# into three arrays (one per column) ordered by the same row number.
# NOTE(review): ROW_NUMBER() OVER () carries no ORDER BY -- confirm the numbering is stable across
# the three subqueries so that the columns stay aligned.
exchanges_by_column_statement = f"""
    WITH LatestExchanges AS (
        FROM
            `{bucket_name}`.agent_activity.Exchanges() AS e
        LETTING
            question = e.question.content,
            answer = e.answer.content,
            contexts = (
                FROM
                    e.walk AS w
                WHERE
                    w.kind = "tool"
                SELECT VALUE
                    w.content.content
            )
        WHERE
            IS_STRING(question) AND
            IS_STRING(answer) AND
            LEN(contexts) > 0
        SELECT
            question,
            answer,
            contexts,
            ROW_NUMBER() OVER () AS row_num
    )
    SELECT
        (FROM LatestExchanges e SELECT VALUE e.question ORDER BY e.row_num) AS question,
        (FROM LatestExchanges e SELECT VALUE e.answer ORDER BY e.row_num) AS answer,
        (FROM LatestExchanges e SELECT VALUE e.contexts ORDER BY e.row_num) AS contexts
"""
query = cluster.analytics_query(exchanges_by_column_statement)

# Only the first result row is used: it holds the three column arrays keyed by column name.
exchanges_by_column = list(query)[0]
ragas_dataset = datasets.Dataset.from_dict(exchanges_by_column)
ragas_dataset

Dataset({
    features: ['answer', 'contexts', 'question'],
    num_rows: 1
})

In [4]:
import langchain_openai
import ragas.llms
import ragas.metrics

# A single GPT-4o chat model, wrapped for RAGAS, acts as the judge for every metric.
chat_model = langchain_openai.ChatOpenAI(model="gpt-4o")
evaluator_llm = ragas.llms.LangchainLLMWrapper(chat_model)

# Metrics to compute over the dataset built from the logged exchanges.
evaluation_metrics = [
    ragas.metrics.Faithfulness(llm=evaluator_llm),
    ragas.metrics.AnswerRelevancy(llm=evaluator_llm),
    ragas.metrics.ContextUtilization(llm=evaluator_llm),
]

score = ragas.evaluate(ragas_dataset, metrics=evaluation_metrics)

# Show the per-row metric values as a DataFrame.
score.to_pandas()

Evaluating: 100%|██████████| 3/3 [00:09<00:00,  3.27s/it]


Unnamed: 0,question,contexts,answer,faithfulness,answer_relevancy,context_utilization
0,i like beaches,"[{""role"":""human"",""content"":""i like beaches""}, ...","Based on your interest in beaches, here are so...",1.0,0.884883,0.5


# Agent Quality around Walks

In [4]:
# TODO: Assert that our agent is routed to the right tasks.

# Agent Quality around Tool Selection

In [5]:
# TODO: Assert that our agent selects the right tools for the right tasks.