# Agent Quality with RAGAS

In [8]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

In [9]:
import couchbase.auth
import couchbase.cluster
import couchbase.options
import datasets
import dotenv
import os

# Pull settings from a local .env file (if any) into the process environment.
dotenv.load_dotenv()


def _require_env(name):
    """Return the value of the environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a clear
            message instead of passing ``None`` to the Couchbase SDK or interpolating
            the text "None" into the analytics query below.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; define it in your environment or .env file."
        )
    return value


# Connect to the cluster that holds the agent activity logs.
conn_opts = couchbase.options.ClusterOptions(
    authenticator=couchbase.auth.PasswordAuthenticator(
        _require_env("AGENT_CATALOG_USERNAME"), _require_env("AGENT_CATALOG_PASSWORD")
    )
)
cluster = couchbase.cluster.Cluster.connect(_require_env("AGENT_CATALOG_CONN_STRING"), conn_opts)
bucket_name = _require_env("AGENT_CATALOG_BUCKET")

# We will treat the results of our tools as RAGAS context (we need to transpose our exchanges).
# The inner query keeps only exchanges from the last session that have a string question, a
# string answer and at least one tool result; the outer SELECT turns those rows into three
# parallel arrays (question / answer / contexts) ordered by row_num.
# NOTE(review): ROW_NUMBER() OVER () has no ORDER BY, so the numbering follows whatever order
# the engine produces; the three subqueries rely on it being the same each time -- confirm.
query = cluster.analytics_query(f"""
    WITH LatestExchanges AS (
        FROM
            `{bucket_name}`.agent_activity.Exchanges AS e
        LETTING
            question = e.question.content,
            answer = e.answer.content,
            contexts = (
                FROM
                    e.walk AS w
                WHERE
                    w.kind = "tool"
                SELECT VALUE
                    w.content.content
            )
        WHERE
            e.sid = `{bucket_name}`.agent_activity.LastSession() AND 
            IS_STRING(question) AND
            IS_STRING(answer) AND
            LEN(contexts) > 0
        SELECT
            question,
            answer,
            contexts,
            ROW_NUMBER() OVER () AS row_num
    )
    SELECT
        (FROM LatestExchanges e SELECT VALUE e.question ORDER BY e.row_num) AS question,
        (FROM LatestExchanges e SELECT VALUE e.answer ORDER BY e.row_num) AS answer,
        (FROM LatestExchanges e SELECT VALUE e.contexts ORDER BY e.row_num) AS contexts
""")
# The first result row is the column-oriented dict that Dataset.from_dict() receives.
ragas_dataset = datasets.Dataset.from_dict(list(query)[0])
ragas_dataset

Dataset({
    features: ['answer', 'contexts', 'question'],
    num_rows: 1
})

In [10]:
import ragas.metrics

# Score the dataset on three RAGAS metrics: faithfulness, answer relevancy and
# context utilization. Each metric is initialised with a default run configuration
# before being handed to the evaluator.
run_config = ragas.RunConfig()
ragas_metrics = [
    ragas.metrics.faithfulness,
    ragas.metrics.answer_relevancy,
    ragas.metrics.context_utilization,
]
for ragas_metric in ragas_metrics:
    ragas_metric.init(run_config)

score = ragas.evaluate(ragas_dataset, metrics=ragas_metrics)
# Show the per-row scores as a DataFrame (rich display as the cell's last expression).
score.to_pandas()

Evaluating: 100%|██████████| 3/3 [00:15<00:00,  5.14s/it]


Unnamed: 0,question,contexts,answer,faithfulness,answer_relevancy,context_utilization
0,i like beaches,"[{""role"":""human"",""content"":""i like beaches""}, ...","Based on your interest in beaches, here are so...",0.894737,0.865005,1.0


# Agent Quality around Walks

In [None]:
# TODO: Assert that our agent is routed to the right tasks (not yet implemented).



# Agent Quality around Tool Selection

In [None]:
# TODO: Assert that our agent selects the right tools for the right tasks (not yet implemented).