In [0]:
%pip install -U -qqqq "git+https://github.com/mlflow/mlflow.git@master" "https://ml-team-public-read.s3.us-west-2.amazonaws.com/wheels/rag-studio/staging/databricks_agents-0.8.1.dev0-py3-none-any.whl" "https://ml-team-public-read.s3.us-west-2.amazonaws.com/wheels/managed-evals/staging/databricks_managed_evals-latest-py3-none-any.whl" databricks-vectorsearch databricks-sdk[openai]  -r requirements.txt
# The %pip line above installs/upgrades (-U) the notebook's dependencies with pip output
# silenced (-qqqq). Comments are kept below it so the magic stays the first line of the cell.
#   * mlflow                   - built from the `master` branch of the GitHub repository
#   * databricks_agents        - staging wheel 0.8.1.dev0, fetched by URL
#   * databricks_managed_evals - staging wheel tagged `latest`, fetched by URL
#   * databricks-vectorsearch, databricks-sdk[openai] - unpinned, from the package index
#   * everything listed in requirements.txt
# NOTE(review): `@master`, the `-latest` wheel and the unpinned packages mean two runs of this
# cell can install different code; pin a commit/version if reproducibility matters.
# Presumably restarts the Python process so the packages installed above are the ones that
# get imported by the following cells - confirm against the Databricks runtime in use.
%restart_python

In [0]:
import os

# Environment variables must be strings, hence "2000" rather than the integer 2000.
# NOTE(review): presumably read by the agent-evaluation library as its input-row cap - confirm.
os.environ.update({"RAG_EVAL_MAX_INPUT_ROWS": "2000"})

In [0]:
import os

import mlflow
from mlflow.metrics.genai import make_genai_metric_from_prompt
from mlflow.models.resources import (
    DatabricksVectorSearchIndex,
    DatabricksServingEndpoint,
    DatabricksGenieSpace,
    DatabricksFunction
)
from mlflow.types.llm import CHAT_MODEL_INPUT_SCHEMA
from mlflow.models.signature import ModelSignature, Schema
from mlflow.models.rag_signatures import StringResponse



## Generate synthetic data

In [0]:
# Load the product documentation table that the synthetic questions are generated from.
product_docs = spark.table("agents_demo.default.product_docs")

# Rename the text column to `content` and the id column to `doc_uri`.
# NOTE(review): these look like the column names `generate_evals_df` expects for its
# `docs` argument - confirm against the databricks-agents documentation.
docs_with_content = product_docs.withColumnRenamed("indexed_doc", "content")
docs_df = docs_with_content.withColumnRenamed("product_id", "doc_uri")

display(docs_df)

In [0]:
from pyspark.sql.functions import col, to_json, struct, expr, lit
from databricks.agents.eval import generate_evals_df

# Optional free-text guidance handed to the question generator. The text is passed through
# verbatim, so edit it to steer the topics, style and personas of the generated questions.
guidelines = """
# Task Description
You are generating an evaluation dataset which will be used to test a customer analytics chatbot on its ability to answer business user's questions about our product catalog.

# Content Guidelines
- Address scenarios that customer support agents may face while helping customers understand our products.
- Address scenarios that business analysts, who aren't familar with all of our products, may have

# Example questions
- how to troubleshoot <some issue>?
- how many colors of <product> are there?
- what's the best product for <use case>?
- did we change <feature> recently?

# Style Guidelines
- Questions should be succinct, and human-like.

# Personas
- A business analyst
- A customer support agent
"""

# Keep the demo small: only 10 rows of `docs_df` are used, with one question requested per
# document. `limit` has no explicit ordering here, so which 10 rows are picked is up to Spark.
sampled_docs = docs_df.limit(10)
questions_per_doc = 1

synthetic_eval_data = generate_evals_df(
    docs=sampled_docs,
    num_questions_per_doc=questions_per_doc,
    guidelines=guidelines,
)

# Show the generated questions for a quick sanity check.
display(synthetic_eval_data)

In [0]:
import managed_evals

# Fully qualified name of the Delta table that backs the managed evaluation set.
managed_eval_delta_table = "agents_demo.default.managed_evaluation_set"

# Append the synthetic questions generated earlier in this notebook to the managed evaluation set.
# NOTE(review): in notebook order this runs before the `create_evals_table` cell further
# down; on a fresh workspace the table may not exist yet - confirm the intended cell order.
managed_evals.add_evals(
    evals_table_name=managed_eval_delta_table,
    evals=synthetic_eval_data,
)

In [0]:
# Look up the link for the evaluation set stored in `managed_eval_delta_table`.
sme_ui_link = managed_evals.get_evals_link(evals_table_name=managed_eval_delta_table)

# Render a button that opens the `/review` page of that link in a new browser tab.
review_button_html = f'<a href="{sme_ui_link}/review" target="_blank"><button style="color: white; background-color: #0073e6; padding: 10px 24px; cursor: pointer; border: none; border-radius: 4px;">SME Evaluation Set Review UI</button></a>'
displayHTML(review_button_html)

In [0]:
import managed_evals

# Delta table where the managed evaluation set is stored (same value as used by the
# earlier cells; repeated here so this cell can be run on its own).
managed_eval_delta_table = "agents_demo.default.managed_evaluation_set"

# Serving endpoint of the deployed agent. Its generations are used to improve the
# SME-facing UX for reviewing the evaluation set.
agent_endpoint_name = "agents_agents_demo-default-product_docs_agent"

# Create the managed evaluation set backend.
# NOTE(review): the `add_evals` and `get_evals_link` cells appear above this one; if the
# table must exist before they run, this cell has to be executed first - confirm.
managed_evals.create_evals_table(
    evals_table_name=managed_eval_delta_table,
    model_serving_endpoint_name=agent_endpoint_name,
    # Note: the mode parameter will be removed in future versions and replaced with a single mode.
    eval_mode="grading_notes",
)

In [0]:
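# Cleanup (intentionally left disabled): uncomment the line below to delete the managed
# evaluation set table named by `managed_eval_delta_table`.
# managed_evals.delete_evals_table(evals_table_name=managed_eval_delta_table)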
