#### complaint agent

Builds and ships an order-complaint agent: author tools, assemble the LangGraph workflow, evaluate it, and promote the packaged model into production.


#### Tool & View Registration

- `CREATE SCHEMA` guarantees the shared `${CATALOG}.ai` workspace exists for agent assets.
- `order_delivery_times_per_location_view` summarizes delivery-time percentiles (P50/P75/P99, in minutes) per location, computed over events from the last day.
- `get_order_overview(oid)` returns structured order metadata, items, and customer info.
- `get_order_timing(oid)` exposes created/delivered timestamps plus transit duration.
- `get_location_timings(loc)` returns the P50/P75/P99 delivery times for a location, used as the benchmark for delivery-delay complaints.


In [None]:
%sql
-- Schema that holds the view and the SQL functions created in the cells below.
-- IF NOT EXISTS keeps this cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS ${CATALOG}.ai;

In [None]:
%sql
-- Delivery-time percentiles (in minutes) per location, computed from events
-- whose timestamp falls within the last day.
CREATE OR REPLACE VIEW ${CATALOG}.ai.order_delivery_times_per_location_view AS
WITH recent_order_milestones AS (
  -- One row per (order, location) with the latest created / delivered timestamps seen
  SELECT
    order_id,
    location,
    MAX(try_to_timestamp(ts)) FILTER (WHERE event_type = 'order_created') AS order_created_time,
    MAX(try_to_timestamp(ts)) FILTER (WHERE event_type = 'delivered') AS delivered_time
  FROM ${CATALOG}.lakeflow.all_events
  WHERE try_to_timestamp(ts) >= CURRENT_TIMESTAMP() - INTERVAL 1 DAY
  GROUP BY order_id, location
),
completed_order_durations AS (
  -- Only orders with both milestones contribute a duration
  SELECT
    order_id,
    location,
    (UNIX_TIMESTAMP(delivered_time) - UNIX_TIMESTAMP(order_created_time)) / 60 AS total_order_time_minutes
  FROM recent_order_milestones
  WHERE order_created_time IS NOT NULL
    AND delivered_time IS NOT NULL
)
SELECT
  location,
  PERCENTILE(total_order_time_minutes, 0.50) AS P50,
  PERCENTILE(total_order_time_minutes, 0.75) AS P75,
  PERCENTILE(total_order_time_minutes, 0.99) AS P99
FROM completed_order_durations
GROUP BY location

In [None]:
%sql
-- Agent tool: returns at most one row describing the order identified by oid,
-- built from its 'order_created' event (items JSON, customer address, brand, creation time).
-- Returns no rows when the order has no 'order_created' event.
CREATE OR REPLACE FUNCTION ${CATALOG}.ai.get_order_overview(oid STRING COMMENT 'The unique order identifier to retrieve information for')
RETURNS TABLE (
  order_id STRING COMMENT 'The order id',
  location STRING COMMENT 'Order location',
  items_json STRING COMMENT 'JSON array of ordered items with details',
  customer_address STRING COMMENT 'Customer delivery address',
  brand_id BIGINT COMMENT 'Brand ID for the order',
  order_created_ts TIMESTAMP COMMENT 'When the order was created'
)
COMMENT 'Returns basic order information including items, location, and customer details'
RETURN
  WITH order_created_events AS (
    SELECT
      order_id,
      location,
      get_json_object(body, '$.items') as items_json,
      get_json_object(body, '$.customer_addr') as customer_address,
      -- Extract brand_id from first item in the order
      CAST(get_json_object(get_json_object(body, '$.items[0]'), '$.brand_id') AS BIGINT) as brand_id,
      try_to_timestamp(ts) as order_created_ts
    FROM ${CATALOG}.lakeflow.all_events
    WHERE order_id = oid AND event_type = 'order_created'
    -- LIMIT without ORDER BY would return an arbitrary event when an order has several
    -- 'order_created' rows. Take the earliest one, which matches the MIN() used by
    -- get_order_timing, and push unparseable timestamps to the end.
    ORDER BY try_to_timestamp(ts) ASC NULLS LAST
    LIMIT 1
  )
  SELECT
    order_id,
    location,
    items_json,
    customer_address,
    brand_id,
    order_created_ts
  FROM order_created_events;

In [None]:
%sql
-- Agent tool: creation / delivery timestamps, transit time in minutes and a derived
-- status for the order identified by oid. Returns no rows when the order has no events.
CREATE OR REPLACE FUNCTION ${CATALOG}.ai.get_order_timing(oid STRING COMMENT 'The unique order identifier to get timing information for')
RETURNS TABLE (
  order_id STRING COMMENT 'The order id',
  order_created_ts TIMESTAMP COMMENT 'When the order was created',
  delivered_ts TIMESTAMP COMMENT 'When the order was delivered (NULL if not delivered)',
  delivery_duration_minutes FLOAT COMMENT 'Time from order creation to delivery in minutes (NULL if not delivered)',
  delivery_status STRING COMMENT 'Current delivery status: delivered, in_progress, or unknown'
)
COMMENT 'Returns timing information for a specific order'
RETURN
  WITH order_milestones AS (
    -- Earliest creation and latest delivery timestamp among the order's events
    SELECT
      order_id,
      MIN(try_to_timestamp(ts)) FILTER (WHERE event_type = 'order_created') AS order_created_ts,
      MAX(try_to_timestamp(ts)) FILTER (WHERE event_type = 'delivered') AS delivered_ts
    FROM ${CATALOG}.lakeflow.all_events
    WHERE order_id = oid
    GROUP BY order_id
  )
  SELECT
    order_id,
    order_created_ts,
    delivered_ts,
    -- Duration is only defined once both milestones are known
    CASE
      WHEN order_created_ts IS NOT NULL AND delivered_ts IS NOT NULL
        THEN CAST((UNIX_TIMESTAMP(delivered_ts) - UNIX_TIMESTAMP(order_created_ts)) / 60 AS FLOAT)
    END AS delivery_duration_minutes,
    CASE
      WHEN delivered_ts IS NOT NULL THEN 'delivered'
      WHEN order_created_ts IS NOT NULL THEN 'in_progress'
      ELSE 'unknown'
    END AS delivery_status
  FROM order_milestones;

In [None]:
%sql
-- Agent tool: looks up the delivery-time percentiles of one location in
-- order_delivery_times_per_location_view.
CREATE OR REPLACE FUNCTION ${CATALOG}.ai.get_location_timings(loc STRING COMMENT 'Location name as a string')
RETURNS TABLE (
  location STRING COMMENT 'Location of the order source',
  P50 FLOAT COMMENT '50th percentile delivery time in minutes',
  P75 FLOAT COMMENT '75th percentile delivery time in minutes',
  P99 FLOAT COMMENT '99th percentile delivery time in minutes'
)
COMMENT 'Returns the 50/75/99th percentile of delivery times for a location to benchmark order timing'
RETURN
  SELECT
    location_stats.location,
    location_stats.P50,
    location_stats.P75,
    location_stats.P99
  FROM ${CATALOG}.ai.order_delivery_times_per_location_view AS location_stats
  WHERE location_stats.location = loc;

#### Model

- Install LangGraph, Databricks agent packages, and restart Python for a clean runtime.
- Capture widget inputs (`CATALOG`, `LLM_MODEL`) and create an MLflow dev experiment for trace logging.
- Define a templated `%%writefilev` magic that emits files with notebook variable substitution.
- Materialize `agent.py` containing the LangGraph complaint workflow wired to UC SQL tools and the chosen LLM endpoint.
- Pull a delivered `order_id` sample and build the MLflow input example and resource list for logging.


In [None]:
%pip install -U -qqqq mlflow-skinny[databricks] langgraph==0.3.4 databricks-langchain databricks-agents uv
dbutils.library.restartPython()

In [None]:
# Notebook parameters read from widgets:
#   CATALOG   - Unity Catalog name used for every `<catalog>.ai` / `<catalog>.lakeflow` reference below
#   LLM_MODEL - name of the serving endpoint the agent's chat model is bound to
CATALOG = dbutils.widgets.get("CATALOG")
LLM_MODEL = dbutils.widgets.get("LLM_MODEL")

In [None]:
import mlflow

# Create/set dev experiment for development and evaluation traces
# Use shared path for job compatibility
dev_experiment_name = f"/Shared/{CATALOG}_complaint_agent_dev"

# set_experiment creates the experiment if it doesn't exist, or activates it if it does
dev_experiment = mlflow.set_experiment(dev_experiment_name)
dev_experiment_id = dev_experiment.experiment_id
print(f"✅ Using dev experiment: {dev_experiment_name} (ID: {dev_experiment_id})")

# Add experiment to UC state for cleanup
# `uc_state` is a project helper located in ../utils, so that directory must be put on
# sys.path before the import below can succeed.
import sys
sys.path.append('../utils')
from uc_state import add

# Record the experiment under the "experiments" key of this catalog's state
experiment_data = {
    "experiment_id": dev_experiment_id,
    "name": dev_experiment_name
}
add(CATALOG, "experiments", experiment_data)
print(f"✅ Added dev experiment to UC state")

In [None]:
import re
from IPython.core.magic import register_cell_magic

@register_cell_magic
def writefilev(line, cell):
    """
    %%writefilev file.py

    Write the cell body to ``file.py`` after replacing every ``{{expr}}`` with
    ``str(<value of expr>)``. Single braces ``{}`` are left intact, so f-strings
    and dict literals in the cell survive unchanged.

    Args:
        line: text after the magic name; stripped and used as the target path.
        cell: cell body to template and write.

    Raises:
        ValueError: if no filename is given.
        Exception: whatever evaluating an ``{{expr}}`` raises (e.g. NameError
            for an undefined variable).
    """
    filename = line.strip()
    if not filename:
        raise ValueError("%%writefilev requires a target filename, e.g. `%%writefilev agent.py`")

    def replacer(match):
        expr = match.group(1).strip()
        # SECURITY: eval() runs arbitrary code; only use this magic on cells you wrote yourself.
        # Evaluate against module globals only, so this helper's own locals (`match`, `expr`)
        # can never shadow a notebook variable of the same name.
        return str(eval(expr, globals()))

    # Replace only double braces {{var}}; the pattern is non-greedy and single-line
    content = re.sub(r"\{\{(.*?)\}\}", replacer, cell)

    # Explicit encoding so the generated file does not depend on the platform default
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Wrote file with substitutions: {filename}")

In [None]:
%%writefilev agent.py
import json
from typing import Annotated, Any, Generator, Optional, Sequence, TypedDict, Union, Literal, cast
from uuid import uuid4

import mlflow
from databricks_langchain import (
    ChatDatabricks,
    DatabricksFunctionClient,
    UCFunctionToolkit,
    set_uc_function_client,
)
from langchain_core.language_models import LanguageModelLike
from langchain_core.messages import (
    AIMessage,
    AIMessageChunk,
    BaseMessage,
)
from langchain_core.messages.utils import convert_to_messages
from langchain_core.runnables import RunnableConfig, RunnableLambda
from langchain_core.tools import BaseTool
from langgraph.graph import END, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt.tool_node import ToolNode
from mlflow.pyfunc import ResponsesAgent
from mlflow.types.responses import (
    ResponsesAgentRequest,
    ResponsesAgentResponse,
    ResponsesAgentStreamEvent,
)

# Enable MLflow autologging for the LangChain calls made by this module
mlflow.langchain.autolog()

# Both values are substituted into this file by the %%writefilev cell magic,
# using the notebook variables of the same names.
LLM_MODEL = "{{LLM_MODEL}}"
CATALOG = "{{CATALOG}}"

# Create a Unity Catalog function client and register it via set_uc_function_client
# before the UC tools are built further down.
client = DatabricksFunctionClient()
set_uc_function_client(client)

# Structured triage result produced by the agent. It is passed to with_structured_output()
# and checked by parse_structured_response().
class ComplaintResponse(TypedDict):
    # Order the complaint refers to
    order_id: str
    complaint_category: Literal["delivery_delay", "missing_items", "food_quality", "service_issue", "billing", "other"]
    decision: Literal["suggest_credit", "escalate"]
    # Required for decision == "suggest_credit"; forced to None for "escalate"
    credit_amount: Optional[float]
    # Required for decision == "suggest_credit"; forced to None for "escalate"
    confidence: Optional[Literal["high", "medium", "low"]]
    # Required for decision == "escalate"; forced to None for "suggest_credit"
    priority: Optional[Literal["standard", "urgent"]]
    # Justification text for the decision
    rationale: str

# State carried through the LangGraph workflow.
class AgentState(TypedDict):
    # Conversation history; the add_messages annotation tells LangGraph how node outputs are merged in
    messages: Annotated[Sequence[BaseMessage], add_messages]
    # Declared for completeness; not read or written by the graph nodes defined in this file
    custom_inputs: Optional[dict[str, Any]]
    custom_outputs: Optional[dict[str, Any]]

# Serving endpoint name of the chat model (same value as LLM_MODEL) and the chat client bound to it
LLM_ENDPOINT_NAME = f"{LLM_MODEL}"
base_llm = ChatDatabricks(endpoint=LLM_ENDPOINT_NAME)

# System prompt prepended to the conversation on every model call (see the preprocessor
# built in create_tool_calling_agent). It defines the triage process, the decision
# framework and the output rules enforced by parse_structured_response.
# NOTE(review): the delivery-delay tiers and the confidence guidelines below rely on P90,
# but the get_location_timings tool only returns P50, P75 and P99 -- confirm how the model
# is expected to obtain a P90 value, or align the tiers with the available percentiles.
system_prompt = """You are the Complaint Triage Agent for Casper's Kitchens, a ghost kitchen network. Your job is to analyze customer complaints and recommend actions for internal customer service staff.

PROCESS:
1. Extract the order_id from the complaint text
2. Call get_order_overview(order_id) to get order details and items
3. Call get_order_timing(order_id) to get delivery timing
4. For delivery delay complaints, call get_location_timings(location) to get percentile benchmarks
5. Analyze the data and make a recommendation

DECISION FRAMEWORK:

SUGGEST_CREDIT - Use when you can make a data-backed recommendation:

Delivery delays (high confidence when data-backed):
- Compare actual delivery time to location percentiles
- >P75 but <P90: Suggest 15% of order total
- >P90 but <P99: Suggest 25% of order total  
- >P99: Suggest 50% of order total
- On-time delivery: Suggest $0 credit (low confidence - might be other issues)

Missing items:
- Parse items_json from get_order_overview to find actual item prices
- Use real item costs when available for accurate refund
- Check if claimed missing item actually makes sense for this order (cross-reference with items list)
- If claimed item not in order or doesn't match (e.g., claiming missing fries on salad-only order): affects confidence negatively
- If item plausibly missing but can't verify price: estimate $8-12 per item (medium confidence)
- If no evidence of missing items but customer claims it: $0 credit (low confidence)

Food quality issues:
- If specific details provided (cold, soggy, wrong preparation): $10-15 (medium confidence)
- If vague ("bad", "gross"): escalate with priority="standard" instead
- If food quality complaint but order shows delivery delay >P90: consider combined credit

ESCALATE - Use when human judgment is needed:
- priority="standard": 
  * Vague complaints without specifics ("food was bad", "not happy")
  * Missing or incomplete order data
  * Edge cases that don't fit clear patterns
  * Billing issues or promo code problems
  * Service complaints (rude driver, wrong location)
- priority="urgent":
  * Legal threats or mentions of lawyers
  * Health/safety concerns (allergies, food poisoning, foreign objects)
  * Suspected fraud or inconsistencies in complaint vs order data
  * Abusive or threatening language

OUTPUT RULES:
- For suggest_credit: Include credit_amount (can be $0) and confidence, set priority to null
- For escalate: Include priority, set credit_amount and confidence to null
- Always include a data-focused rationale citing specific numbers (delivery times, percentiles, order details, item verification)
- Rationale is for internal staff - be factual, not apologetic

RATIONALE GUIDELINES:
- Rationale should clearly articulate the justification for the decision
- All evidence used to support the decision should be cited in the rationale. Rationale MUST cite evidence.
- Rationale should be detailed and specific, but no longer than 150 words
- Rationale should clearly justify the decision, credit amount (if applicable), confidence level, and priority (if applicable)
- If a refund is suggested, the rationale should clearly justify the credit amount, based on the evidence provided.

CONFIDENCE GUIDELINES:
- high: Strong data support (delivery >P90, item prices from order data, clear verification)
- medium: Reasonable inference (delivery P75-P90, estimated item costs, plausible but unverified)
- low: Weak or contradictory evidence ($0 credits when data doesn't support claim, can't verify complaint details)

IMPORTANT NOTES:
- If order_id not found or data unavailable: escalate with priority="standard"
- Round credit amounts to nearest $0.50
- When in doubt between two options, prefer escalate with priority="standard"
- A suggest_credit with $0 and low confidence is valid when complaint seems unfounded"""

# Tools exposed to the LLM: the three Unity Catalog SQL functions in the `<CATALOG>.ai` schema,
# wrapped as LangChain tools by UCFunctionToolkit.
tools: list[BaseTool] = []
uc_tool_names = [
    f"{CATALOG}.ai.get_order_overview",
    f"{CATALOG}.ai.get_order_timing",
    f"{CATALOG}.ai.get_location_timings",
]
uc_toolkit = UCFunctionToolkit(function_names=uc_tool_names)
tools.extend(uc_toolkit.tools)

# Keys that must be present in every structured response, whatever the decision
RESPONSE_FIELDS = {"order_id", "complaint_category", "decision", "rationale"}

def parse_structured_response(obj: Union[AIMessage, dict[str, Any]]) -> ComplaintResponse:
    """Extract, validate and normalize a ComplaintResponse.

    Args:
        obj: either an already-parsed dict, or an AIMessage whose
            ``additional_kwargs["parsed_structured_output"]`` holds a dict or whose
            content (a string, or a list of text parts) is a JSON document.

    Returns:
        A new dict with the response fields. Fields that do not belong to the chosen
        decision are reset to None (priority for suggest_credit; credit_amount and
        confidence for escalate). The input object is never modified.

    Raises:
        ValueError: if the content type is unsupported, the JSON is invalid
            (json.JSONDecodeError is a ValueError), the payload is not a JSON object,
            a required field is missing, or a decision-specific field is missing.
    """
    if isinstance(obj, dict):
        payload: Any = obj
    else:
        parsed = obj.additional_kwargs.get("parsed_structured_output")
        if isinstance(parsed, dict):
            payload = parsed
        else:
            content = obj.content
            if isinstance(content, str):
                raw = content
            elif isinstance(content, list):
                raw = "".join(part.get("text", "") if isinstance(part, dict) else str(part) for part in content)
            else:
                raise ValueError("Unsupported message content type")
            payload = json.loads(raw)

    # Valid JSON is not necessarily an object (e.g. a list or a bare string). Raise the
    # ValueError callers already handle, instead of an AttributeError on .keys().
    if not isinstance(payload, dict):
        raise ValueError(f"Expected a JSON object, received {type(payload).__name__}")

    # Normalize a shallow copy so the caller's dict / message kwargs are left untouched
    candidate = dict(payload)

    missing = RESPONSE_FIELDS.difference(candidate.keys())
    if missing:
        raise ValueError(f"Missing required fields: {sorted(missing)}")

    decision = candidate.get("decision")
    if decision == "suggest_credit":
        if candidate.get("credit_amount") is None:
            raise ValueError("suggest_credit requires credit_amount")
        if candidate.get("confidence") is None:
            raise ValueError("suggest_credit requires confidence")
        # Automatically set priority to None if incorrectly provided
        if candidate.get("priority") is not None:
            candidate["priority"] = None
    elif decision == "escalate":
        if candidate.get("priority") is None:
            raise ValueError("escalate requires priority")
        # Automatically set credit_amount and confidence to None if incorrectly provided
        if candidate.get("credit_amount") is not None:
            candidate["credit_amount"] = None
        if candidate.get("confidence") is not None:
            candidate["confidence"] = None

    return cast(ComplaintResponse, candidate)


def create_tool_calling_agent(
    model: LanguageModelLike,
    tools: Union[ToolNode, Sequence[BaseTool]],
    system_prompt: Optional[str] = None,
):
    """Build and compile the two-node LangGraph workflow ("agent" <-> "tools").

    Args:
        model: chat model; must support bind_tools().
        tools: tools the model may call; also used to build the ToolNode.
        system_prompt: optional system message prepended on every model call.

    Returns:
        The compiled graph. It loops agent -> tools -> agent while the model emits
        tool calls, and ends with an AIMessage whose content is the JSON-encoded
        ComplaintResponse.
    """
    tool_model = model.bind_tools(tools, tool_choice="auto")
    # NOTE(review): with_structured_output is requested on the tool-bound runnable rather
    # than on `model` itself -- confirm this is intended.
    structured_model = tool_model.with_structured_output(ComplaintResponse)

    # Routing function for the conditional edge: keep looping while the last AI message
    # contains tool calls, otherwise finish.
    def should_continue(state: AgentState):
        messages = state["messages"]
        last_message = messages[-1]
        if isinstance(last_message, AIMessage) and last_message.tool_calls:
            return "continue"
        return "end"

    # The preprocessor turns graph state into the message list given to the model,
    # prepending the system prompt when one is supplied.
    if system_prompt:
        preprocessor = RunnableLambda(
            lambda state: [{"role": "system", "content": system_prompt}] + state["messages"]
        )
    else:
        preprocessor = RunnableLambda(lambda state: state["messages"])

    tool_runnable = preprocessor | tool_model
    structured_runnable = preprocessor | structured_model

    # "agent" node: one model call. Tool-call responses are passed through unchanged;
    # a final answer is parsed into a ComplaintResponse and re-emitted as JSON.
    def call_model(state: AgentState, config: RunnableConfig):
        response = tool_runnable.invoke(state, config)
        if not isinstance(response, AIMessage):
            raise ValueError(f"Expected AIMessage, received {type(response)}")

        if response.tool_calls:
            return {"messages": [response]}

        try:
            parsed = parse_structured_response(response)
        except (json.JSONDecodeError, ValueError):
            # The free-form answer was not valid structured JSON: ask the model again
            # through the structured-output runnable. Errors from this second attempt propagate.
            structured = structured_runnable.invoke(state, config)
            parsed = parse_structured_response(structured)

        # Replace the raw answer with a canonical message: JSON text in `content`,
        # the parsed dict in additional_kwargs.
        structured_message = AIMessage(
            id=response.id or str(uuid4()),
            content=json.dumps(parsed),
            additional_kwargs={"parsed_structured_output": parsed},
        )
        return {"messages": [structured_message]}

    # Graph wiring: entry at "agent"; "agent" goes to "tools" or END; "tools" always returns to "agent"
    workflow = StateGraph(AgentState)
    workflow.add_node("agent", RunnableLambda(call_model))
    workflow.add_node("tools", ToolNode(tools))
    workflow.set_entry_point("agent")
    workflow.add_conditional_edges(
        "agent",
        should_continue,
        {"continue": "tools", "end": END},
    )
    workflow.add_edge("tools", "agent")
    return workflow.compile()


class LangGraphResponsesAgent(ResponsesAgent):
    """Exposes a compiled LangGraph graph through MLflow's ResponsesAgent interface."""

    def __init__(self, agent):
        self.agent = agent

    def _langchain_to_responses(self, messages: list[BaseMessage]) -> list[dict[str, Any]]:
        """Translate LangChain messages into Responses output items.

        AI messages with tool calls become one function-call item per call, other AI
        messages become a text item, tool messages become function-call-output items.
        Any other message type is skipped.
        """
        converted: list[dict[str, Any]] = []

        for original in messages or []:
            msg = original.model_dump() if hasattr(original, "model_dump") else original
            kind = msg.get("type")

            if kind == "tool":
                converted.append(
                    self.create_function_call_output_item(
                        call_id=msg.get("tool_call_id"),
                        output=msg.get("content"),
                    )
                )
                continue

            if kind != "ai":
                continue

            requested_calls = msg.get("tool_calls")
            if requested_calls:
                for call in requested_calls:
                    converted.append(
                        self.create_function_call_item(
                            id=msg.get("id") or str(uuid4()),
                            call_id=call["id"],
                            name=call["name"],
                            arguments=json.dumps(call.get("args", {})),
                        )
                    )
                continue

            # Plain AI answer: flatten the content into a single text string
            body = msg.get("content")
            if isinstance(body, str):
                text = body
            elif isinstance(body, list):
                text = "".join(part.get("text", "") if isinstance(part, dict) else str(part) for part in body)
            else:
                text = json.dumps(body)

            converted.append(
                self.create_text_output_item(
                    text=text,
                    id=msg.get("id") or str(uuid4()),
                )
            )

        return converted

    def predict(self, request: ResponsesAgentRequest) -> ResponsesAgentResponse:
        """Run the stream to completion and collect every finished output item."""
        finished_items = []
        for event in self.predict_stream(request):
            if event.type == "response.output_item.done":
                finished_items.append(event.item)
        return ResponsesAgentResponse(output=finished_items, custom_outputs=request.custom_inputs)

    def predict_stream(
        self,
        request: ResponsesAgentRequest,
    ) -> Generator[ResponsesAgentStreamEvent, None, None]:
        """Stream the graph, yielding finished items for node updates and text deltas for chunks."""
        lc_msgs = convert_to_messages(self.prep_msgs_for_cc_llm(request.input))
        graph_events = self.agent.stream({"messages": lc_msgs}, stream_mode=["updates", "messages"])

        for event in graph_events:
            mode = event[0]
            if mode == "updates":
                # A node finished: emit one done-event per message it produced
                for node_data in event[1].values():
                    for item in self._langchain_to_responses(node_data.get("messages", [])):
                        yield ResponsesAgentStreamEvent(type="response.output_item.done", item=item)
            elif mode == "messages":
                chunk = event[1][0]
                if not isinstance(chunk, AIMessageChunk):
                    continue
                content = chunk.content
                if content:
                    yield ResponsesAgentStreamEvent(
                        **self.create_text_delta(delta=content, item_id=chunk.id),
                    )

# Build the graph, wrap it in the ResponsesAgent adapter and declare it to MLflow as the
# model object defined by this file.
agent = create_tool_calling_agent(base_llm, tools, system_prompt)
AGENT = LangGraphResponsesAgent(agent)
mlflow.models.set_model(AGENT)

In [None]:
# Get an actual delivered order_id to use in the model's input example.
# first() returns None on an empty result, so a catalog without delivered orders fails
# with an explicit message instead of an IndexError from `.collect()[0]`.
_delivered_row = spark.sql(f"""
    SELECT order_id 
    FROM {CATALOG}.lakeflow.all_events 
    WHERE event_type='delivered'
    LIMIT 1
""").first()

if _delivered_row is None:
    raise RuntimeError(
        f"No 'delivered' events found in {CATALOG}.lakeflow.all_events; cannot build an input example"
    )

sample_order_id = _delivered_row['order_id']

In [None]:
# Guard against a NULL order_id in the sampled row, then show the id that will be used
assert sample_order_id is not None
print(sample_order_id)

In [None]:
import mlflow
# importlib.metadata is the standard-library replacement for the deprecated pkg_resources API
from importlib.metadata import version as installed_version

from agent import LLM_ENDPOINT_NAME, tools
from mlflow.models.resources import DatabricksFunction, DatabricksServingEndpoint

# Resources the logged model depends on: the LLM serving endpoint plus every UC function tool
resources = [DatabricksServingEndpoint(endpoint_name=LLM_ENDPOINT_NAME)]
for tool in tools:
    resources.append(DatabricksFunction(function_name=tool.uc_function_name))

input_example = {
    "input": [
        {
            "role": "user",
            "content": f"My order was really late! Order ID: {sample_order_id}"
        }
    ]
}

# Packages pinned to the versions installed in this notebook environment
PINNED_PACKAGES = ("databricks-connect", "mlflow", "databricks-langchain", "langgraph")

with mlflow.start_run():
    logged_agent_info = mlflow.pyfunc.log_model(
        name="complaint_agent",
        python_model="agent.py",
        input_example=input_example,
        resources=resources,
        pip_requirements=[
            f"{package}=={installed_version(package)}" for package in PINNED_PACKAGES
        ],
    )

# Make the freshly logged model the active one for the cells that follow
mlflow.set_active_model(model_id = logged_agent_info.model_id)

#### Evaluate the Agent

- Synthesize diverse complaint scenarios (delivery delays, wrong items, missing items, etc.) from recent orders.
- Configure three MLflow `Guidelines` scorers that check evidence-based reasoning, credit-amount reasonableness, and decision metadata consistency.
- Run batch evaluations against the agent to quantify decision quality before promotion.


In [None]:
# Build the pool of order ids that the evaluation scenarios are generated from
import random

# Up to 50 distinct orders that have a 'delivered' event
delivered_rows = spark.sql(f"""
        SELECT DISTINCT order_id 
        FROM {CATALOG}.lakeflow.all_events 
        WHERE event_type='delivered'
        LIMIT 50
    """).collect()
all_order_ids = [row['order_id'] for row in delivered_rows]

# Create diverse, realistic complaint scenarios
# Each section takes a disjoint slice of all_order_ids and expands every order id into
# several complaint texts. A slice is simply empty when fewer order ids are available,
# in which case that category contributes nothing.
complaint_scenarios = []

# 1. Delivery delay complaints - mix of legitimate and questionable
for oid in all_order_ids[:8]:
    complaint_scenarios.extend([
        f"My order took forever to arrive! Order ID: {oid}",
        f"Been waiting 2 hours, this is ridiculous. Order {oid}",
        f"Order {oid} arrived late and cold",
        f"Delivery was slower than usual for order {oid}",
    ])

# 2. Food quality issues - range from specific to vague
for oid in all_order_ids[8:12]:
    complaint_scenarios.extend([
        f"My falafel was completely soggy and inedible. Order: {oid}",
        f"The food was cold when it arrived, very disappointing. Order: {oid}",
        f"Everything tasted bad. Order {oid}",
        f"Not happy with the quality. {oid}",
        f"The gyro meat was overcooked and dry, very disappointing. Order: {oid}",
    ])

# 3. Missing items - some verifiable, some suspicious
for oid in all_order_ids[12:16]:
    complaint_scenarios.extend([
        f"My entire falafel bowl was missing from the order! Order: {oid}",
        f"No drinks or sides in my order {oid}",
        f"Missing my gyro from order {oid}",
        f"You forgot half my items. {oid}",
    ])

# 4. Items claimed that might not match the order
for oid in all_order_ids[16:18]:
    complaint_scenarios.extend([
        f"Where are my chicken wings?! Order {oid}",
        f"Missing my pizza from order {oid}",
    ])

# 5. Service issues - should escalate
for oid in all_order_ids[18:20]:
    complaint_scenarios.extend([
        f"Your driver was extremely rude to me. Order: {oid}",
        f"Driver left my food at wrong address. Order: {oid}",
        f"The delivery person refused to come to my door. {oid}",
    ])

# 6. Multiple issues in one complaint
for oid in all_order_ids[20:22]:
    complaint_scenarios.extend([
        f"Order {oid} was late AND missing items AND cold!",
        f"Late delivery, rude driver, and food quality was poor. Order: {oid}",
    ])

# 7. Escalation triggers - legal threats, health concerns
for oid in all_order_ids[22:24]:
    complaint_scenarios.extend([
        f"I'm calling my lawyer about this terrible service! Order: {oid}",
        f"This food made me sick, possible food poisoning. Order: {oid}",
        f"Found a piece of plastic in my food! Order {oid} - this is dangerous!",
    ])

# 8. Vague complaints without specifics
for oid in all_order_ids[24:26]:
    complaint_scenarios.extend([
        f"Not satisfied with order {oid}",
        f"Bad experience. {oid}",
        f"Order {oid} was wrong",
    ])

# 9. Billing/promo issues
for oid in all_order_ids[26:28]:
    complaint_scenarios.extend([
        f"My promo code didn't work on order {oid}",
        f"I was charged twice for order {oid}!",
    ])

# 10. Edge cases - no order ID or invalid format
complaint_scenarios.extend([
    "My order was really late and the food was cold!",  # Missing order ID
    "terrible service, do better",  # No order ID, vague
    "Order ABC123 never arrived",  # Invalid order ID format
])

# 11. Non-complaints / comments
for oid in all_order_ids[28:30]:
    complaint_scenarios.extend([
        f"Great food, loved it! Order {oid}",  # Positive comment
        f"Just wanted to say the driver was very polite today. {oid}",  # Positive feedback
        f"How do I reorder my previous order {oid}?",  # Question, not complaint
    ])

# 12. Unfounded complaints (on-time delivery but claiming delay)
# NOTE(review): these order ids are taken by position only; nothing here checks that the
# orders were actually delivered on time -- confirm if the scenario depends on it.
for oid in all_order_ids[30:32]:
    complaint_scenarios.extend([
        f"This took way too long! Order {oid}",
    ])

# Sample for reasonable eval size (at most 30 scenarios).
# A dedicated, seeded RNG keeps the sampled subset identical from run to run for the same
# scenario list, so evaluation results stay comparable between agent versions.
EVAL_SAMPLE_SEED = 42
complaint_scenarios = random.Random(EVAL_SAMPLE_SEED).sample(
    complaint_scenarios, min(30, len(complaint_scenarios))
)

# Shape every complaint as a single-turn user message in the ResponsesAgent input schema
data = [
    {"inputs": {"input": [{"role": "user", "content": complaint}]}}
    for complaint in complaint_scenarios
]

# Summary of the scenario categories the pool was generated from
print(f"Created {len(data)} diverse evaluation scenarios including:")
for covered_category in (
    "Delivery delays (legitimate & questionable)",
    "Food quality issues (specific & vague)",
    "Missing items (verifiable & suspicious)",
    "Service complaints",
    "Escalation triggers (health/legal)",
    "Edge cases (no order ID, invalid ID)",
    "Non-complaints (positive feedback, questions)",
    "Multiple issues in one complaint",
):
    print(f"  - {covered_category}")

In [None]:
# Create three focused scorers and run evaluation

from mlflow.genai.scorers import Guidelines
import mlflow
import sys
import os
# Make the generated agent.py importable: add the current working directory ...
sys.path.append(os.getcwd())

# ... and the directory that contains this notebook
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
project_directory = os.path.dirname(notebook_path)
sys.path.append(project_directory)

# Importing agent.py builds the graph and exposes the wrapped agent as AGENT
from agent import AGENT

# The three scorers below are Guidelines scorers, each defined by a list of
# natural-language guidelines.

# Scorer 1: Evidence-Based Reasoning - Does the rationale support the decision?
evidence_reasoning = Guidelines(
    name="evidence_reasoning",
    guidelines=[
        "The rationale should provide specific evidence or reasoning for the decision made",
        "For timing complaints, rationale should mention delivery times or timing data if available",
        "For missing item complaints, rationale should reference the items in question",
        "For escalations, rationale should explain why human judgment is needed",
        "Rationale does not need to be overly detailed, but should demonstrate that some analysis was done",
        "It's acceptable if data wasn't available - the rationale should acknowledge this"
    ]
)

# Scorer 2: Credit Amount Reasonableness - Is the refund amount appropriate?
credit_reasonableness = Guidelines(
    name="credit_reasonableness",
    guidelines=[
        "If decision='escalate', automatically pass this check (no credit amount to evaluate)",
        "If decision='suggest_credit' with credit_amount > $0, the amount should be reasonable for a food delivery order (typically $5-$50)",
        "Credit amounts over $50 should only be for severe issues mentioned in the rationale",
        "A credit_amount of $0 is valid when the rationale indicates no issue was found",
        "The credit amount should be proportional to the severity of the issue described",
        "Missing items should roughly correspond to typical item costs ($8-$15 per item)",
        "Delivery delay credits should scale with the severity described"
    ]
)

# Scorer 3: Decision Metadata Consistency - Are confidence and priority used correctly?
# These guidelines mirror the output rules in the agent's system prompt
# (confidence only with suggest_credit, priority only with escalate).
decision_metadata = Guidelines(
    name="decision_metadata",
    guidelines=[
        "If decision='suggest_credit': confidence should be present (high/medium/low) and priority should be null",
        "If decision='escalate': priority should be present (standard/urgent) and confidence should be null",
        "URGENT priority is ONLY for serious threats: food poisoning/illness, foreign objects in food, severe allergic reactions, legal threats, or abusive/threatening language",
        "STANDARD priority is appropriate for: service complaints (rude driver, wrong address), vague complaints, billing issues, generic food quality complaints (cold, soggy, bland, overcooked), or missing data",
        "Generic food quality complaints (tastes bad, cold, soggy, overcooked) are NOT health/safety concerns and should use priority='standard' if escalated",
        "High confidence should have clear supporting evidence in the rationale; low confidence is appropriate when data is limited",
        "The complaint_category field does not determine priority - only the actual severity of the issue matters"
    ]
)

# Adapter handed to mlflow.genai.evaluate: runs the agent on one evaluation record and
# returns the text of its final output item.
# Note: parameter name must match the key in eval data ("input")
def predict_fn(input):
    from mlflow.types.responses import ResponsesAgentRequest

    # The last output item is the final structured response
    final_item = AGENT.predict(ResponsesAgentRequest(input=input)).output[-1]

    has_content = hasattr(final_item, 'content') and final_item.content
    if not has_content:
        # No content parts to read: fall back to the item's string form
        return str(final_item)
    return final_item.content[0]["text"]

# Run evaluation with three focused scorers
# Each record in `data` is passed to predict_fn, and the returned text is scored by every scorer.
results = mlflow.genai.evaluate(
    data=data,
    scorers=[evidence_reasoning, credit_reasonableness, decision_metadata],
    predict_fn=predict_fn
)

print(f"✅ Evaluation complete with 3 focused scorers.")
print(f"   Scorers: evidence_reasoning, credit_reasonableness, decision_metadata")

#### Log the agent to `UC`

- Point MLflow at the Unity Catalog registry and name the artifact `${CATALOG}.ai.complaint_agent`.
- Register the run-produced model so that a specific, versioned model can be deployed from Unity Catalog.


In [None]:
# Use Unity Catalog as the model registry
mlflow.set_registry_uri("databricks-uc")

# Three-level UC name (catalog.schema.model) of the registered agent
UC_MODEL_NAME = f"{CATALOG}.ai.complaint_agent"

# register the model to UC
# The returned info object carries the new version number used by the deployment cell
uc_registered_model_info = mlflow.register_model(
    model_uri=logged_agent_info.model_uri, name=UC_MODEL_NAME
)

#### deploy the agent to model serving

- Ensure a production MLflow experiment exists for live trace capture.
- Call `agents.deploy` to create/update the Databricks Model Serving endpoint backed by the UC model version.
- Wait until the serving endpoint reports READY before continuing to downstream steps.
- Pass the prod experiment ID via environment variables so inference logs are persisted automatically.


In [None]:
import mlflow

# Create prod experiment for production inference traces
# Use shared path for job compatibility and visibility
prod_experiment_name = f"/Shared/{CATALOG}_complaint_agent_prod"

# set_experiment creates the experiment if it doesn't exist, or activates it if it does
# NOTE(review): this also makes the prod experiment the active one for every later cell
# in this notebook -- confirm that this is intended.
prod_experiment = mlflow.set_experiment(prod_experiment_name)
prod_experiment_id = prod_experiment.experiment_id
print(f"✅ Using prod experiment: {prod_experiment_name} (ID: {prod_experiment_id})")

# Add experiment to UC state for cleanup
# `uc_state` is a project helper located in ../utils, so that directory must be put on
# sys.path before the import below can succeed.
import sys
sys.path.append('../utils')
from uc_state import add

# Record the experiment under the "experiments" key of this catalog's state
experiment_data = {
    "experiment_id": prod_experiment_id,
    "name": prod_experiment_name
}
add(CATALOG, "experiments", experiment_data)
print(f"✅ Added prod experiment to UC state")

In [None]:
from datetime import timedelta

from databricks import agents
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import EndpointStateReady

# Deploy the registered model version to the serving endpoint named by the
# COMPLAINT_AGENT_ENDPOINT_NAME widget. The prod experiment id is passed to the endpoint
# through the MLFLOW_EXPERIMENT_ID environment variable.
endpoint_name = dbutils.widgets.get("COMPLAINT_AGENT_ENDPOINT_NAME")
deployment_info = agents.deploy(
    model_name=UC_MODEL_NAME,
    model_version=uc_registered_model_info.version,
    scale_to_zero=False,
    endpoint_name=endpoint_name,
    environment_vars={"MLFLOW_EXPERIMENT_ID": str(prod_experiment_id)},
)

# Block (up to 30 minutes) until the endpoint is no longer updating
workspace = WorkspaceClient()
ready_endpoint = workspace.serving_endpoints.wait_get_serving_endpoint_not_updating(
    name=endpoint_name,
    timeout=timedelta(minutes=30),
)

# "Not updating" does not imply success: fail the notebook unless the endpoint reports READY
if ready_endpoint.state.ready != EndpointStateReady.READY:
    raise RuntimeError(
        f"Endpoint {endpoint_name} is {ready_endpoint.state.ready} after deployment; retry or investigate."
    )

print(f"✅ Endpoint {endpoint_name} is READY")


In [None]:
# Show the deployment object returned by agents.deploy for inspection
print(deployment_info)

##### Record model in state

- Store the new deployment metadata with `uc_state.add` to facilitate cleanup in the future.


In [None]:
# Also add to UC-state
# Stores the deployment object under the "endpoints" key of this catalog's state
import sys
sys.path.append('../utils')
from uc_state import add

add(dbutils.widgets.get("CATALOG"), "endpoints", deployment_info)

#### production monitoring

- Scaffold MLflow guideline scorers for sampled live traffic to flag decision drift or policy regressions.
- The cell below registers and starts the scorers immediately at a 10% sample rate; run it only once the serving endpoint is stable.


In [None]:
from mlflow.genai.scorers import Guidelines, ScorerSamplingConfig

# Register scorers for production monitoring (10% sampling)
# The guidelines use the agent's actual output vocabulary: decision is either
# 'suggest_credit' or 'escalate', with priority 'standard' or 'urgent' on escalations.
# The previous wording referred to 'investigate' / 'auto_credit', values the agent never
# emits, so those checks could not be meaningfully satisfied.
decision_quality_monitor = Guidelines(
    name="decision_quality_prod",
    guidelines=[
        "Vague food quality complaints without specific details should have decision='escalate' with priority='standard', not decision='suggest_credit'",
        "Missing item complaints should only receive decision='suggest_credit' with credit_amount > 0 when the rationale references the items of the order; otherwise credit_amount should be 0 or the decision should be 'escalate'",
        "Legal threats or serious health concerns should have decision='escalate' with priority='urgent'"
    ]
).register(name=f"{UC_MODEL_NAME}_decision_quality")

refund_reason_monitor = Guidelines(
    name="refund_reason_prod",
    guidelines=[
        "If a refund is offered, it must clearly relate to the complaint made by the user"
    ]
).register(name=f"{UC_MODEL_NAME}_refund_reason")

# Start monitoring with 10% sampling of production traffic
decision_quality_monitor = decision_quality_monitor.start(
    sampling_config=ScorerSamplingConfig(sample_rate=0.1)
)

refund_reason_monitor = refund_reason_monitor.start(
    sampling_config=ScorerSamplingConfig(sample_rate=0.1)
)

print("✅ Production monitoring enabled with 10% sampling")
print(f"   - decision_quality scorer monitoring: {decision_quality_monitor}")
print(f"   - refund_reason scorer monitoring: {refund_reason_monitor}")