#### Complaint Agent

Builds and ships an order-complaint agent using DSPy: author Unity Catalog tools, assemble the DSPy ReAct workflow, evaluate it, and promote the packaged model into production.

#### Tool & View Registration

- `CREATE SCHEMA` guarantees the shared `${CATALOG}.ai` workspace exists for agent assets.
- `order_delivery_times_per_location_view` summarizes delivery-time percentiles (in minutes) per location over the trailing 24 hours.
- `get_order_overview(oid)` returns structured order metadata, items, and customer info.
- `get_order_timing(oid)` exposes created/delivered timestamps plus transit duration.
- `get_location_timings(loc)` yields P50/P75/P99 delivery times for a location, used to benchmark delay complaints.


In [None]:
%sql
-- Schema that holds the agent's view and the UC tool functions created below.
-- IF NOT EXISTS keeps this cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS ${CATALOG}.ai;

In [None]:
%sql
-- Per-location delivery-time percentiles (minutes) computed from events whose
-- timestamp falls within the trailing 24 hours.
-- Only orders with BOTH an order_created and a delivered event inside that
-- window contribute; orders straddling the window boundary are excluded.
CREATE OR REPLACE VIEW ${CATALOG}.ai.order_delivery_times_per_location_view AS
WITH recent_events AS (
  -- Parse the timestamp once. try_to_timestamp yields NULL for unparseable
  -- values, and a NULL never satisfies the window predicate, so such rows drop out.
  SELECT
    order_id,
    location,
    event_type,
    try_to_timestamp(ts) AS event_ts
  FROM
    ${CATALOG}.lakeflow.all_events
  WHERE
    try_to_timestamp(ts) >= CURRENT_TIMESTAMP() - INTERVAL 1 DAY
),
order_times AS (
  -- One row per order. Creation uses the EARLIEST order_created event and
  -- delivery the LATEST delivered event, matching get_order_timing so the
  -- per-order duration and the location benchmark are measured the same way.
  SELECT
    order_id,
    location,
    MIN(CASE WHEN event_type = 'order_created' THEN event_ts END) AS order_created_time,
    MAX(CASE WHEN event_type = 'delivered' THEN event_ts END) AS delivered_time
  FROM
    recent_events
  GROUP BY
    order_id,
    location
),
total_order_times AS (
  -- Creation-to-delivery duration in minutes for completed orders only.
  SELECT
    order_id,
    location,
    (UNIX_TIMESTAMP(delivered_time) - UNIX_TIMESTAMP(order_created_time)) / 60 AS total_order_time_minutes
  FROM
    order_times
  WHERE
    order_created_time IS NOT NULL
    AND delivered_time IS NOT NULL
)
SELECT
  location,
  PERCENTILE(total_order_time_minutes, 0.50) AS P50,
  PERCENTILE(total_order_time_minutes, 0.75) AS P75,
  PERCENTILE(total_order_time_minutes, 0.99) AS P99
FROM
  total_order_times
GROUP BY
  location

In [None]:
%sql
-- UC table function used as an agent tool: returns at most one row describing
-- the order (items JSON, address, brand, creation time), taken from its
-- order_created event. Returns zero rows when the order has no such event.
CREATE OR REPLACE FUNCTION ${CATALOG}.ai.get_order_overview(oid STRING COMMENT 'The unique order identifier to retrieve information for')
RETURNS TABLE (
  order_id STRING COMMENT 'The order id',
  location STRING COMMENT 'Order location',
  items_json STRING COMMENT 'JSON array of ordered items with details',
  customer_address STRING COMMENT 'Customer delivery address',
  brand_id BIGINT COMMENT 'Brand ID for the order',
  order_created_ts TIMESTAMP COMMENT 'When the order was created'
)
COMMENT 'Returns basic order information including items, location, and customer details'
RETURN
  WITH order_created_events AS (
    SELECT
      order_id,
      location,
      get_json_object(body, '$.items') AS items_json,
      get_json_object(body, '$.customer_addr') AS customer_address,
      -- Extract brand_id from first item in the order
      CAST(get_json_object(get_json_object(body, '$.items[0]'), '$.brand_id') AS BIGINT) AS brand_id,
      try_to_timestamp(ts) AS order_created_ts
    FROM ${CATALOG}.lakeflow.all_events
    WHERE order_id = oid AND event_type = 'order_created'
    -- LIMIT without ORDER BY returns an arbitrary row when an order has more
    -- than one order_created event. Pick the earliest one (the same event
    -- get_order_timing uses) so repeated calls return the same answer.
    ORDER BY try_to_timestamp(ts) ASC NULLS LAST
    LIMIT 1
  )
  SELECT
    order_id,
    location,
    items_json,
    customer_address,
    brand_id,
    order_created_ts
  FROM order_created_events;

In [None]:
%sql
-- UC table function used as an agent tool: one row per order with the earliest
-- order_created timestamp, the latest delivered timestamp, the elapsed minutes
-- between them, and a derived status. Returns zero rows for an unknown order id.
CREATE OR REPLACE FUNCTION ${CATALOG}.ai.get_order_timing(oid STRING COMMENT 'The unique order identifier to get timing information for')
RETURNS TABLE (
  order_id STRING COMMENT 'The order id',
  order_created_ts TIMESTAMP COMMENT 'When the order was created',
  delivered_ts TIMESTAMP COMMENT 'When the order was delivered (NULL if not delivered)',
  delivery_duration_minutes FLOAT COMMENT 'Time from order creation to delivery in minutes (NULL if not delivered)',
  delivery_status STRING COMMENT 'Current delivery status: delivered, in_progress, or unknown'
)
COMMENT 'Returns timing information for a specific order'
RETURN
  WITH timing_summary AS (
    -- Collapse every event of the requested order into its two milestones.
    SELECT
      order_id,
      MIN(try_to_timestamp(ts)) FILTER (WHERE event_type = 'order_created') AS order_created_ts,
      MAX(try_to_timestamp(ts)) FILTER (WHERE event_type = 'delivered') AS delivered_ts
    FROM ${CATALOG}.lakeflow.all_events
    WHERE order_id = oid
    GROUP BY order_id
  )
  SELECT
    order_id,
    order_created_ts,
    delivered_ts,
    -- NULL propagates through the arithmetic, so the duration is NULL whenever
    -- either milestone is missing.
    CAST(
      (UNIX_TIMESTAMP(delivered_ts) - UNIX_TIMESTAMP(order_created_ts)) / 60 AS FLOAT
    ) AS delivery_duration_minutes,
    CASE
      WHEN delivered_ts IS NOT NULL THEN 'delivered'
      WHEN order_created_ts IS NOT NULL THEN 'in_progress'
      ELSE 'unknown'
    END AS delivery_status
  FROM timing_summary;

In [None]:
%sql
-- UC table function used as an agent tool: looks up the P50/P75/P99 delivery
-- times for one location in order_delivery_times_per_location_view.
-- Returns zero rows when the view has no row for the location.
CREATE OR REPLACE FUNCTION ${CATALOG}.ai.get_location_timings(loc STRING COMMENT 'Location name as a string')
RETURNS TABLE (
  location STRING COMMENT 'Location of the order source',
  P50 FLOAT COMMENT '50th percentile delivery time in minutes',
  P75 FLOAT COMMENT '75th percentile delivery time in minutes',
  P99 FLOAT COMMENT '99th percentile delivery time in minutes'
)
COMMENT 'Returns the 50/75/99th percentile of delivery times for a location to benchmark order timing'
RETURN
  SELECT
    benchmarks.location,
    benchmarks.P50,
    benchmarks.P75,
    benchmarks.P99
  FROM ${CATALOG}.ai.order_delivery_times_per_location_view AS benchmarks
  WHERE benchmarks.location = loc;

#### Model

- Install DSPy, Databricks agent packages, and restart Python for a clean runtime.
- Capture widget inputs (`CATALOG`, `LLM_MODEL`) and create an MLflow dev experiment for trace logging.
- Define a templated `%%writefilev` magic that emits files with notebook variable substitution.
- Materialize `agent.py` containing the DSPy ReAct complaint workflow wired to UC SQL tools and the chosen LLM endpoint.
- Pull a delivered `order_id` sample and build the MLflow model signature/resources for logging.

In [None]:
%pip install -U -qqqq typing_extensions dspy-ai mlflow unitycatalog-openai[databricks] openai databricks-sdk databricks-agents pydantic
%restart_python

In [None]:
# Notebook parameters read from Databricks widgets.
# CATALOG prefixes every UC object name built in this notebook; LLM_MODEL is
# passed to dspy.LM and declared as the serving-endpoint resource of the agent.
CATALOG = dbutils.widgets.get("CATALOG")
LLM_MODEL = dbutils.widgets.get("LLM_MODEL")

In [None]:
import sys

import mlflow

# Development/evaluation traces go to a per-catalog experiment under /Shared
# so that jobs can reach it as well as interactive users.
dev_experiment_name = "/Shared/{}_complaint_agent_dev".format(CATALOG)

# set_experiment both creates (when missing) and activates the experiment.
dev_experiment = mlflow.set_experiment(dev_experiment_name)
dev_experiment_id = dev_experiment.experiment_id
print("✅ Using dev experiment: {} (ID: {})".format(dev_experiment_name, dev_experiment_id))

# Record the experiment in the UC state helper so it can be cleaned up later.
sys.path.append('../utils')
from uc_state import add

experiment_data = dict(experiment_id=dev_experiment_id, name=dev_experiment_name)
add(CATALOG, "experiments", experiment_data)
print("✅ Added dev experiment to UC state")

In [None]:
import re
from IPython.core.magic import register_cell_magic

@register_cell_magic
def writefilev(line, cell):
    """
    %%writefilev file.py
    Allows {{var}} substitutions while leaving normal {} intact.

    Args:
        line: Text after the magic name; stripped and used as the output path.
        cell: Cell body. Every double-brace placeholder is replaced by
            ``str()`` of the expression evaluated against this module's globals.

    Raises:
        ValueError: If no target filename is given.
        Exception: Whatever evaluating a placeholder expression raises
            (e.g. ``NameError`` for an undefined variable).
    """
    filename = line.strip()
    if not filename:
        # Fail early with a usage hint instead of an opaque open("") error.
        raise ValueError("%%writefilev requires a target filename, e.g. '%%writefilev agent.py'")

    def replacer(match):
        expr = match.group(1).strip()
        # NOTE(review): eval runs arbitrary code from the cell text. That is
        # acceptable only because the cell is authored in this notebook; never
        # feed externally supplied text through this magic.
        # Evaluate against module globals only, so placeholders cannot
        # accidentally resolve to this helper's own locals.
        return str(eval(expr, globals()))

    # Replace only double braces {{var}}; single braces pass through untouched.
    # The pattern does not span newlines, so a placeholder must sit on one line.
    content = re.sub(r"\{\{(.*?)\}\}", replacer, cell)

    # Explicit encoding so the generated source does not depend on the
    # platform's default locale.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Wrote file with substitutions: {filename}")

In [None]:
%%writefilev agent.py
import warnings
from typing import Optional, Literal
from uuid import uuid4
from pydantic import BaseModel, Field, field_validator, ValidationError

warnings.filterwarnings("ignore", message=".*Ignoring the default notebook Spark session.*")

import dspy
import mlflow
from unitycatalog.ai.core.base import get_uc_function_client
from mlflow.pyfunc import ResponsesAgent
from mlflow.types.responses import (
    ResponsesAgentRequest,
    ResponsesAgentResponse,
)

# Enable DSPy autologging for automatic trace capture
mlflow.dspy.autolog(log_traces=True)

# The two double-brace placeholders below are filled in by the writefilev cell
# magic when this cell is written to agent.py, so the generated file contains
# plain string literals.
LLM_MODEL = "{{LLM_MODEL}}"
CATALOG = "{{CATALOG}}"

# Configure DSPy with Databricks LM
# This runs at import time, so importing agent.py sets the process-wide DSPy LM.
lm = dspy.LM(f'databricks/{LLM_MODEL}', max_tokens=2000)
dspy.configure(lm=lm)

# Initialize UC function client
# Shared by the three tool wrappers defined further down in this file.
uc_client = get_uc_function_client()


# Allowed complaint categories, in the priority order used when a free-text
# value mentions more than one of them.
_COMPLAINT_CATEGORIES = ("delivery_delay", "missing_items", "food_quality", "service_issue", "billing", "other")


def _coerce_optional_choice(value, allowed, fallback):
    """Normalise an optional enum-like string coming back from the LLM.

    None and the literal text "null" (any case, no surrounding whitespace)
    become None. Other strings are lower-cased and stripped; a value outside
    ``allowed`` is replaced by ``fallback``. Non-string values pass through
    unchanged so that pydantic reports them.
    """
    if value is None:
        return None
    if not isinstance(value, str):
        return value
    if value.lower() == "null":
        return None
    cleaned = value.lower().strip()
    return cleaned if cleaned in allowed else fallback


class ComplaintResponse(BaseModel):
    """Structured output for complaint triage decisions."""
    order_id: str
    complaint_category: Literal["delivery_delay", "missing_items", "food_quality", "service_issue", "billing", "other"] = Field(
        description="Exactly ONE primary complaint category"
    )
    decision: Literal["suggest_credit", "escalate"]
    credit_amount: Optional[float] = None
    confidence: Optional[Literal["high", "medium", "low"]] = None
    priority: Optional[Literal["standard", "urgent"]] = None
    rationale: str

    @field_validator('complaint_category', mode='before')
    @classmethod
    def parse_category(cls, v):
        """Map free text onto one category: first known category mentioned, else "other"."""
        if not isinstance(v, str):
            return v
        text = v.lower().strip()
        # No category name is a substring of another, so an exact match is
        # also found by this scan.
        return next((cat for cat in _COMPLAINT_CATEGORIES if cat in text), "other")

    @field_validator('confidence', mode='before')
    @classmethod
    def parse_confidence(cls, v):
        """Coerce confidence to high/medium/low, defaulting unknown text to "medium"."""
        return _coerce_optional_choice(v, ("high", "medium", "low"), "medium")

    @field_validator('priority', mode='before')
    @classmethod
    def parse_priority(cls, v):
        """Coerce priority to standard/urgent, defaulting unknown text to "standard"."""
        return _coerce_optional_choice(v, ("standard", "urgent"), "standard")


# DSPy signature handed to dspy.ReAct in ComplaintTriageModule.
# NOTE(review): the class docstring and every `desc=` string appear to be the
# instructions DSPy sends to the LLM, so editing them changes agent behaviour —
# treat them as prompt text, not as documentation.
class ComplaintTriage(dspy.Signature):
    """Analyze customer complaints for Casper's Kitchens and recommend triage actions.
    
    Process:
    1. Extract order_id from complaint
    2. Use get_order_overview(order_id) for order details and items
    3. Use get_order_timing(order_id) for delivery timing
    4. For delays, use get_location_timings(location) for percentile benchmarks
    5. Make data-backed decision
    
    Decision Framework:
    
    SUGGEST_CREDIT (with credit_amount and confidence):
    - Delivery delays: Compare actual delivery time to location percentiles
      * <P75: Suggest $0 credit (low confidence - on-time or minimal delay)
      * P75-P99: Suggest 15% of order total (medium to high confidence)
      * >P99: Suggest 25% of order total (high confidence)
    - Missing items: Use actual item prices from order data when available
      * Verify claimed item exists in order (affects confidence)
      * Use real costs from order data, or estimate $8-12 per item if unavailable
    - Food quality: 20-40% of order total based on severity
      * Minor issues (slightly cold, minor preparation issue): 20% (medium confidence)
      * Major issues (completely inedible, wrong preparation, health concern): 40% (high confidence)
      * Vague complaints ("bad", "gross"): escalate instead
    
    ESCALATE (with priority):
    - priority="standard": Vague complaints, missing data, billing issues, service complaints
    - priority="urgent": Legal threats, health/safety concerns, suspected fraud, abusive language
    
    Output Requirements:
    - For suggest_credit: credit_amount is REQUIRED and must be a number (can be 0.0 if no credit warranted), confidence is REQUIRED, priority must be null
    - For escalate: priority is REQUIRED, credit_amount and confidence must be null
    - complaint_category: Choose EXACTLY ONE category (the primary one)
    - Rationale must cite specific evidence (delivery times, percentiles, item verification, order total)
    - Rationale should be detailed but under 150 words
    - Round credit amounts to nearest $0.50
    - Confidence: high (strong data), medium (reasonable inference), low (weak/contradictory)
    """
    
    # Single input: the raw complaint text.
    complaint: str = dspy.InputField(desc="Customer complaint text")
    # Every output is declared as str; ComplaintTriageModule.forward converts
    # credit_amount to float and ComplaintResponse validates the rest.
    order_id: str = dspy.OutputField(desc="Extracted order ID")
    complaint_category: str = dspy.OutputField(desc="EXACTLY ONE category: delivery_delay, missing_items, food_quality, service_issue, billing, or other")
    decision: str = dspy.OutputField(desc="EXACTLY ONE: suggest_credit or escalate")
    credit_amount: str = dspy.OutputField(desc="If suggest_credit: MUST be a number (e.g., 0.0, 10.5). If escalate: null")
    confidence: str = dspy.OutputField(desc="If suggest_credit: EXACTLY ONE of high, medium, low. If escalate: null")
    priority: str = dspy.OutputField(desc="If escalate: EXACTLY ONE of standard or urgent. If suggest_credit: null")
    rationale: str = dspy.OutputField(desc="Data-focused justification citing specific evidence")


# Unity Catalog tool wrappers
def get_order_overview(order_id: str) -> str:
    """Get order details including items, location, and customer info."""
    # The docstring above is kept verbatim: it is presumably what the ReAct
    # agent sees as this tool's description.
    function_name = f"{CATALOG}.ai.get_order_overview"
    arguments = {"oid": order_id}
    outcome = uc_client.execute_function(function_name, arguments)
    # Stringify the result value so the LLM receives plain text.
    return str(outcome.value)


def get_order_timing(order_id: str) -> str:
    """Get timing information for a specific order."""
    # The docstring above is kept verbatim: it is presumably what the ReAct
    # agent sees as this tool's description.
    function_name = f"{CATALOG}.ai.get_order_timing"
    arguments = {"oid": order_id}
    outcome = uc_client.execute_function(function_name, arguments)
    # Stringify the result value so the LLM receives plain text.
    return str(outcome.value)


def get_location_timings(location: str) -> str:
    """Get delivery time percentiles for a specific location."""
    # The docstring above is kept verbatim: it is presumably what the ReAct
    # agent sees as this tool's description.
    function_name = f"{CATALOG}.ai.get_location_timings"
    arguments = {"loc": location}
    outcome = uc_client.execute_function(function_name, arguments)
    # Stringify the result value so the LLM receives plain text.
    return str(outcome.value)


class ComplaintTriageModule(dspy.Module):
    """DSPy module for complaint triage with tool calling."""

    def __init__(self):
        super().__init__()
        # ReAct loop over the ComplaintTriage signature with the three UC tools.
        self.react = dspy.ReAct(
            signature=ComplaintTriage,
            tools=[get_order_overview, get_order_timing, get_location_timings],
            max_iters=10
        )

    @staticmethod
    def _parse_credit_amount(raw) -> Optional[float]:
        """Convert the LLM's credit_amount text to a float.

        Returns None when the value is empty, the text "null", or not numeric.
        A leading "$" and surrounding whitespace are tolerated so that a
        harmless formatting slip does not cost a full retry.
        """
        if raw is None:
            return None
        text = str(raw).strip()
        if not text or text.lower() == "null":
            return None
        try:
            return float(text.lstrip("$").strip())
        except ValueError:
            return None

    def forward(self, complaint: str, max_retries: int = 2) -> ComplaintResponse:
        """Process complaint and return structured triage decision with retry on validation failure.

        Args:
            complaint: Raw customer complaint text.
            max_retries: Extra attempts after the first one; negative values
                are treated as 0 so at least one attempt always runs.

        Returns:
            A validated ComplaintResponse.

        Raises:
            ValueError: When the final attempt still violates a business rule
                or fails model validation (pydantic's ValidationError is
                caught by the same handler and re-raised unchanged).
        """
        attempts = max(max_retries, 0) + 1

        for attempt in range(attempts):
            try:
                result = self.react(complaint=complaint)

                credit_amount = self._parse_credit_amount(result.credit_amount)

                # Validate business rules before Pydantic construction.
                # Raise ValueError, not pydantic.ValidationError: the latter
                # cannot be constructed from a plain message and would raise a
                # TypeError that escapes the retry handler below.
                if result.decision == "suggest_credit" and credit_amount is None:
                    raise ValueError("suggest_credit requires credit_amount to be a number (can be 0.0)")

                # Construct Pydantic model - field validators run here
                return ComplaintResponse(
                    order_id=result.order_id,
                    complaint_category=result.complaint_category,
                    decision=result.decision,
                    credit_amount=credit_amount,
                    confidence=result.confidence,
                    priority=result.priority,
                    rationale=result.rationale
                )

            except (ValidationError, ValueError):
                # NOTE(review): a retry only helps if the LM call is not served
                # from a cache with the identical answer — confirm the DSPy
                # cache settings for this LM.
                if attempt == attempts - 1:
                    # Final attempt failed - re-raise
                    raise


class DSPyComplaintAgent(ResponsesAgent):
    """Adapter exposing ComplaintTriageModule through MLflow's ResponsesAgent interface."""

    def __init__(self):
        self.module = ComplaintTriageModule()

    def predict(self, request: ResponsesAgentRequest) -> ResponsesAgentResponse:
        """Triage the first user message of the request.

        Returns a response with one text output item whose text is the JSON
        dump of the ComplaintResponse; custom_inputs are echoed back as
        custom_outputs.

        Raises:
            ValueError: If the request has no user message, or the first user
                message has empty content.
        """
        # Normalise each input item to a dict lazily, stopping at the first user turn.
        as_dicts = (
            item.model_dump() if hasattr(item, "model_dump") else item
            for item in request.input
        )
        complaint = next(
            (entry.get("content", "") for entry in as_dicts if entry.get("role") == "user"),
            None,
        )

        if not complaint:
            raise ValueError("No user message found in request")

        triage = self.module(complaint=complaint)
        text_item = self.create_text_output_item(
            text=triage.model_dump_json(),
            id=str(uuid4())
        )

        return ResponsesAgentResponse(
            output=[text_item],
            custom_outputs=request.custom_inputs
        )


# Initialize agent
# Module-level singleton: built once when agent.py is imported. The notebook's
# evaluation cell imports AGENT directly from this file.
AGENT = DSPyComplaintAgent()
# Tell MLflow which object to serve when this file is logged as a model from code.
mlflow.models.set_model(AGENT)

In [None]:
# Get an actual delivered order_id for the model's input example.
# An empty result yields None instead of an IndexError, so the check in the
# next cell can report the real problem (no delivered orders yet).
_delivered_rows = spark.sql(f"""
    SELECT order_id
    FROM {CATALOG}.lakeflow.all_events
    WHERE event_type='delivered'
    LIMIT 1
""").collect()
sample_order_id = _delivered_rows[0]['order_id'] if _delivered_rows else None

In [None]:
# Fail with an actionable message when no usable sample order is available;
# the input example logged with the model embeds this id.
assert sample_order_id is not None, (
    f"No delivered order_id available in {CATALOG}.lakeflow.all_events; "
    "wait for delivered events before logging the agent."
)
print(sample_order_id)

In [None]:
import mlflow
from agent import LLM_MODEL
from importlib.metadata import version as installed_version
from mlflow.models.resources import DatabricksFunction, DatabricksServingEndpoint

# Resources the served agent must be authorised to call: the LLM endpoint and
# the three UC tool functions.
resources = [DatabricksServingEndpoint(endpoint_name=LLM_MODEL)]
# Add UC function resources
uc_tool_names = [
    f"{CATALOG}.ai.get_order_overview",
    f"{CATALOG}.ai.get_order_timing",
    f"{CATALOG}.ai.get_location_timings",
]
for func_name in uc_tool_names:
    resources.append(DatabricksFunction(function_name=func_name))

# Input example in the ResponsesAgent request shape, built around a real order id.
input_example = {
    "input": [
        {
            "role": "user",
            "content": f"My order was really late! Order ID: {sample_order_id}"
        }
    ]
}

# Create custom conda environment with mlflow explicitly specified.
# The other packages are pinned to whatever is installed in this session;
# importlib.metadata replaces the deprecated pkg_resources API.
# NOTE(review): mlflow and python stay hard-pinned as in the original while the
# install cell uses `pip install -U`; confirm they match the runtime in use.
conda_env = {
    "channels": ["conda-forge"],
    "dependencies": [
        "python=3.11",
        "pip",
        {
            "pip": [
                "mlflow==3.6",
                f"typing_extensions=={installed_version('typing_extensions')}",
                f"dspy-ai=={installed_version('dspy-ai')}",
                f"unitycatalog-openai[databricks]=={installed_version('unitycatalog-openai')}",
                f"pydantic=={installed_version('pydantic')}",
            ]
        }
    ],
    "name": "mlflow-env"
}

# Log agent.py as a model from code; logged_agent_info is reused by the
# registration cell further down.
with mlflow.start_run():
    logged_agent_info = mlflow.pyfunc.log_model(
        name="complaint_agent",
        python_model="agent.py",
        input_example=input_example,
        resources=resources,
        conda_env=conda_env,
    )

# Make the freshly logged model the active one for subsequent MLflow calls.
mlflow.set_active_model(model_id = logged_agent_info.model_id)

#### Evaluate the Agent

- Synthesize diverse complaint scenarios from recent orders covering delivery delays, food quality issues, missing items, service complaints, and edge cases.
- Configure MLflow `Guidelines` scorers to evaluate evidence-based reasoning, credit amount reasonableness, and decision metadata consistency.
- Run batch evaluations with rate limiting (2s delay per request) to quantify decision quality before promotion.

In [None]:
# Comprehensive complaint scenarios for evaluation
import random

# Fixed seed so repeated runs over the same order ids draw the same sample,
# which keeps evaluation results comparable between runs.
EVAL_RANDOM_SEED = 42
# Upper bound on the number of scenarios sent to the agent.
EVAL_SAMPLE_SIZE = 30

# Get sample order IDs for different scenarios
all_order_ids = [
    row['order_id'] for row in spark.sql(f"""
        SELECT DISTINCT order_id
        FROM {CATALOG}.lakeflow.all_events
        WHERE event_type='delivered'
        LIMIT 50
    """).collect()
]

# Each group is (summary label, slice of all_order_ids or None, templates).
# Templates are expanded once per order id in the slice; a None slice means
# the texts are used verbatim (they deliberately carry no real order id).
scenario_groups = [
    # 1. Delivery delay complaints - mix of legitimate and questionable
    ("Delivery delays (legitimate & questionable)", (0, 8), [
        "My order took forever to arrive! Order ID: {oid}",
        "Been waiting 2 hours, this is ridiculous. Order {oid}",
        "Order {oid} arrived late and cold",
        "Delivery was slower than usual for order {oid}",
    ]),
    # 2. Food quality issues - range from specific to vague
    ("Food quality issues (specific & vague)", (8, 12), [
        "My falafel was completely soggy and inedible. Order: {oid}",
        "The food was cold when it arrived, very disappointing. Order: {oid}",
        "Everything tasted bad. Order {oid}",
        "Not happy with the quality. {oid}",
        "The gyro meat was overcooked and dry, very disappointing. Order: {oid}",
    ]),
    # 3. Missing items - some verifiable, some suspicious
    ("Missing items (verifiable & suspicious)", (12, 16), [
        "My entire falafel bowl was missing from the order! Order: {oid}",
        "No drinks or sides in my order {oid}",
        "Missing my gyro from order {oid}",
        "You forgot half my items. {oid}",
    ]),
    # 4. Items claimed that might not match the order
    ("Claimed items that may not match the order", (16, 18), [
        "Where are my chicken wings?! Order {oid}",
        "Missing my pizza from order {oid}",
    ]),
    # 5. Service issues - should escalate
    ("Service complaints", (18, 20), [
        "Your driver was extremely rude to me. Order: {oid}",
        "Driver left my food at wrong address. Order: {oid}",
        "The delivery person refused to come to my door. {oid}",
    ]),
    # 6. Multiple issues in one complaint
    ("Multiple issues in one complaint", (20, 22), [
        "Order {oid} was late AND missing items AND cold!",
        "Late delivery, rude driver, and food quality was poor. Order: {oid}",
    ]),
    # 7. Escalation triggers - legal threats, health concerns
    ("Escalation triggers (health/legal)", (22, 24), [
        "I'm calling my lawyer about this terrible service! Order: {oid}",
        "This food made me sick, possible food poisoning. Order: {oid}",
        "Found a piece of plastic in my food! Order {oid} - this is dangerous!",
    ]),
    # 8. Vague complaints without specifics
    ("Vague complaints without specifics", (24, 26), [
        "Not satisfied with order {oid}",
        "Bad experience. {oid}",
        "Order {oid} was wrong",
    ]),
    # 9. Billing/promo issues
    ("Billing/promo issues", (26, 28), [
        "My promo code didn't work on order {oid}",
        "I was charged twice for order {oid}!",
    ]),
    # 10. Edge cases - no order ID or invalid format
    ("Edge cases (no order ID, invalid ID)", None, [
        "My order was really late and the food was cold!",  # Missing order ID
        "terrible service, do better",  # No order ID, vague
        "Order ABC123 never arrived",  # Invalid order ID format
    ]),
    # 11. Non-complaints / comments
    ("Non-complaints (positive feedback, questions)", (28, 30), [
        "Great food, loved it! Order {oid}",  # Positive comment
        "Just wanted to say the driver was very polite today. {oid}",  # Positive feedback
        "How do I reorder my previous order {oid}?",  # Question, not complaint
    ]),
    # 12. Unfounded complaints (on-time delivery but claiming delay)
    ("Possibly unfounded delay claims", (30, 32), [
        "This took way too long! Order {oid}",
    ]),
]

# Slices past the end of all_order_ids are silently empty, so say so when the
# data cannot populate every category.
required_order_ids = max(bounds[1] for _, bounds, _ in scenario_groups if bounds is not None)
if len(all_order_ids) < required_order_ids:
    print(
        f"⚠️ Only {len(all_order_ids)} delivered orders found; {required_order_ids} are needed "
        "to populate every scenario category, so some categories will be thin or empty."
    )

# Create diverse, realistic complaint scenarios
complaint_scenarios = []
for _label, bounds, templates in scenario_groups:
    if bounds is None:
        complaint_scenarios.extend(templates)
        continue
    first, last = bounds
    for oid in all_order_ids[first:last]:
        complaint_scenarios.extend(template.format(oid=oid) for template in templates)

# Sample for reasonable eval size (aim for ~25-30 scenarios)
complaint_scenarios = random.Random(EVAL_RANDOM_SEED).sample(
    complaint_scenarios, min(EVAL_SAMPLE_SIZE, len(complaint_scenarios))
)

# Wrap in correct input schema for ResponsesAgent
data = [
    {"inputs": {"input": [{"role": "user", "content": complaint}]}}
    for complaint in complaint_scenarios
]

print(f"Created {len(data)} evaluation scenarios sampled from these categories:")
for label, _bounds, _templates in scenario_groups:
    print(f"  - {label}")

In [None]:
# Create three focused scorers and run evaluation

from mlflow.genai.scorers import Guidelines
import mlflow
import sys
import os
import time
# Make the generated agent.py importable: add the current working directory
# and the notebook's own directory to the module search path.
sys.path.append(os.getcwd())

# NOTE(review): notebookPath() looks like a workspace path rather than a
# filesystem path — confirm the second sys.path entry actually resolves.
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
project_directory = os.path.dirname(notebook_path)
sys.path.append(project_directory)

# Importing agent runs its module-level setup and yields the shared AGENT instance.
from agent import AGENT

# The guideline texts are kept in named lists so each scorer definition stays
# a one-liner; the texts themselves are what the LLM judge evaluates against.

# Scorer 1: Evidence-Based Reasoning - does the rationale support the decision?
_evidence_guidelines = [
    "The rationale should provide specific evidence or reasoning for the decision made",
    "For timing complaints, rationale should mention delivery times or timing data if available",
    "For missing item complaints, rationale should reference the items in question",
    "For escalations, rationale should explain why human judgment is needed",
]
evidence_reasoning = Guidelines(name="evidence_reasoning", guidelines=_evidence_guidelines)

# Scorer 2: Credit Amount Reasonableness - is the refund amount appropriate?
_credit_guidelines = [
    "If decision='escalate', automatically pass this check",
    "If decision='suggest_credit' with credit_amount > $0, amount should be reasonable ($5-$50)",
    "A credit_amount of $0 is valid when rationale indicates no issue",
]
credit_reasonableness = Guidelines(name="credit_reasonableness", guidelines=_credit_guidelines)

# Scorer 3: Decision Metadata Consistency - are confidence and priority used correctly?
_metadata_guidelines = [
    "If decision='suggest_credit': confidence should be present and priority should be null",
    "If decision='escalate': priority should be present and confidence should be null",
]
decision_metadata = Guidelines(name="decision_metadata", guidelines=_metadata_guidelines)

# ResponsesAgent predict wrapper with rate limiting
def predict_fn(input):
    """Run one evaluation row through AGENT and return the response text.

    The parameter is named ``input`` (shadowing the builtin) because it has to
    match the key used inside each row's "inputs" dict.
    """
    from mlflow.types.responses import ResponsesAgentRequest

    # Fixed 2-second pause before every call to stay under API rate limits.
    time.sleep(2)

    response = AGENT.predict(ResponsesAgentRequest(input=input))
    final_item = response.output[-1]

    # Prefer the text of the first content part; otherwise fall back to the
    # string form of the whole output item.
    parts = getattr(final_item, 'content', None)
    if parts:
        return parts[0]["text"]
    return str(final_item)

# Run evaluation
# Each row of `data` is passed to predict_fn and the returned text is judged by
# the three Guidelines scorers; `results` holds the evaluation outcome.
results = mlflow.genai.evaluate(
    data=data,
    scorers=[evidence_reasoning, credit_reasonableness, decision_metadata],
    predict_fn=predict_fn
)

print(f"✅ Evaluation complete")

#### Log the Agent to Unity Catalog

- Point MLflow at the Unity Catalog registry and name the artifact `${CATALOG}.ai.complaint_agent`.
- Register the logged model so that each deployment references an explicit Unity Catalog model version.

In [None]:
# Use Unity Catalog as the model registry for the registration below.
mlflow.set_registry_uri("databricks-uc")

# Three-level UC name: <catalog>.ai.complaint_agent
UC_MODEL_NAME = f"{CATALOG}.ai.complaint_agent"

# register the model to UC
# The returned info carries the new version number, which the deployment cell
# passes to agents.deploy.
uc_registered_model_info = mlflow.register_model(
    model_uri=logged_agent_info.model_uri, name=UC_MODEL_NAME
)

#### Deploy the Agent to Model Serving

- Create a production MLflow experiment for live trace capture.
- Use `agents.deploy` to create/update the Databricks Model Serving endpoint backed by the UC model version.
- Wait until the serving endpoint reports READY before continuing to downstream steps.
- Pass the prod experiment ID via environment variables so inference traces are logged automatically.

In [None]:
import sys

import mlflow

# Production inference traces go to their own per-catalog experiment under
# /Shared so jobs and other users can see it.
prod_experiment_name = "/Shared/{}_complaint_agent_prod".format(CATALOG)

# set_experiment both creates (when missing) and activates the experiment.
prod_experiment = mlflow.set_experiment(prod_experiment_name)
prod_experiment_id = prod_experiment.experiment_id
print("✅ Using prod experiment: {} (ID: {})".format(prod_experiment_name, prod_experiment_id))

# Record the experiment in the UC state helper so it can be cleaned up later.
sys.path.append('../utils')
from uc_state import add

experiment_data = dict(experiment_id=prod_experiment_id, name=prod_experiment_name)
add(CATALOG, "experiments", experiment_data)
print("✅ Added prod experiment to UC state")

In [None]:
from datetime import timedelta

from databricks import agents
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import EndpointStateReady

endpoint_name = dbutils.widgets.get("COMPLAINT_AGENT_ENDPOINT_NAME")

# Deploy the registered UC model version. The prod experiment id is passed via
# an environment variable so the endpoint logs its traces there.
deploy_options = {
    "model_name": UC_MODEL_NAME,
    "model_version": uc_registered_model_info.version,
    "scale_to_zero": False,
    "endpoint_name": endpoint_name,
    "environment_vars": {"MLFLOW_EXPERIMENT_ID": str(prod_experiment_id)},
}
deployment_info = agents.deploy(**deploy_options)

# Block (up to 30 minutes) until the endpoint is no longer updating.
workspace = WorkspaceClient()
ready_endpoint = workspace.serving_endpoints.wait_get_serving_endpoint_not_updating(
    name=endpoint_name,
    timeout=timedelta(minutes=30),
)

# "Not updating" is not the same as "ready": verify the final state explicitly.
if ready_endpoint.state.ready == EndpointStateReady.READY:
    print(f"✅ Endpoint {endpoint_name} is READY")
else:
    raise RuntimeError(
        f"Endpoint {endpoint_name} is {ready_endpoint.state.ready} after deployment; retry or investigate."
    )


In [None]:
# Show what agents.deploy returned, for the run log.
print(deployment_info)

#### Record Model in State

- Store the deployment metadata with `uc_state.add` to facilitate cleanup in the future.

In [None]:
# Also add to UC-state
# Records the deployment under the "endpoints" key so cleanup can find it.
import sys
sys.path.append('../utils')
from uc_state import add

# NOTE(review): this re-reads the CATALOG widget instead of using the CATALOG
# variable the other cells use — presumably the same value; confirm.
add(dbutils.widgets.get("CATALOG"), "endpoints", deployment_info)

#### Production Monitoring

- Register MLflow guideline scorers to monitor decision quality and refund reasoning on production traffic.
- Enable 10% sampling to flag decision drift or policy regressions without impacting performance.

In [None]:
from mlflow.genai.scorers import Guidelines, ScorerSamplingConfig

# Fraction of production traces each monitor scores.
MONITOR_SAMPLE_RATE = 0.1

# Register scorers for production monitoring (10% sampling).
# The guidelines use the agent's actual decision vocabulary
# ('suggest_credit' / 'escalate') and follow the triage policy in agent.py.
# The earlier wording referred to 'investigate' and 'auto_credit', which the
# agent never emits, so sampled traces could not satisfy it.
decision_quality_monitor = Guidelines(
    name="decision_quality_prod",
    guidelines=[
        "The decision must be exactly one of 'suggest_credit' or 'escalate'",
        "Vague food quality complaints that name no specific issue should be classified as 'escalate', not 'suggest_credit'",
        "Missing item complaints that receive 'suggest_credit' should reference the items found in the order",
        "Legal threats or serious health concerns should be classified as 'escalate' with priority 'urgent'"
    ]
).register(name=f"{UC_MODEL_NAME}_decision_quality")

refund_reason_monitor = Guidelines(
    name="refund_reason_prod",
    guidelines=[
        "If a refund is offered, it must clearly relate to the complaint made by the user"
    ]
).register(name=f"{UC_MODEL_NAME}_refund_reason")

# Start monitoring with 10% sampling of production traffic.
# start() returns the running scorer, which replaces the registered handle.
decision_quality_monitor = decision_quality_monitor.start(
    sampling_config=ScorerSamplingConfig(sample_rate=MONITOR_SAMPLE_RATE)
)

refund_reason_monitor = refund_reason_monitor.start(
    sampling_config=ScorerSamplingConfig(sample_rate=MONITOR_SAMPLE_RATE)
)

print("✅ Production monitoring enabled with 10% sampling")
print(f"   - decision_quality scorer monitoring: {decision_quality_monitor}")
print(f"   - refund_reason scorer monitoring: {refund_reason_monitor}")