# TriageFlow: Incident Triage Demo

Multi-agent incident triage with Domino GenAI tracing.

## Setup

Save your API key as a Domino user environment variable:
1. **Account Settings** → **User Environment Variables**
2. Add `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`

In [13]:
import os
import io
import sys
import yaml
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
from datetime import datetime

## Select Provider & Vertical

Choose your LLM provider and industry vertical for sample incidents.

In [14]:
# Selector for the LLM backend; "openai" is preselected.
provider_dropdown = widgets.Dropdown(
    description="Provider:",
    options=["openai", "anthropic"],
    value="openai",
)

# (display label, value) pairs for the industry vertical selector.
vertical_choices = [
    ("Financial Services", "financial_services"),
    ("Healthcare", "healthcare"),
    ("Energy", "energy"),
    ("Public Sector", "public_sector"),
]
vertical_dropdown = widgets.Dropdown(
    description="Vertical:",
    options=vertical_choices,
    value="financial_services",
)

display(provider_dropdown, vertical_dropdown)

Dropdown(description='Provider:', options=('openai', 'anthropic'), value='openai')

Dropdown(description='Vertical:', options=(('Financial Services', 'financial_services'), ('Healthcare', 'healt…

## Load Configuration

All prompts, model settings, and agent parameters are centralized in `config.yaml`.

In [15]:
# Read the shared configuration. The encoding is pinned to UTF-8 so the file
# decodes the same way regardless of the platform's locale default.
with open("config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Look up the model configured for the provider currently selected in the dropdown.
provider = provider_dropdown.value
model = config["models"][provider]
print(f"Provider: {provider}\nModel: {model}")

Provider: openai
Model: gpt-4o-mini


## Initialize Client & Auto-Tracing

MLflow's `autolog()` automatically captures all LLM calls without additional instrumentation.

In [16]:
import mlflow

# Turn off MLflow's inline trace rendering in notebook cells.
mlflow.tracing.disable_notebook_display()

# Create the SDK client for the selected provider and enable the matching
# MLflow autolog integration. Any value other than "openai" takes the
# Anthropic path.
if provider != "openai":
    from anthropic import Anthropic
    client = Anthropic()
    mlflow.anthropic.autolog()
else:
    from openai import OpenAI
    client = OpenAI()
    mlflow.openai.autolog()

print(f"Auto-tracing enabled for {provider}")

Auto-tracing enabled for openai


## Import Domino Tracing

- `add_tracing`: Decorator for capturing inputs, outputs, and evaluation metrics
- `DominoRun`: Context manager for aggregating metrics across multiple traces

In [17]:
from domino.agents.tracing import add_tracing
from domino.agents.logging import DominoRun

## Load Models, Agents, and Judges

In [18]:
from src.models import Incident, IncidentSource
from src.agents import classify_incident, assess_impact, match_resources, draft_response
from src.judges import judge_classification, judge_response, judge_triage

In [19]:
def pipeline_evaluator(span) -> dict:
    """Read the pipeline's pre-computed metrics off a span's outputs.

    Args:
        span: Object exposing an ``outputs`` attribute.

    Returns:
        A dict with one entry per metric, where any metric missing from the
        outputs is replaced by its fallback value. An empty dict is returned
        when the outputs are truthy but offer no ``get`` method.
    """
    # Fallback used for each metric that the outputs do not carry.
    fallbacks = {
        "classification_confidence": 0.5,
        "impact_score": 5.0,
        "resource_match_score": 0.5,
        "completeness_score": 0.5,
        "classification_judge_score": 3,
        "response_judge_score": 3,
        "triage_judge_score": 3,
    }

    payload = span.outputs or {}
    if hasattr(payload, "get"):
        # The scores themselves are computed inside triage_incident; this only copies them.
        return {metric: payload.get(metric, fallback) for metric, fallback in fallbacks.items()}
    return {}

## Define Traced Pipeline

The `@add_tracing` decorator creates a single trace tree per incident. Each agent runs as a nested span with:
- Function inputs and outputs
- LLM calls captured via autolog (showing span types like `ChatCompletion`)
- Evaluation metrics attached to the trace

In [20]:
@add_tracing(name="triage_incident", autolog_frameworks=[provider], evaluator=pipeline_evaluator)
def triage_incident(incident: Incident):
    """Triage one incident: four agents in sequence, then three LLM judges.

    The judges are called inside this function, and their scores are returned
    next to the agent outputs under the same keys that ``pipeline_evaluator``
    looks up.

    Args:
        incident: The incident to triage.

    Returns:
        A dict holding the four agent results (``classification``, ``impact``,
        ``resources``, ``response``) followed by seven metric entries.
    """
    # Agent stages; each later stage is handed the results of the earlier ones.
    classification = classify_incident(client, provider, model, incident, config)
    impact = assess_impact(client, provider, model, incident, classification, config)
    resources = match_resources(client, provider, model, classification, impact, config)
    response = draft_response(client, provider, model, incident, classification, impact, resources, config)

    # The judges and the metric lookups below work on plain dicts.
    classification_data = classification.model_dump()
    impact_data = impact.model_dump()
    resources_data = resources.model_dump()
    response_data = response.model_dump()
    primary = resources_data.get("primary_responder", {})

    description = incident.description

    classification_verdict = judge_classification(client, provider, model, description, classification_data)

    # judge_response yields a list of evaluations; collapse it to the mean
    # score, falling back to 3 when the list is empty.
    response_evaluations = judge_response(client, provider, model, description, response_data)
    if response_evaluations:
        response_score = sum(e.get("score", 3) for e in response_evaluations) / len(response_evaluations)
    else:
        response_score = 3

    # judge_triage receives all four stage outputs bundled into one dict.
    triage_verdict = judge_triage(
        client,
        provider,
        model,
        description,
        {
            "classification": classification_data,
            "impact": impact_data,
            "assignment": resources_data,
            "response": response_data,
        },
    )

    # The primary responder may not be a dict; use the fallback in that case.
    if isinstance(primary, dict):
        match_score = primary.get("match_score", 0.5)
    else:
        match_score = 0.5

    agent_results = {
        "classification": classification,
        "impact": impact,
        "resources": resources,
        "response": response,
    }
    # Metrics for evaluator
    metrics = {
        "classification_confidence": classification_data.get("confidence", 0.5),
        "impact_score": impact_data.get("impact_score", 5.0),
        "resource_match_score": match_score,
        "completeness_score": response_data.get("completeness_score", 0.5),
        "classification_judge_score": classification_verdict.get("score", 3),
        "response_judge_score": response_score,
        "triage_judge_score": triage_verdict.get("score", 3),
    }
    return {**agent_results, **metrics}

## Load Sample Incidents

Example incidents will be loaded from the vertical selected above.

In [21]:
# The selected dropdown value is also the CSV file stem under example-data/.
vertical = vertical_dropdown.value
incidents_csv = f"example-data/{vertical}.csv"
df = pd.read_csv(incidents_csv)
print(f"Loaded {len(df)} incidents from {vertical}")
df

Loaded 10 incidents from financial_services


Unnamed: 0,ticket_id,description,source,reporter,affected_system,initial_severity
0,FIN-2024-001,Trading platform experiencing intermittent ord...,monitoring,,Trading Platform,5
1,FIN-2024-002,Suspicious login attempts detected on wire tra...,automated_scan,,Wire Transfer System,5
2,FIN-2024-003,End-of-day reconciliation report shows $2.3M d...,monitoring,,Settlement System,4
3,FIN-2024-004,Customer mobile app crash rate spiked to 12% a...,user_report,App Support Team,Mobile Banking App,3
4,FIN-2024-005,SWIFT message queue backlog growing. Currently...,monitoring,,SWIFT Gateway,4
5,FIN-2024-006,Fraud detection system generating excessive fa...,user_report,Fraud Ops Team,Fraud Detection Engine,3
6,FIN-2024-007,Market data feed from Bloomberg showing stale ...,monitoring,,Market Data Feed,4
7,FIN-2024-008,ATM network in midwest region offline. 127 ATM...,user_report,ATM Operations,ATM Network,4
8,FIN-2024-009,Regulatory report CCAR-2024-Q4 generation fail...,monitoring,,Risk Analytics Platform,3
9,FIN-2024-010,Customer PII potentially exposed via misconfig...,automated_scan,,Customer API Gateway,5


In [22]:
def row_to_incident(row) -> Incident:
    """Build an ``Incident`` from one row of the incidents table.

    ``ticket_id``, ``description`` and ``source`` are taken as-is (the source
    is wrapped in ``IncidentSource``). ``reporter``, ``affected_system`` and
    ``initial_severity`` become ``None`` when ``pd.notna`` reports the cell as
    missing; a present severity is converted with ``int``.
    """

    def optional(column, convert=None):
        # Missing cells are passed to the model as None rather than NaN.
        cell = row[column]
        if not pd.notna(cell):
            return None
        return cell if convert is None else convert(cell)

    return Incident(
        ticket_id=row["ticket_id"],
        description=row["description"],
        source=IncidentSource(row["source"]),
        reporter=optional("reporter"),
        affected_system=optional("affected_system"),
        initial_severity=optional("initial_severity", int),
    )


# One Incident per row of the loaded table, in table order.
incidents = [row_to_incident(record) for _index, record in df.iterrows()]
print(f"Loaded {len(incidents)} incidents")

Loaded 10 incidents


## Run Triage Pipeline

`DominoRun` aggregates metrics across all traces in the batch via `custom_summary_metrics`. Supported aggregations: `mean`, `median`, `stdev`, `min`, `max`.

In [23]:
# Experiment and run naming
username = os.environ.get("DOMINO_USER_NAME", os.environ.get("USER", "demo_user"))
project_name = os.environ.get("DOMINO_PROJECT_NAME", "default")
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

experiment_name = f"tracing-{project_name}-{username}"
run_name = f"{vertical}-{username}-{timestamp}"

aggregated_metrics = [
    # Base metrics
    ("classification_confidence", "mean"),
    ("impact_score", "median"),
    ("resource_match_score", "mean"),
    ("completeness_score", "mean"),
    # Judge scores
    ("classification_judge_score", "mean"),
    ("response_judge_score", "mean"),
    ("triage_judge_score", "mean"),
]

print(f"Experiment: {experiment_name}")
print(f"Run: {run_name}")

Experiment: tracing-TriageFlow-GenAI-Tracing-andrea_lowe
Run: financial_services-andrea_lowe-20260222-005606


In [None]:
# Set MLflow experiment
mlflow.set_experiment(experiment_name)

results = []
run_id = None

# Capture the real stdout before anything is redirected, so the finally clause
# can restore it even if the run (or its exit handler) raises.
_stdout = sys.stdout
try:
    with DominoRun(agent_config_path="config.yaml", custom_summary_metrics=aggregated_metrics) as run:
        # Set run name via MLflow
        mlflow.set_tag("mlflow.runName", run_name)
        # Kept outside the loop so the run id is available to later cells.
        run_id = run.info.run_id

        for incident in incidents:
            print(f"Processing {incident.ticket_id}...")

            result = triage_incident(incident)

            # Store the ticket id next to everything the pipeline returned.
            results.append({
                "ticket_id": incident.ticket_id,
                **result
            })
            print(f"  → {result['classification'].category.value} | Urgency: {result['classification'].urgency} | Impact: {result['impact'].impact_score}")

        # Suppress DominoRun exit messages: redirect stdout as the last step
        # inside the block so that whatever is printed while the context
        # manager exits goes into a throwaway buffer.
        sys.stdout = io.StringIO()
finally:
    # Always restore stdout; otherwise a failure on exit would leave every
    # later print in the session silently discarded.
    sys.stdout = _stdout

print(f"\nProcessed {len(results)} incidents")

Processing FIN-2024-001...
  → performance | Urgency: 4 | Impact: 5.5
Processing FIN-2024-002...
  → security | Urgency: 5 | Impact: 7.5
Processing FIN-2024-003...


## Results Summary

In [None]:
def _summary_row(entry):
    """Flatten one triage result into the columns of the summary table."""
    return {
        "Ticket": entry["ticket_id"],
        "Category": entry["classification"].category.value,
        "Urgency": entry["classification"].urgency,
        "Impact": entry["impact"].impact_score,
        "Responder": entry["resources"].primary_responder.name,
        "SLA Met": entry["resources"].sla_met,
    }


# One table row per processed incident, in processing order.
summary = pd.DataFrame([_summary_row(entry) for entry in results])
summary

## Sample Communication

Each incident generates tailored communications for technical teams, management, and affected users.

In [None]:
# Show the communications drafted for the first processed incident.
sample = results[0]
print(f"Ticket: {sample['ticket_id']}\n")
for comm in sample["response"].communications:
    # Audience banner, subject line, then the first 300 characters of the body.
    print(
        f"--- {comm.audience.upper()} ---",
        f"Subject: {comm.subject}",
        f"{comm.body[:300]}...\n",
        sep="\n",
    )

## Ad Hoc Evaluations

Evaluations can also be added after the traces have been generated: use `search_traces()` to retrieve the traces from the run, then `log_evaluation()` to attach scores to specific traces.

In [None]:
#from domino.aisystems.tracing import search_traces, log_evaluation
from domino.agents.tracing import search_traces
from domino.agents.logging import log_evaluation

# The three judge scores that are averaged into one quality number per trace.
judge_score_keys = (
    "classification_judge_score",
    "response_judge_score",
    "triage_judge_score",
)

# Fetch the traces recorded under the run id captured during the pipeline run.
traces = search_traces(run_id=run_id)

# NOTE(review): traces are paired with results purely by position, which
# assumes search_traces returns one trace per incident in processing order —
# confirm against the SDK before relying on these scores.
for i, trace in enumerate(traces.data):
    result = results[i]

    # Mean of the three judge scores.
    combined_quality = sum(result[key] for key in judge_score_keys) / len(judge_score_keys)

    # Manual review is flagged only when urgency >= 4 AND impact score >= 7.
    needs_review = result["classification"].urgency >= 4 and result["impact"].impact_score >= 7

    for name, value in (
        ("combined_quality_score", round(combined_quality, 2)),
        ("needs_manual_review", 1.0 if needs_review else 0.0),
    ):
        log_evaluation(trace_id=trace.id, name=name, value=value)

print(f"Added evaluations to {len(traces.data)} traces")

## Next Steps

Open **Domino Experiment Manager** to view:
- Execution flow across all 4 agents
- Inline evaluation metrics per trace
- Aggregated statistics across the batch