# Failure Analysis

In [8]:
import json
import pandas as pd
from pathlib import Path

def attrib_failure(sim) -> str:
    """Classify why a simulation failed, based on its ``reward_info``.

    Categories:
      - ``""``: ``db_check.db_match`` is truthy (the run is not a failure).
      - ``"tool_misuse"``: at least one action check has ``action_match`` equal to False.
      - ``"reasoning_error"``: no action check is False, at least one is True,
        and ``db_match`` is falsy.
      - ``"coordination_failure"``: everything else (no action check is
        explicitly True or False).

    Args:
        sim: A simulation record (dict). Missing or null ``reward_info``,
            ``db_check`` and ``action_checks`` entries are treated as empty.

    Returns:
        One of the category strings listed above.
    """
    reward_info = sim.get("reward_info", {}) or {}
    db_check = reward_info.get("db_check", {}) or {}

    # A matching final database state means there is nothing to attribute.
    if db_check.get("db_match", False):
        return ""

    # Only dict entries carry an ``action_match`` flag; anything else is skipped.
    outcomes = [
        check.get("action_match")
        for check in (reward_info.get("action_checks", []) or [])
        if isinstance(check, dict)
    ]

    # Identity comparisons on purpose: a missing flag (None) counts as neither.
    if any(outcome is False for outcome in outcomes):
        return "tool_misuse"
    if any(outcome is True for outcome in outcomes):
        return "reasoning_error"
    return "coordination_failure"

def response_quality(sim) -> str:
    """Rate the communication quality of a simulation as "Good" or "Needs work".

    A run is "Good" when either the ``COMMUNICATE`` entry of
    ``reward_info.reward_breakdown`` equals 1.0, or at least one dict entry in
    ``reward_info.communicate_checks`` has a truthy ``met`` value. Otherwise
    it is "Needs work".

    Args:
        sim: A simulation record (dict). Missing or null ``reward_info``,
            ``reward_breakdown`` and ``communicate_checks`` are treated as empty.

    Returns:
        ``"Good"`` or ``"Needs work"``.
    """
    reward_info = sim.get("reward_info", {}) or {}
    breakdown = reward_info.get("reward_breakdown", {}) or {}

    # A full communication reward is sufficient on its own.
    if breakdown.get("COMMUNICATE") == 1.0:
        return "Good"

    # Otherwise a single satisfied communicate check is enough.
    for check in reward_info.get("communicate_checks", []) or []:
        if isinstance(check, dict) and check.get("met"):
            return "Good"

    return "Needs work"


def data_analysis(file_path, domain):
    """Summarize one simulation results file and export per-outcome CSVs.

    Loads the JSON file, builds one row per entry in ``simulations``, prints a
    run summary (totals, success rate, failure breakdown) and writes two CSV
    files next to the input file: ``tau2_<domain>_success_tasks.csv`` and
    ``tau2_<domain>_failed_tasks.csv``.

    A simulation counts as a success when ``reward_info.db_check.db_match`` is
    truthy. An empty (or null) ``simulations`` list yields an all-zero summary
    and header-only CSVs instead of raising.

    Args:
        file_path: Path to the simulation results JSON file.
        domain: Label used in the printed header and in the CSV file names.

    Returns:
        None. Results are printed and written to disk.

    Raises:
        KeyError: If the JSON document has no ``simulations`` key.
        OSError: If the file cannot be read or the CSVs cannot be written.
    """
    # Fixed schema so an empty run still produces a frame with these columns
    # (otherwise sorting by "task_id" raises KeyError).
    columns = [
        "task_id",
        "run_id",
        "success",
        "reward",
        "db_match",
        "failure_attribution",
        "response_quality",
        "termination_reason",
        "agent_cost",
        "duration_sec",
    ]

    json_path = file_path
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # A missing key still raises (wrong file), but a null list is treated as empty.
    sims = data["simulations"] or []

    rows = []
    for s in sims:
        ri = s.get("reward_info", {}) or {}
        db = ri.get("db_check", {}) or {}
        rows.append({
            "task_id": s.get("task_id"),
            "run_id": s.get("id"),
            "success": bool(db.get("db_match", False)),   # whether the final environment state matches
            "reward": ri.get("reward"),
            "db_match": db.get("db_match"),
            "failure_attribution": attrib_failure(s),
            "response_quality": response_quality(s),
            "termination_reason": s.get("termination_reason"),
            "agent_cost": s.get("agent_cost"),
            "duration_sec": s.get("duration"),
        })

    df = pd.DataFrame(rows, columns=columns)
    # Keep "success" a real boolean column even when there are no rows, so the
    # ``~`` masks below stay valid boolean indexers.
    df["success"] = df["success"].astype(bool)
    df = df.sort_values("task_id").reset_index(drop=True)

    total = len(df)
    success_count = int(df["success"].sum())
    fail_count = total - success_count
    task_success_rate = round(success_count / total, 3) if total else 0.0
    failure_breakdown = df.loc[~df["success"], "failure_attribution"].value_counts().to_dict()

    print(f"== Tau2 {domain} Run Summary ==")
    print(f"Total tasks:       {total}")
    print(f"Success count:     {success_count}")
    print(f"Fail count:        {fail_count}")
    print(f"Task Success Rate: {task_success_rate}")
    print(f"Failure breakdown: {failure_breakdown}")
    print()

    # CSVs are written next to the input JSON file.
    out_dir = Path(json_path).parent
    succ_csv = out_dir / f"tau2_{domain}_success_tasks.csv"
    fail_csv = out_dir / f"tau2_{domain}_failed_tasks.csv"

    df_success = df[df["success"]].copy()
    df_fail = df[~df["success"]].copy()

    df_success.to_csv(succ_csv, index=False)
    df_fail.to_csv(fail_csv, index=False)

    print(f"Saved success tasks to: {succ_csv}")
    print(f"Saved failed tasks to:  {fail_csv}")


In [16]:
# Airline domain, k = 1 run: print the run summary and export the success/failure CSVs.
import os

file_name = "2025-10-26T13_25_35.302912_airline_llm_agent_grok-4-fast-reasoning_user_simulator_grok-4-fast-reasoning.json"

# The path is built from the current working directory, so this cell expects the
# notebook to be run from the directory that contains data/simulations.
file_path = os.path.join(os.getcwd(), 'data/simulations', file_name)
# Echo the resolved path and whether it exists before trying to load it.
print(file_path)  
print(os.path.exists(file_path))

data_analysis(file_path, "airline")

/Users/eggji/project/tau2-bench-with-grok/data/simulations/2025-10-26T13_25_35.302912_airline_llm_agent_grok-4-fast-reasoning_user_simulator_grok-4-fast-reasoning.json
True
== Tau2 airline Run Summary ==
Total tasks:       50
Success count:     30
Fail count:        20
Task Success Rate: 0.6
Failure breakdown: {'tool_misuse': 18, 'reasoning_error': 2}

Saved success tasks to: /Users/eggji/project/tau2-bench-with-grok/data/simulations/tau2_airline_success_tasks.csv
Saved failed tasks to:  /Users/eggji/project/tau2-bench-with-grok/data/simulations/tau2_airline_failed_tasks.csv


In [17]:
# Retail domain, k = 1 run: print the run summary and export the success/failure CSVs.
# NOTE: `os` is not imported in this cell; it relies on the `import os` in the
# airline cell having been executed first.
file_name = "2025-10-26T13_17_9.426568_retail_llm_agent_grok-4-fast-reasoning_user_simulator_grok-4-fast-reasoning.json"

# The path is built from the current working directory.
file_path = os.path.join(os.getcwd(), 'data/simulations', file_name)
# Echo the resolved path and whether it exists before trying to load it.
print(file_path)  
print(os.path.exists(file_path))

data_analysis(file_path, "retail")

/Users/eggji/project/tau2-bench-with-grok/data/simulations/2025-10-26T13_17_9.426568_retail_llm_agent_grok-4-fast-reasoning_user_simulator_grok-4-fast-reasoning.json
True
== Tau2 retail Run Summary ==
Total tasks:       114
Success count:     77
Fail count:        37
Task Success Rate: 0.675
Failure breakdown: {'tool_misuse': 34, 'reasoning_error': 3}

Saved success tasks to: /Users/eggji/project/tau2-bench-with-grok/data/simulations/tau2_retail_success_tasks.csv
Saved failed tasks to:  /Users/eggji/project/tau2-bench-with-grok/data/simulations/tau2_retail_failed_tasks.csv


In [26]:
# Telecom domain, k = 1 run: print the run summary and export the success/failure CSVs.
# NOTE: `os` is not imported in this cell; it relies on the `import os` in the
# airline cell having been executed first.
file_name = "2025-10-26T15_21_31.267512_telecom_llm_agent_grok-4-fast-reasoning_user_simulator_grok-4-fast-reasoning.json"

# The path is built from the current working directory.
file_path = os.path.join(os.getcwd(), 'data/simulations', file_name)
# Echo the resolved path and whether it exists before trying to load it.
print(file_path)  
print(os.path.exists(file_path))

data_analysis(file_path, "telecom")

/Users/eggji/project/tau2-bench-with-grok/data/simulations/2025-10-26T15_21_31.267512_telecom_llm_agent_grok-4-fast-reasoning_user_simulator_grok-4-fast-reasoning.json
True
== Tau2 telecom Run Summary ==
Total tasks:       114
Success count:     21
Fail count:        93
Task Success Rate: 0.184
Failure breakdown: {'reasoning_error': 60, 'tool_misuse': 33}

Saved success tasks to: /Users/eggji/project/tau2-bench-with-grok/data/simulations/tau2_telecom_success_tasks.csv
Saved failed tasks to:  /Users/eggji/project/tau2-bench-with-grok/data/simulations/tau2_telecom_failed_tasks.csv


# Failure Examples

In [19]:
import json
from pathlib import Path
import pandas as pd


# ========= Helpers =========
def action_signature(name, args):
    """Build a coarse, comparable signature for an action.

    The signature is ``(name, sorted tuple of argument names)``. Argument
    values are deliberately ignored, so two calls to the same tool with the
    same parameter names compare equal even when the values differ. This
    catches a wrong tool choice without being brittle about exact values.

    Args:
        name: Tool/action name.
        args: Argument mapping of the action. Anything that is not a dict
            produces ``None`` in the second slot.

    Returns:
        A ``(name, keys)`` tuple where ``keys`` is a sorted tuple of argument
        names, or ``None`` when ``args`` is not a dict.
    """
    arg_names = tuple(sorted(args.keys())) if isinstance(args, dict) else None
    return (name, arg_names)

def failure_attribution(sim_reward_info, exec_rows):
    """Heuristically attribute a failed run to one failure category.

    Categories:
      - ``""``: ``db_check.db_match`` is exactly True (not a failure).
      - ``"tool_misuse"``: at least one row has ``action_match`` equal to False.
      - ``"reasoning_error"``: no row is False, at least one is True, and
        ``db_match`` is not True.
      - ``"coordination_failure"``: no row is explicitly True or False and
        ``db_match`` is not True.

    Args:
        sim_reward_info: The simulation's ``reward_info`` dict; None is
            treated as empty. A null ``db_check`` entry is treated as empty
            as well instead of raising AttributeError.
        exec_rows: Iterable of dicts carrying an ``action_match`` key; None is
            treated as no rows.

    Returns:
        One of the category strings listed above.
    """
    reward_info = sim_reward_info or {}
    # ``db_check`` may be present but null in the JSON, so guard with ``or {}``.
    db_check = reward_info.get("db_check", {}) or {}
    if db_check.get("db_match", None) is True:
        return ""

    action_matches = [row.get("action_match") for row in (exec_rows or [])]
    # Identity comparisons on purpose: a None flag (unknown) counts as neither.
    if any(match is False for match in action_matches):
        return "tool_misuse"
    if any(match is True for match in action_matches):
        return "reasoning_error"
    return "coordination_failure"

def main_task(json_path, task_id):
    """Inspect one task of a simulation results file and export the details.

    Looks up the task definition (``tasks[].id``) and the first simulation
    (``simulations[].task_id``) whose id equals ``task_id`` after string
    conversion, then prints and saves four tables next to the JSON file:

      - ground truth: expected actions from ``evaluation_criteria.actions``
      - executed: entries of ``reward_info.action_checks`` with match flags
      - missing: expected actions whose signature (name + argument names, see
        ``action_signature``) does not appear among the action checks
      - status: ``db_match``, reward, failure attribution and missing count

    When the task or the simulation is not found, the matching table shows a
    single "(not found)" placeholder row. Placeholders are display-only and do
    not take part in the missing-action comparison.

    NOTE(review): in the sample output the action checks carry the same
    ``action_id`` values as the expected actions, which suggests they echo the
    expected actions. If so, the signature comparison will rarely flag
    anything. Confirm against the results schema.

    Args:
        json_path: Path to the simulation results JSON file.
        task_id: Task id to inspect (string or int; compared as strings).

    Returns:
        None. Tables are printed and written as CSV files.
    """
    gt_columns = ["source", "action_id", "name", "arguments"]
    exec_columns = ["source", "action_id", "name", "arguments", "action_match", "action_reward"]

    # ========= Load JSON =========
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    data = json.loads(Path(json_path).read_text(encoding="utf-8"))
    wanted_id = str(task_id)

    # Locate ground truth for TASK_ID (first match wins)
    task_spec = next(
        (t for t in (data.get("tasks", []) or []) if str(t.get("id")) == wanted_id),
        None,
    )

    # Locate simulation (model execution) for TASK_ID (first match wins)
    sim_spec = next(
        (s for s in (data.get("simulations", []) or []) if str(s.get("task_id")) == wanted_id),
        None,
    )

    # ========= Build Ground Truth Table =========
    expected_rows = []
    if task_spec:
        for a in (task_spec.get("evaluation_criteria", {}) or {}).get("actions", []) or []:
            expected_rows.append({
                "source": "GROUND_TRUTH",
                "action_id": a.get("action_id"),
                "name": a.get("name"),
                "arguments": a.get("arguments"),
            })
    gt_rows = list(expected_rows)
    if not task_spec:
        # Display-only placeholder; deliberately kept out of expected_rows.
        gt_rows.append({"source": "GROUND_TRUTH", "action_id": None, "name": "(not found)", "arguments": {}})
    # Explicit columns keep the table schema (and CSV header) even with no rows.
    gt_df = pd.DataFrame(gt_rows, columns=gt_columns)

    # ========= Build Executed Actions Table =========
    rinfo = (sim_spec or {}).get("reward_info", {}) or {}
    checked_rows = []
    if sim_spec:
        for ac in rinfo.get("action_checks", []) or []:
            action = ac.get("action", {}) or {}
            checked_rows.append({
                "source": "EXECUTED",
                "action_id": action.get("action_id"),
                "name": action.get("name"),
                "arguments": action.get("arguments"),
                "action_match": ac.get("action_match"),
                "action_reward": ac.get("action_reward"),
            })
    exec_rows = list(checked_rows)
    if not sim_spec:
        # Display-only placeholder; deliberately kept out of checked_rows.
        exec_rows.append({"source": "EXECUTED", "action_id": None, "name": "(not found)", "arguments": {}, "action_match": None, "action_reward": None})
    ex_df = pd.DataFrame(exec_rows, columns=exec_columns)

    # ========= Compute Missing Expected Actions =========
    # Compare real rows only, so a "(not found)" placeholder is never reported
    # as a missing expected action.
    executed_sigs = [action_signature(row["name"], row["arguments"]) for row in checked_rows]

    missing = []
    for row in expected_rows:
        sig = action_signature(row["name"], row["arguments"])
        if sig not in executed_sigs:
            missing.append({
                "expected_but_missing": row["name"],
                "expected_args_keys": list((row["arguments"] or {}).keys())
            })
    missing_df = pd.DataFrame(missing) if missing else pd.DataFrame([{"expected_but_missing": "(none)"}])

    # ========= Status Row =========
    # ``db_check`` may be present but null in the JSON, so guard with ``or {}``.
    db_match = (rinfo.get("db_check", {}) or {}).get("db_match", None)
    reward   = rinfo.get("reward", None)
    status_df = pd.DataFrame([{
        "task_id": str(task_id),
        "run_file": str(json_path),
        "db_match": db_match,
        "reward": reward,
        "failure_attribution": failure_attribution(rinfo, exec_rows),
        "missing_expected_actions_count": len(missing)
    }])

    # ========= Print / Save =========
    print("\n=== Task", task_id, "— Ground Truth (Expected Actions) ===")
    print(gt_df.to_string(index=False))

    print("\n=== Task", task_id, "— Model Execution (Action Checks) ===")
    print(ex_df.to_string(index=False))

    print("\n=== Task", task_id, "— Missing Expected Actions ===")
    print(missing_df.to_string(index=False))

    print("\n=== Task", task_id, "— Run Status ===")
    print(status_df.to_string(index=False))

    # Optional: export CSVs near the JSON
    out_dir = Path(json_path).parent
    gt_df.to_csv(out_dir / f"task_{task_id}_ground_truth.csv", index=False)
    ex_df.to_csv(out_dir / f"task_{task_id}_executed_actions.csv", index=False)
    missing_df.to_csv(out_dir / f"task_{task_id}_missing_expected_actions.csv", index=False)
    status_df.to_csv(out_dir / f"task_{task_id}_status.csv", index=False)

    print(f"\nSaved CSVs to: {out_dir}")


In [21]:
# Retail task 100: example of a run attributed to tool misuse.
# NOTE: `os` is not imported in this cell; it relies on an earlier cell's `import os`.
file_name = "2025-10-26T13_17_9.426568_retail_llm_agent_grok-4-fast-reasoning_user_simulator_grok-4-fast-reasoning.json"
# The path is built from the current working directory.
file_path = os.path.join(os.getcwd(), 'data/simulations', file_name)
task_id = "100"  # change to any task id you want to inspect (string or int ok)
main_task(file_path, task_id)



=== Task 100 — Ground Truth (Expected Actions) ===
      source action_id                         name                                                                                                                                                     arguments
GROUND_TRUTH     101_0   modify_pending_order_items {'order_id': '#W3295833', 'item_ids': ['8926329222', '5312063289'], 'new_item_ids': ['7160999700', '6956751343'], 'payment_method_id': 'credit_card_3261838'}
GROUND_TRUTH     101_1 return_delivered_order_items                                                                  {'order_id': '#W8488728', 'item_ids': ['5676696062'], 'payment_method_id': 'paypal_3650980'}

=== Task 100 — Model Execution (Action Checks) ===
  source action_id                         name                                                                                                                                                     arguments  action_match  action_reward
EXECUTED     101_0   modify_pen

In [25]:
# Retail task 111: example of a run attributed to a reasoning error.
# NOTE: `os` is not imported in this cell; it relies on an earlier cell's `import os`.
file_name = "2025-10-26T13_17_9.426568_retail_llm_agent_grok-4-fast-reasoning_user_simulator_grok-4-fast-reasoning.json"
# The path is built from the current working directory.
file_path = os.path.join(os.getcwd(), 'data/simulations', file_name)
task_id = "111"  # change to any task id you want to inspect (string or int ok)
main_task(file_path, task_id)


=== Task 111 — Ground Truth (Expected Actions) ===
      source action_id                         name                                                                                                                                                 arguments
GROUND_TRUTH     112_0   modify_pending_order_items                           {'order_id': '#W9810810', 'item_ids': ['1355937109'], 'new_item_ids': ['9949163720'], 'payment_method_id': 'gift_card_7252880'}
GROUND_TRUTH     112_1 modify_pending_order_address {'order_id': '#W3730488', 'address1': '555 Highland Drive', 'address2': 'Suite 872', 'city': 'New York', 'country': 'USA', 'state': 'NY', 'zip': '10116'}
GROUND_TRUTH     112_2   modify_pending_order_items                           {'order_id': '#W3730488', 'item_ids': ['2913673670'], 'new_item_ids': ['2216662955'], 'payment_method_id': 'gift_card_7252880'}

=== Task 111 — Model Execution (Action Checks) ===
  source action_id                         name                         

In [23]:
# Airline task 18: example of a run attributed to tool misuse.
# NOTE: `os` is not imported in this cell; it relies on an earlier cell's `import os`.
file_name = "2025-10-26T13_25_35.302912_airline_llm_agent_grok-4-fast-reasoning_user_simulator_grok-4-fast-reasoning.json"
# The path is built from the current working directory.
file_path = os.path.join(os.getcwd(), 'data/simulations', file_name)
task_id = "18"  # change to any task id you want to inspect (string or int ok)
main_task(file_path, task_id)


=== Task 18 — Ground Truth (Expected Actions) ===
      source action_id                       name                                                                                                                                                                                                                                                                                                    arguments
GROUND_TRUTH      18_0 update_reservation_flights                                                                                                     {'reservation_id': 'JG7FMM', 'cabin': 'economy', 'flights': [{'flight_number': 'HAT028', 'date': '2024-05-21'}, {'flight_number': 'HAT277', 'date': '2024-05-21'}], 'payment_id': 'credit_card_2929732'}
GROUND_TRUTH      18_1 update_reservation_flights {'reservation_id': '2FBBAH', 'cabin': 'economy', 'flights': [{'flight_number': 'HAT080', 'date': '2024-05-28'}, {'flight_number': 'HAT076', 'date': '2024-05-28'}, {'flight_number': 'HAT255', 'date'