In [1]:
import mlflow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("fifth")

<Experiment: artifact_location='/mlflow/artifacts/839206406861141218', creation_time=1747616625994, experiment_id='839206406861141218', last_update_time=1747616625994, lifecycle_stage='active', name='fifth', tags={}>

In [2]:
from agentFramework import Agent
from agentFramework.models import Vertex, Deepseek
from agentFramework.tracker import LoggingTracker

In [3]:
definitions = open("llm-judge/definitions.txt", "r").read()
examples = open("llm-judge/examples.txt", "r").read()

In [4]:
prompt = (
    "Below I will provide a multiagent system trace. provide me an analysis of the failure modes and inefficiencies as I will say below. \n"
    "In the traces, analyze the system behaviour."
    "There are several failure modes in multiagent systems I identified. I will provide them below. Tell me if you encounter any of them, as a binary yes or no. \n"
    "Also, give me a one sentence (be brief) summary of the problems with the inefficiencies or failure modes in the trace. Only mark a failure mode if you can provide an example of it in the trace, and specify that in your summary at the end"
    "Also tell me whether the task is successfully completed or not, as a binary yes or no."
    "At the very end, I provide you with the definitions of the failure modes and inefficiencies. After the definitions, I will provide you with examples of the failure modes and inefficiencies for you to understand them better."
    "Tell me if you encounter any of them between the @@ symbols as I will say below, as a binary yes or no."
    "Here are the things you should answer. Start after the @@ sign and end before the next @@ sign (do not include the @@ symbols in your answer):"
    "*** begin of things you should answer *** @@"
    "A. Freeform text summary of the problems with the inefficiencies or failure modes in the trace: <summary>"
    "B. Whether the task is successfully completed or not: <yes or no>"
    "C. Whether you encounter any of the failure modes or inefficiencies:"
    "1.1 Disobey Task Specification: <yes or no>"
    "1.2 Disobey Role Specification: <yes or no>"
    "1.3 Step Repetition: <yes or no>"
    "1.4 Loss of Conversation History: <yes or no>"
    "1.5 Unaware of Termination Conditions: <yes or no>"
    "2.1 Conversation Reset: <yes or no>"
    "2.2 Fail to Ask for Clarification: <yes or no>"
    "2.3 Task Derailment: <yes or no>"
    "2.4 Information Withholding: <yes or no>"
    "2.5 Ignored Other Agent's Input: <yes or no>"
    "2.6 Action-Reasoning Mismatch: <yes or no>"
    "3.1 Premature Termination: <yes or no>"
    "3.2 No or Incorrect Verification: <yes or no>"
    "3.3 Weak Verification: <yes or no>"
    "@@*** end of your answer ***"
    "An example answer is: \n"
    "A. The task is not completed due to disobeying role specification as agents went rogue and started to chat with each other instead of completing the task. Agents derailed and verifier is not strong enough to detect it.\n"
    "B. no \n"
    "C. \n"
    "1.1 no \n"
    "1.2 no \n"
    "1.3 no \n"
    "1.4 no \n"
    "1.5 no \n"
    "1.6 yes \n"
    "2.1 no \n"
    "2.2 no \n"
    "2.3 yes \n"
    "2.4 no \n"
    "2.5 no \n"
    "2.6 yes \n"
    "2.7 no \n"
    "3.1 no \n"
    "3.2 yes \n"
    "3.3 no \n"   
    "Here are the explanations (definitions) of the failure modes and inefficiencies: \n"
    f"{definitions} \n"
    "Here are some examples of the failure modes and inefficiencies: \n"
    f"{examples}"
)


In [5]:
agent = Agent(
    #llm=Vertex(model="gemini-2.5-pro-preview-05-06", location="us-central1", tracker=LoggingTracker()), 
    llm=Vertex(tracker=LoggingTracker(), model="gemini-2.5-flash-preview-04-17", location="us-central1"), 
    agent_name='Judge',
    system_prompt=prompt
    )

In [6]:
from pydantic import BaseModel

class MAST(BaseModel):
    Summary: str
    Task_Completed: bool
    
    Disobey_Task_Specification: bool
    Disobey_Role_Specification: bool
    Step_Repetition: bool
    Loss_of_Conversation_History: bool
    Unaware_of_Termination_Conditions: bool
    Conversation_Reset: bool
    Fail_to_Ask_for_Clarification: bool
    Task_Derailment: bool
    Information_Withholding: bool
    Ignored_Other_Agents_Input: bool
    Action_Reasoning_Mismatch: bool
    Premature_Termination: bool
    No_or_Incorrect_Verification: bool
    Weak_Verification: bool


In [7]:
def judge_trace(trace: str) -> MAST:
    """
    Judge the trace and return the result.
    """
    mast = agent.structuredAnswer(trace, MAST)
    return mast

In [8]:
possible_agents = set(['Manager', 'Process Modeler', 'Knowledge Gatherer', 'Manager_Interviewer', 'Process Consultant'])
for i in range(10):
    possible_agents.add(f'Interview Partner {i}')

In [9]:
runs = mlflow.search_runs()


In [10]:
def judge_mas(run_id: str):
    masts = {}
    trace = mlflow.search_traces(run_id=run_id, filter_string="name='Simulation'")['trace'][0]

    spans = trace.data.spans
    agents = set()

    for i in range(-1, - (len(spans) + 1), -1):
        if spans[i].attributes['mlflow.spanType'] == 'AGENT':
            name = spans[i].name
            if '_' in name and name.split('_')[-1].isdigit():
                name = '_'.join(name.split('_')[:-1])
            if name in possible_agents and name not in agents:
                agents.add(name)
                print(f'Judging {name} trace:')
                judge_result = judge_trace(str(spans[i].attributes['mlflow.chat.messages']))
                print(judge_result.model_dump_json(indent=2))
                masts[name] = judge_result
    return masts


In [11]:
results = {}
for run in runs.iterrows():
    run_id = run[1]['run_id']
    results[run_id] = judge_mas(run_id)
    print(f"Run {run_id} judged.")


Judging Process Modeler trace:
Prompt tokens: 36775, Completion tokens: 138, Total tokens: 39674
{
  "Summary": "The task was completed, but there was step repetition as a successful tool call was unnecessarily repeated. The verification of the generated model's correctness against the process description was weak.",
  "Task_Completed": true,
  "Disobey_Task_Specification": false,
  "Disobey_Role_Specification": false,
  "Step_Repetition": true,
  "Loss_of_Conversation_History": false,
  "Unaware_of_Termination_Conditions": false,
  "Conversation_Reset": false,
  "Fail_to_Ask_for_Clarification": false,
  "Task_Derailment": false,
  "Information_Withholding": false,
  "Ignored_Other_Agents_Input": false,
  "Action_Reasoning_Mismatch": false,
  "Premature_Termination": false,
  "No_or_Incorrect_Verification": false,
  "Weak_Verification": true
}
Judging Interview Partner 3 trace:
Prompt tokens: 31170, Completion tokens: 145, Total tokens: 39374
{
  "Summary": "The main issue is Step Repe

In [15]:
results

{'1659e449d12e465f84227f10d379a18b': {'Process Modeler': MAST(Summary="The task was completed, but there was step repetition as a successful tool call was unnecessarily repeated. The verification of the generated model's correctness against the process description was weak.", Task_Completed=True, Disobey_Task_Specification=False, Disobey_Role_Specification=False, Step_Repetition=True, Loss_of_Conversation_History=False, Unaware_of_Termination_Conditions=False, Conversation_Reset=False, Fail_to_Ask_for_Clarification=False, Task_Derailment=False, Information_Withholding=False, Ignored_Other_Agents_Input=False, Action_Reasoning_Mismatch=False, Premature_Termination=False, No_or_Incorrect_Verification=False, Weak_Verification=True),
  'Interview Partner 3': MAST(Summary='The main issue is Step Repetition, where the Interviewer repeatedly asks about aspects of the process the Process Owner has consistently stated are outside their scope, leading to redundant turns and inefficient informatio