# Setup

In [None]:
%load_ext autoreload
%autoreload 2

Background Info:
1. Data source: downloaded data from https://chateval.org/dbdc5 and extracted under ../datasets/dialogue_breakdowns/
2. Data format: https://dbd-challenge.github.io/dbdc3/datasets#format-of-the-json-file 
3. Evaluation metrics: https://sites.google.com/site/dialoguebreakdowndetection4/evaluation-metrics?authuser=0

Note: In the DBDC annotations "O means not a breakdown, T possible breakdown, and X breakdown."

In [2]:
import json
import os
import random
from pathlib import Path

from tqdm import tqdm

from chat_checker.breakdown_detection.breakdown_detector import BreakdownIdentifier, GhasselBreakdownIdentifier, OurBreakdownIdentifier
from models.benchmark_dialogues import DBDCDialogue
from models.configs import BreakdownDetectionConfig
from dbdc_eval.reference_evaluator import compute_dbdc_scores as compute_dbdc_scores_reference
from dbdc_eval.breakdown_detection_evaluator import compute_dbdc_scores
from breakdown_dataset_loader import load_dataset, load_tested_dialogues

In [3]:
random.seed(42)

In [4]:
# Specify the challenge, language, and split to evaluate on below
challenge = "dbdc5"
lang = "ja"
split = "dev"

# Sanity-check the selection: only the combinations handled below are accepted.
if challenge == "dbdc4":
    assert split == "eval", f"Split {split} not supported for {challenge}"
elif challenge == "dbdc5":
    # Only the Japanese data is restricted here; other languages are not checked.
    if lang == "ja":
        assert split == "dev", f"Split {split} not supported for {challenge} ({lang})"
else:
    # This branch is reached for an unknown challenge, so report the challenge (not the split).
    raise ValueError(f"Challenge {challenge} not supported")


In [5]:
# Working directories for this run, all rooted at ./data/<challenge>_<lang>_<split>_subset/
eval_base_dir = Path(f"./data/{challenge}_{lang}_{split}_subset/")
tested_subset_dir = eval_base_dir / "annotated_dialogues"
reference_dir = eval_base_dir / "reference_dialogues"
eval_dir = eval_base_dir / "eval_files"

# Create each directory (and any missing parents); already existing ones are left as they are.
for required_dir in (tested_subset_dir, reference_dir, eval_dir):
    os.makedirs(required_dir, exist_ok=True)

# Load Dataset Based on the defined split

In [None]:
# Load all json files within the directory
dbdc_split_dataset = load_dataset(challenge=challenge, split=split, lang=lang)

print(f"Loaded {len(dbdc_split_dataset)} dialogues")

In [None]:
print("Example dialogue:")
print(json.dumps(dbdc_split_dataset[0].model_dump(), indent=2, ensure_ascii=False))

# Create the subset for evaluation

In [8]:
# Specify whether to load existing samples and recompute existing annotations
load_existing_samples = True
recompute_existing_annotations = False
# Specify the number of new samples to sample and the maximum number of samples for evaluation
n_new_samples = 0
max_samples = 200

In [None]:
# Start from the previously tested dialogues when requested, otherwise from an empty list
tested_samples: list[DBDCDialogue] = (
    load_tested_dialogues(challenge=challenge, split=split, lang=lang)
    if load_existing_samples
    else []
)
len(tested_samples)

In [None]:
new_samples = min(n_new_samples, len(dbdc_split_dataset) - len(tested_samples))
new_samples

In [11]:
# Sample new samples from the dataset excluding the already tested samples
tested_ids = {dialogue.dialogue_id for dialogue in tested_samples}
remaining_samples = [dialogue for dialogue in dbdc_split_dataset if dialogue.dialogue_id not in tested_ids]
# Never request more dialogues than are left: random.sample raises ValueError when
# the requested sample size is larger than the population.
n_to_draw = min(n_new_samples, len(remaining_samples))
new_samples = random.sample(remaining_samples, n_to_draw)

In [None]:
# Shuffle the already tested dialogues in place before they are combined with the new ones
random.shuffle(tested_samples)

# New samples come first, followed by the shuffled tested ones; keep at most max_samples
subset_for_testing = (new_samples + tested_samples)[:max_samples]
len(subset_for_testing)

In [None]:
print([dialogue.dialogue_id for dialogue in subset_for_testing])

In [None]:
print("First dialogue from subset for testing:")
print(json.dumps(subset_for_testing[0].model_dump(), indent=2, ensure_ascii=False))

# Build Evaluation Variants

In [15]:
# Uncomment the models and breakdown identifiers you want to evaluate on.
# Short model name (used as prefix of the config key) -> model identifier handed to the config.
models = {
    "gpt-3.5": "gpt-3.5-turbo-0125",
    "gpt-4o": "gpt-4o-2024-08-06",
    # "gpt-4": "gpt-4-0613",
    # "o3-mini": "o3-mini-2025-01-31",
    # "gpt-4-turbo": "gpt-4-turbo-2024-04-09",
    # "gemini-2.5-pro": "gemini/gemini-2.5-pro-preview-03-25",
    # "gemini-2.0-flash": "gemini/gemini-2.0-flash-001",
}

# Identifier name (middle part of the config key) -> breakdown identifier instance.
breakdown_identifiers: dict[str, BreakdownIdentifier] = {
    "ours": OurBreakdownIdentifier(),
    "ghassel": GhasselBreakdownIdentifier(),
    "ghassel-taxonomy": GhasselBreakdownIdentifier(use_breakdown_taxonomy=True),
}

# Variant name (suffix of the config key) -> value used for include_task_oriented_errors.
te_inclusion = {
    "no-tes": False,
    # "with-tes": True,
}

In [None]:
# One config per (model, breakdown identifier, task-oriented-error variant) combination,
# in the same nesting order as the dictionaries are iterated.
eval_configs: list[BreakdownDetectionConfig] = [
    BreakdownDetectionConfig(
        key=f"{model_name}_{breakdown_identifier_name}_{te_variant}",
        model=model_version,
        breakdown_identifier=breakdown_identifier,
        include_task_oriented_errors=include_te,
    )
    for model_name, model_version in models.items()
    for breakdown_identifier_name, breakdown_identifier in breakdown_identifiers.items()
    for te_variant, include_te in te_inclusion.items()
]

config_keys = [config.key for config in eval_configs]
print(f"Total number of eval configs: {len(eval_configs)}")
print(f"Config keys:\n{config_keys}")

In [17]:
# OPTIONAL: Filter the configs to evaluate on a specific subset of configs
# allowed_keys = ['gpt-3.5_ghassel-taxonomy_no-tes', 'gpt-4o_ours_no-tes', 'gpt-4o_ghassel_no-tes', 'gpt-4_ghassel_no-tes']
# eval_configs = [config for config in eval_configs if config.key in allowed_keys]
# print(f"Filtered to {len(eval_configs)} configs")
# print(f"Config keys:\n{[config.key for config in eval_configs]}")


# Generate the breakdown annotations with each config

In [None]:
from litellm import completion_cost

from chat_checker.models.dialogue import SpeakerRole
from chat_checker.utils.misc_utils import write_prompt_to_txt_file


# For every config, annotate each system turn of each dialogue in the subset and
# persist the dialogue (with the annotations of all configs) after it was processed.
for config in eval_configs:
    print(f"Collecting annotations for {config.key}...")
    config_dir = tested_subset_dir / config.key
    config_dir.mkdir(parents=True, exist_ok=True)
    # Prompt/response/cost are dumped only once per config, for the first annotated
    # turn that has a non-empty conversation history (k > 0).
    first_debug_stored = False
    # Wrap the list itself (not enumerate(...)): an enumerate object has no length,
    # so tqdm could not show the total or an ETA.
    for dialogue in tqdm(subset_for_testing, desc=config.key):
        chat_checker_dialogue = dialogue.to_chat_checker_dialogue()
        for k, turn in enumerate(chat_checker_dialogue.chat_history):
            # Only system turns are annotated
            if turn.role != SpeakerRole.DIALOGUE_SYSTEM:
                continue
            conversation_history = chat_checker_dialogue.chat_history[:k]
            last_bot_utterance = turn.content
            # NOTE(review): assumes dialogue.turns is index-aligned with chat_history — confirm
            turn_record = dialogue.turns[k]
            has_llm_label = (
                turn_record.llm_breakdown_annotations
                and turn_record.llm_breakdown_annotations.get(config.key) is not None
            )
            # Keep existing annotations unless a recomputation was requested
            if has_llm_label and not recompute_existing_annotations:
                continue
            try:
                breakdown_info, prompt, model_response = config.breakdown_identifier.identify_breakdowns(
                    chat_history=conversation_history,
                    last_bot_utterance=last_bot_utterance,
                    is_task_oriented=config.include_task_oriented_errors,
                    llm_name=config.model,
                )
            except Exception as e:
                print(f"Error processing dialogue {dialogue.dialogue_id} at turn {k} with config {config}: {e}")
                # We simply skip this turn and continue to the next one (sometimes OpenAI refuses to answer {'refusal': "I'm sorry, I can't assist with that request."})
                continue
            if not turn_record.llm_breakdown_annotations:
                turn_record.llm_breakdown_annotations = {}
            turn_record.llm_breakdown_annotations[config.key] = breakdown_info
            if k > 0 and not first_debug_stored:
                first_debug_stored = True
                write_prompt_to_txt_file(prompt, config_dir / "sample_0_prompt.txt")
                with open(
                    config_dir / "sample_0_model_response.json", "w", encoding="utf-8"
                ) as f:
                    json.dump(model_response.model_dump(), f, ensure_ascii=False, indent=2)
                cost = completion_cost(model_response)
                with open(
                    config_dir / "sample_0_response_cost.txt", "w", encoding="utf-8"
                ) as f:
                    f.write(f"Model response cost: {cost:.8f} USD\n")

        # Written once per dialogue, into the shared directory (not the per-config one)
        with open(tested_subset_dir / f"{dialogue.dialogue_id}.log.json", "w", encoding="utf-8") as f:
            json.dump(dialogue.model_dump(by_alias=True), f, indent=2, ensure_ascii=False)

# Evaluate the breakdown annotations against the ground truth

In [19]:
# Specify the configs to compute evaluation metrics for
configs_to_evaluate = [
    "gpt-3.5_ghassel_no-tes",
    "gpt-3.5_ghassel-taxonomy_no-tes",
    "gpt-3.5_ours_no-tes",
    "gpt-4o_ghassel_no-tes",
    "gpt-4o_ghassel-taxonomy_no-tes",
    "gpt-4o_ours_no-tes",
    # "o3-mini_ghassel_no-tes",
    # "o3-mini_ghassel-taxonomy_no-tes",
    # "o3-mini_ours_no-tes",
]
# configs_to_evaluate = [
#     "gemini-2.0-flash_ours_no-tes",
#     "gemini-2.0-flash_ghassel_no-tes",
#     "gemini-2.0-flash_ghassel-taxonomy_no-tes",
# ]
# configs_to_evaluate = [
#     "gpt-3.5_ghassel-taxonomy_no-tes",
#     "gpt-4o_ours_no-tes",
#     "gpt-4o_ghassel_no-tes",
# ]
# configs_to_evaluate = [
#     "gpt-4_ghassel_no-tes",
# ]


In [None]:
tested_samples = load_tested_dialogues(challenge=challenge, split=split, lang=lang)
len(tested_samples)

In [21]:
import shutil

from models.benchmark_dialogues import DBDCPredictionsDialogue


def compute_reference_scores(config_key: str):
    """Run the reference DBDC evaluation on the annotations of one config.

    Every dialogue in ``tested_samples`` that can be converted with
    ``to_eval_prediction_dialogue(config_key)`` is exported twice: the converted
    predictions as ``<dialogue_id>.labels.json`` into ``eval_dir`` and the dialogue
    itself as ``<dialogue_id>.log.json`` into ``reference_dir``. Both directories
    are deleted and recreated before writing. Afterwards
    ``compute_dbdc_scores_reference`` is called with the two directory paths and
    ``0.0`` as third argument.

    Args:
        config_key: Key of the config whose annotations should be evaluated.
    """
    predictions: list[DBDCPredictionsDialogue] = []
    references: list[DBDCDialogue] = []
    for dialogue in tested_samples:
        try:
            prediction = dialogue.to_eval_prediction_dialogue(config_key)
        except ValueError:
            # Dialogues that cannot be converted for this config are left out
            continue
        predictions.append(prediction)
        references.append(dialogue)

    # (target directory, dialogues to dump, file name infix); predictions are written first
    exports = (
        (eval_dir, predictions, "labels"),
        (reference_dir, references, "log"),
    )
    for target_dir, dialogues, infix in exports:
        # Start from an empty directory so files of earlier runs do not linger
        if target_dir.exists():
            shutil.rmtree(target_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        for dialogue in dialogues:
            out_path = target_dir / f"{dialogue.dialogue_id}.{infix}.json"
            with open(out_path, "w") as f:
                json.dump(dialogue.model_dump(by_alias=True), f, indent=2, ensure_ascii=True)

    compute_dbdc_scores_reference(reference_dir.as_posix(), eval_dir.as_posix(), 0.0)


In [None]:
# Scores per config key, kept so they can be inspected after the loop
results = {}
for config_key in configs_to_evaluate:
    print(f"Scores for config: {config_key}")
    # print("Our Scores")
    # Score the tested dialogues for this config, print the result and remember it
    res = compute_dbdc_scores(tested_samples, config_key)
    res.print_results()
    results[config_key] = res

    # Uncomment to compare with scores from original eval script
    # print("\nOriginal Eval Script Scores:")
    # compute_reference_scores(config_key)

    print("\n--------------------------\n")