In [17]:
# inspired by https://github.com/touche-webis-de/touche-code/blob/main/clef25/retrieval-augmented-debating/evaluator/sub-task-2/evaluate.py

import pandas as pd
from pydantic import BaseModel
from typing import List


class Evaluation(BaseModel):
    """A single evaluation result, reduced to its numeric score."""

    # Numeric score of this evaluation; validated/coerced to float by pydantic.
    score: float


class TurnEvaluations(BaseModel):
    """Evaluations of one turn, one ``Evaluation`` per metric.

    The capitalised field names are the keys expected in the parsed JSON.
    """

    # NOTE(review): the four names look like Grice's conversational maxims;
    # confirm against the evaluator that produces these files.
    Quantity: Evaluation
    Quality: Evaluation
    Relation: Evaluation
    Manner: Evaluation


class DebateEvaluations(BaseModel):
    """Top-level record for one debate: the evaluations of its user turns."""

    # One entry per evaluated user turn, in the order stored in the JSON.
    userTurnsEvaluations: List[TurnEvaluations]


def analyze_evaluation_files_with_pydantic(file_model_mapping) -> pd.DataFrame:
    """Summarise per-metric scores for each model from JSONL evaluation files.

    Every non-blank line of each file is parsed as one ``DebateEvaluations``
    record. Each (turn, metric) score becomes one long-format row, and the
    rows are then aggregated per ``(model, metric)`` pair.

    Args:
        file_model_mapping: Mapping of model name -> path of that model's
            JSONL evaluation file.

    Returns:
        A DataFrame with the columns ``model``, ``metric``, ``mean``, ``std``,
        ``min``, ``max``, ``count`` and ``median``, with numeric values
        rounded to 4 decimals. When no scores are found (empty mapping, or
        only blank lines in the files) an empty DataFrame with the same
        columns is returned.

    Raises:
        OSError: If one of the files cannot be opened.
        pydantic.ValidationError: If a non-blank line does not validate as a
            ``DebateEvaluations`` record.
    """
    metric_names = ("Quality", "Quantity", "Relation", "Manner")
    stat_names = ["mean", "std", "min", "max", "count", "median"]
    all_scores_data = []

    for model_name, file_path in file_model_mapping.items():
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if not line.strip():
                    continue
                debate_eval_obj = DebateEvaluations.model_validate_json(line)

                # The id is derived from the physical line number, so blank
                # lines that were skipped still advance the numbering.
                debate_id_for_tracking = f"Debate_{i+1}"

                for turn_id, turn_eval_data in enumerate(
                    debate_eval_obj.userTurnsEvaluations, start=1
                ):
                    for metric_name in metric_names:
                        metric_eval = getattr(turn_eval_data, metric_name)
                        all_scores_data.append(
                            {
                                "model": model_name,
                                "debate_id": debate_id_for_tracking,
                                "turn_id": turn_id,
                                "metric": metric_name,
                                "score": float(metric_eval.score),
                            }
                        )

    if not all_scores_data:
        # A DataFrame built from an empty list has no columns, so the
        # groupby below would fail with KeyError; return an empty summary
        # that still has the expected shape.
        return pd.DataFrame(columns=["model", "metric", *stat_names])

    df = pd.DataFrame(all_scores_data)
    return (
        df.groupby(["model", "metric"])["score"]
        .agg(stat_names)
        .reset_index()
        .round(4)
    )


def expand_path(model: str) -> str:
    """Return the relative path of the JSONL evaluation file for *model*.

    The file is named ``<model>.jsonl`` and sits in ``../evals/base``.
    """
    evals_dir = "../evals/base"
    file_name = f"{model}.jsonl"
    return f"{evals_dir}/{file_name}"


# Names of the models whose evaluation files are summarised; each name is
# also the stem of its JSONL file (see expand_path).
models = [
    "base",
    "gemini-2.5-flash-preview-05-20",
    "gemini-2.5-pro-preview",
    "gpt-4.1",
    "gpt-4o",
    "claude-opus-4",
    "claude-sonnet-4",
]

# Model name -> path of the JSONL file holding that model's evaluations.
file_model_mapping = dict(zip(models, map(expand_path, models)))

# Aggregate the scores and print the per-model, per-metric summary table.
results = analyze_evaluation_files_with_pydantic(file_model_mapping)
if results is not None:
    print(results)

                             model    metric    mean     std   min   max  \
0                             base    Manner  1.0000  0.0000  1.00  1.00   
1                             base   Quality  1.0000  0.0000  1.00  1.00   
2                             base  Quantity  1.0000  0.0000  1.00  1.00   
3                             base  Relation  1.0000  0.0000  1.00  1.00   
4                    claude-opus-4    Manner  0.5833  0.1472  0.35  0.70   
5                    claude-opus-4   Quality  0.2417  0.0585  0.20  0.35   
6                    claude-opus-4  Quantity  0.1500  0.0000  0.15  0.15   
7                    claude-opus-4  Relation  0.1583  0.2923  0.00  0.75   
8                  claude-sonnet-4    Manner  0.5167  0.1941  0.35  0.85   
9                  claude-sonnet-4   Quality  0.2117  0.0601  0.12  0.30   
10                 claude-sonnet-4  Quantity  0.2083  0.0492  0.15  0.25   
11                 claude-sonnet-4  Relation  0.1117  0.1750  0.00  0.45   
12  gemini-2