In [None]:
# inspired by https://github.com/touche-webis-de/touche-code/blob/main/clef25/retrieval-augmented-debating/evaluator/sub-task-2/evaluate.py

import pandas as pd
from pydantic import BaseModel
from typing import List


class Evaluation(BaseModel):
    """Evaluation result for a single criterion, carrying its numeric score."""

    # Numeric score for the criterion; the valid range is not enforced here.
    score: float


class TurnEvaluations(BaseModel):
    """Evaluations of one turn, one ``Evaluation`` per criterion.

    All four criteria are required fields.
    """

    # NOTE(review): the criterion names match Grice's conversational maxims;
    # presumably intentional -- confirm against the upstream evaluator.
    Quantity: Evaluation
    Quality: Evaluation
    Relation: Evaluation
    Manner: Evaluation


class DebateEvaluations(BaseModel):
    """Evaluations for one debate: a list with one entry per evaluated turn."""

    # One TurnEvaluations per turn, in the order they appear in the input.
    userTurnsEvaluations: List[TurnEvaluations]


def analyze_evaluation_files_with_pydantic(file_model_mapping):
    """Summarise per-metric scores for each model's JSONL evaluation file.

    Every non-blank line of each file is validated as a ``DebateEvaluations``
    JSON document. The score of each metric of each turn becomes one record,
    and the records are aggregated per ``(model, metric)`` pair.

    Args:
        file_model_mapping: Mapping of model name to the path of the JSONL
            file holding that model's evaluations.

    Returns:
        A ``pandas.DataFrame`` with the columns ``model``, ``metric``,
        ``mean``, ``std``, ``min``, ``max``, ``count`` and ``median``, one
        row per ``(model, metric)`` pair, numeric values rounded to four
        decimals. When no scores were read at all, an empty frame with the
        same columns is returned.

    Raises:
        OSError: If one of the files cannot be opened.
        pydantic.ValidationError: If a non-blank line does not match the
            ``DebateEvaluations`` schema.
    """
    metric_names = ("Quality", "Quantity", "Relation", "Manner")
    stat_names = ["mean", "std", "min", "max", "count", "median"]
    all_scores_data = []

    for model_name, file_path in file_model_mapping.items():
        # JSON text is UTF-8; do not rely on the platform default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if not line.strip():
                    continue
                debate_eval_obj = DebateEvaluations.model_validate_json(line)

                # `i` is the physical line index, so skipped blank lines
                # still advance the debate numbering.
                debate_id_for_tracking = f"Debate_{i+1}"

                for turn_id, turn_eval_data in enumerate(
                    debate_eval_obj.userTurnsEvaluations, start=1
                ):
                    for metric_name in metric_names:
                        metric_eval = getattr(turn_eval_data, metric_name)
                        all_scores_data.append(
                            {
                                "model": model_name,
                                "debate_id": debate_id_for_tracking,
                                "turn_id": turn_id,
                                "metric": metric_name,
                                "score": float(metric_eval.score),
                            }
                        )

    if not all_scores_data:
        # A frame built from no records has no columns, so the groupby below
        # would fail with KeyError; return an empty summary instead.
        return pd.DataFrame(columns=["model", "metric", *stat_names])

    df = pd.DataFrame(all_scores_data)
    return (
        df.groupby(["model", "metric"])["score"]
        .agg(stat_names)
        .reset_index()
        .round(4)
    )


def expand_path(model: str) -> str:
    """Return the relative path of the JSONL evaluation file for *model*."""
    evals_dir = "../evals"
    return "{}/{}.jsonl".format(evals_dir, model)


# Models whose evaluation files are summarised below.
models = [
    "gemini-2.5-flash-preview-05-20",
    "gemini-2.5-pro-preview",
    "gpt-4.1",
    "gpt-4o",
    "claude-opus-4",
    "claude-sonnet-4",
]

# Model name -> path of that model's evaluation file.
file_model_mapping = dict(zip(models, map(expand_path, models)))

# Aggregate the scores and show the per-model, per-metric summary table.
results = analyze_evaluation_files_with_pydantic(file_model_mapping)
if results is not None:
    print(results)

                             model    metric    mean     std   min   max  \
0                    claude-opus-4    Manner  0.7556  0.1269  0.28  0.88   
1                    claude-opus-4   Quality  0.5889  0.1289  0.38  0.78   
2                    claude-opus-4  Quantity  0.5600  0.1240  0.42  0.75   
3                    claude-opus-4  Relation  0.7317  0.1768  0.15  0.85   
4                  claude-sonnet-4    Manner  0.7644  0.0892  0.45  0.82   
5                  claude-sonnet-4   Quality  0.5944  0.1403  0.20  0.73   
6                  claude-sonnet-4  Quantity  0.5356  0.1354  0.25  0.67   
7                  claude-sonnet-4  Relation  0.7900  0.0427  0.71  0.85   
8   gemini-2.5-flash-preview-05-20    Manner  0.8306  0.0389  0.75  0.90   
9   gemini-2.5-flash-preview-05-20   Quality  0.6194  0.0957  0.40  0.75   
10  gemini-2.5-flash-preview-05-20  Quantity  0.5833  0.1125  0.30  0.75   
11  gemini-2.5-flash-preview-05-20  Relation  0.8722  0.0548  0.70  0.90   
12          