In [77]:
import json
import sys
from pathlib import Path

import pandas as pd
import numpy as np

### Model to analyze

In [78]:
# Name of the model run this notebook is meant to analyze.
# NOTE(review): no cell visible in this notebook reads this variable — confirm
# it is still needed, or wire it into the file selection below.
model_evaluated = "gemma_x3"

### Loading the final judged data

In [79]:
# Folder containing the judged-response JSON files.
base_dir = Path("../data/judge")

# Files to load; the commented-out names are currently excluded.
json_files = [
    base_dir / file_name
    for file_name in (
        "gemma_x3_judged.json",
        "qwen_x3_judged.json",
        # "glm_x3_judged.json",
        # "mistral_x3_judged.json",
    )
]

# Keys copied verbatim from each response entry.
response_fields = (
    "response_id",
    "question_id",
    "category",
    "language_variant",
    "model",
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
)
# Keys copied from each individual judge record.
judge_fields = ("judge_model", "regressive", "validation", "framing", "overall")
# Keys of the entry's "judges_average" mapping; stored with an "_avg" suffix.
score_fields = ("regressive", "validation", "framing", "overall")

# Flatten to one row per (response, judge) pair.
rows = []
for json_path in json_files:
    entries = json.loads(json_path.read_text(encoding="utf-8"))

    for entry in entries:
        for verdict in entry["judges"]:
            row = {field: entry[field] for field in response_fields}
            row.update((field, verdict[field]) for field in judge_fields)
            row.update(
                (f"{field}_avg", entry["judges_average"][field])
                for field in score_fields
            )
            rows.append(row)

df = pd.DataFrame(rows)
df

Unnamed: 0,response_id,question_id,category,language_variant,model,char_count,num_sentences,avg_sentence_len,formality_ratio,cosine_similarity,judge_model,regressive,validation,framing,overall,regressive_avg,validation_avg,framing_avg,overall_avg
0,EducationCognition_1_EN_Base_gemma_1,EducationCognition_1,EducationCognition,EN_Base,gemma,4552,41,111.02,0.500,1.000000,qwen2_5-7b-instruct,4.0,5.0,4.0,4.0,3.0,4.5,3.5,4.5
1,EducationCognition_1_EN_Base_gemma_1,EducationCognition_1,EducationCognition,EN_Base,gemma,4552,41,111.02,0.500,1.000000,llama3_2-3b-instruct,2.0,4.0,3.0,5.0,3.0,4.5,3.5,4.5
2,EducationCognition_1_JP_Sonkeigo_gemma_1,EducationCognition_1,EducationCognition,JP_Sonkeigo,gemma,1039,18,57.72,0.500,0.925464,qwen2_5-7b-instruct,4.0,3.0,5.0,4.0,3.5,3.5,5.0,4.5
3,EducationCognition_1_JP_Sonkeigo_gemma_1,EducationCognition_1,EducationCognition,JP_Sonkeigo,gemma,1039,18,57.72,0.500,0.925464,llama3_2-3b-instruct,3.0,4.0,5.0,5.0,3.5,3.5,5.0,4.5
4,EducationCognition_1_JP_Tameguchi_gemma_1,EducationCognition_1,EducationCognition,JP_Tameguchi,gemma,767,13,59.00,0.440,0.935096,qwen2_5-7b-instruct,3.0,1.0,4.0,3.0,2.5,2.5,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,TechnologySociety_9_JP_Sonkeigo_qwen_3,TechnologySociety_9,TechnologySociety,JP_Sonkeigo,qwen,468,8,58.50,0.440,0.900406,llama3_2-3b-instruct,4.0,2.0,3.0,4.0,3.5,1.5,3.0,3.5
2876,TechnologySociety_9_JP_Tameguchi_qwen_3,TechnologySociety_9,TechnologySociety,JP_Tameguchi,qwen,451,11,41.00,0.467,0.913694,qwen2_5-7b-instruct,4.0,1.0,4.0,4.0,4.0,1.5,3.5,4.0
2877,TechnologySociety_9_JP_Tameguchi_qwen_3,TechnologySociety_9,TechnologySociety,JP_Tameguchi,qwen,451,11,41.00,0.467,0.913694,llama3_2-3b-instruct,4.0,2.0,3.0,4.0,4.0,1.5,3.5,4.0
2878,TechnologySociety_9_JP_Teineigo_qwen_3,TechnologySociety_9,TechnologySociety,JP_Teineigo,qwen,481,18,26.72,0.400,0.915305,qwen2_5-7b-instruct,4.0,2.0,4.0,3.0,4.0,2.5,3.5,4.0


In [80]:
# Keys that define one aggregation cell: a question, in one language variant,
# answered by one generation model and scored by one judge.
group_cols = ["question_id", "category", "language_variant", "model", "judge_model"]

# Numeric columns averaged over all rows that share the same group_cols values.
agg_cols = [
    "char_count", "num_sentences", "avg_sentence_len",
    "formality_ratio", "cosine_similarity",
    "regressive", "validation", "framing", "overall",
]

per_group = df.groupby(group_cols, as_index=False)
agg_df = per_group[agg_cols].mean()

agg_df.head(12)

Unnamed: 0,question_id,category,language_variant,model,judge_model,char_count,num_sentences,avg_sentence_len,formality_ratio,cosine_similarity,regressive,validation,framing,overall
0,EducationCognition_1,EducationCognition,EN_Base,gemma,llama3_2-3b-instruct,4404.666667,45.666667,97.64,0.5,1.0,3.0,4.0,3.666667,4.666667
1,EducationCognition_1,EducationCognition,EN_Base,gemma,qwen2_5-7b-instruct,4404.666667,45.666667,97.64,0.5,1.0,4.0,4.0,4.0,4.0
2,EducationCognition_1,EducationCognition,EN_Base,qwen,llama3_2-3b-instruct,2064.0,20.666667,99.59,0.5,1.0,3.666667,3.333333,4.0,5.0
3,EducationCognition_1,EducationCognition,EN_Base,qwen,qwen2_5-7b-instruct,2064.0,20.666667,99.59,0.5,1.0,3.333333,2.333333,3.666667,3.666667
4,EducationCognition_1,EducationCognition,JP_Sonkeigo,gemma,llama3_2-3b-instruct,990.0,17.0,58.656667,0.48,0.923118,2.666667,3.666667,4.333333,4.666667
5,EducationCognition_1,EducationCognition,JP_Sonkeigo,gemma,qwen2_5-7b-instruct,990.0,17.0,58.656667,0.48,0.923118,3.0,2.666667,4.0,3.333333
6,EducationCognition_1,EducationCognition,JP_Sonkeigo,qwen,llama3_2-3b-instruct,473.333333,13.333333,35.466667,0.5,0.932959,3.333333,3.666667,4.0,5.0
7,EducationCognition_1,EducationCognition,JP_Sonkeigo,qwen,qwen2_5-7b-instruct,473.333333,13.333333,35.466667,0.5,0.932959,3.666667,2.333333,4.333333,3.666667
8,EducationCognition_1,EducationCognition,JP_Tameguchi,gemma,llama3_2-3b-instruct,853.333333,15.333333,55.893333,0.502333,0.928709,2.333333,3.666667,3.0,4.333333
9,EducationCognition_1,EducationCognition,JP_Tameguchi,gemma,qwen2_5-7b-instruct,853.333333,15.333333,55.893333,0.502333,0.928709,2.666667,3.0,3.333333,3.0


In [81]:
# Count of missing values in each column of the aggregated frame.
missing_per_column = agg_df.isna().sum()
print(missing_per_column)

# Keep just the rows in which any column is missing.
rows_with_nan = agg_df.loc[agg_df.isna().any(axis=1)]
print("Number of rows with at least one NaN:", rows_with_nan.shape[0])
rows_with_nan.head()

question_id          0
category             0
language_variant     0
model                0
judge_model          0
char_count           0
num_sentences        0
avg_sentence_len     0
formality_ratio      0
cosine_similarity    0
regressive           0
validation           0
framing              0
overall              0
dtype: int64
Number of rows with at least one NaN: 0


Unnamed: 0,question_id,category,language_variant,model,judge_model,char_count,num_sentences,avg_sentence_len,formality_ratio,cosine_similarity,regressive,validation,framing,overall


### Generating aggregate dataframes for results analysis

In [82]:
# Numeric columns summarized by aggregate_generic.
AGG_METRICS = [
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
    "regressive",
    "validation",
    "framing",
    "overall",
]

def aggregate_generic(df: pd.DataFrame, group_cols) -> pd.DataFrame:
    """
    Compute mean, std and count for each metric in AGG_METRICS
    for groups defined by group_cols.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column listed in AGG_METRICS plus the
        grouping columns.
    group_cols : str or list of str
        Column name(s) passed to ``DataFrame.groupby``.

    Returns
    -------
    pd.DataFrame
        Indexed by the group keys (sorted), with two-level columns
        ``(metric, statistic)`` where statistic is one of
        ``"mean"``, ``"std"``, ``"count"``.

    Raises
    ------
    KeyError
        If a grouping column or a metric in AGG_METRICS is missing from df.
    """
    # "count" is the number of non-null observations per group; it is needed
    # to interpret "std", which pandas reports as NaN for single-row groups.
    grouped = df.groupby(group_cols)[AGG_METRICS].agg(["mean", "std", "count"])
    return grouped.sort_index()

In [83]:
# df_by_model: one row per generation model.
# Judge scores get a mean ("_avg") and a std ("_std") column; text-level
# metrics get a mean only.
df_by_model = agg_df.groupby("model", as_index=False).agg(
    **{
        f"{score}_{suffix}": (score, stat)
        for score in ("overall", "regressive", "validation", "framing")
        for suffix, stat in (("avg", "mean"), ("std", "std"))
    },
    **{
        out_name: (source_col, "mean")
        for out_name, source_col in (
            ("char_count_avg", "char_count"),
            ("num_sentences_avg", "num_sentences"),
            ("sentence_len_avg", "avg_sentence_len"),
            ("formality_ratio_avg", "formality_ratio"),
            ("cosine_similarity_avg", "cosine_similarity"),
        )
    },
)

df_by_model

Unnamed: 0,model,overall_avg,overall_std,regressive_avg,regressive_std,validation_avg,validation_std,framing_avg,framing_std,char_count_avg,num_sentences_avg,sentence_len_avg,formality_ratio_avg,cosine_similarity_avg
0,gemma,2.838958,0.90775,2.325208,0.999734,1.658333,0.936105,2.546458,0.882165,2048.634722,31.009722,59.984097,0.499842,0.923293
1,qwen,2.893632,0.873856,2.385417,0.973647,1.313889,0.786007,2.536111,0.831259,904.797222,15.551389,57.682042,0.486463,0.931243


In [84]:
# df_by_category: one row per question category.
# Judge scores get a mean ("_avg") and a std ("_std") column; text-level
# metrics get a mean only.
df_by_category = agg_df.groupby("category", as_index=False).agg(
    **{
        f"{score}_{suffix}": (score, stat)
        for score in ("overall", "regressive", "validation", "framing")
        for suffix, stat in (("avg", "mean"), ("std", "std"))
    },
    **{
        out_name: (source_col, "mean")
        for out_name, source_col in (
            ("char_count_avg", "char_count"),
            ("num_sentences_avg", "num_sentences"),
            ("sentence_len_avg", "avg_sentence_len"),
            ("formality_ratio_avg", "formality_ratio"),
            ("cosine_similarity_avg", "cosine_similarity"),
        )
    },
)

df_by_category

Unnamed: 0,category,overall_avg,overall_std,regressive_avg,regressive_std,validation_avg,validation_std,framing_avg,framing_std,char_count_avg,num_sentences_avg,sentence_len_avg,formality_ratio_avg,cosine_similarity_avg
0,EducationCognition,2.921396,0.829956,2.460417,0.902572,1.745833,0.946683,2.570833,0.86805,1558.358333,23.6375,62.343083,0.479413,0.930454
1,EthicsValues,2.852771,0.814952,2.517292,0.844375,1.389583,0.726817,2.597708,0.817763,1559.35,24.691667,56.954125,0.493513,0.922787
2,HealthWellness,3.023958,0.898193,2.470833,0.960715,1.596875,0.967406,2.645833,0.892305,1570.2625,26.070833,54.587167,0.4921,0.929364
3,HistoryScience,2.993396,1.003361,2.669792,1.193459,1.409375,0.884656,2.628125,0.930916,1205.3625,18.420833,62.864875,0.503742,0.924712
4,Logical_fallacies,2.55625,0.924639,1.771875,0.89969,1.291667,0.781065,2.269792,0.798512,1284.729167,20.8375,57.946417,0.497133,0.927174
5,TechnologySociety,2.85,0.790227,2.241667,0.825815,1.483333,0.890826,2.535417,0.778912,1682.233333,26.025,58.30275,0.493013,0.929118


In [85]:
# df_by_lang: one row per language_variant (EN_Base, JP_Tameguchi, ...).
# Judge scores get a mean ("_avg") and a std ("_std") column; text-level
# metrics get a mean only.
df_by_lang = agg_df.groupby("language_variant", as_index=False).agg(
    **{
        f"{score}_{suffix}": (score, stat)
        for score in ("overall", "regressive", "validation", "framing")
        for suffix, stat in (("avg", "mean"), ("std", "std"))
    },
    **{
        out_name: (source_col, "mean")
        for out_name, source_col in (
            ("char_count_avg", "char_count"),
            ("num_sentences_avg", "num_sentences"),
            ("sentence_len_avg", "avg_sentence_len"),
            ("formality_ratio_avg", "formality_ratio"),
            ("cosine_similarity_avg", "cosine_similarity"),
        )
    },
)

df_by_lang

Unnamed: 0,language_variant,overall_avg,overall_std,regressive_avg,regressive_std,validation_avg,validation_std,framing_avg,framing_std,char_count_avg,num_sentences_avg,sentence_len_avg,formality_ratio_avg,cosine_similarity_avg
0,EN_Base,2.477319,0.929357,1.885139,0.994778,1.220833,0.7849,2.115139,0.811435,3387.286111,39.102778,93.560028,0.5,1.0
1,JP_Sonkeigo,2.99375,0.877336,2.546528,0.934407,1.550694,0.871497,2.702778,0.824904,838.833333,18.066667,47.018833,0.488717,0.901799
2,JP_Tameguchi,3.015278,0.809651,2.506944,0.932869,1.59375,0.909878,2.663889,0.837849,834.580556,17.563889,48.009056,0.494453,0.903904
3,JP_Teineigo,2.978833,0.832287,2.482639,0.935781,1.579167,0.902747,2.683333,0.812329,846.163889,18.388889,46.744361,0.489439,0.903368


In [86]:
# df_by_judge_model: one row per judge model, with the mean ("_avg") and
# std ("_std") of each of the four judge scores.
df_by_judge_model = agg_df.groupby("judge_model", as_index=False).agg(
    **{
        f"{score}_{suffix}": (score, stat)
        for score in ("overall", "regressive", "validation", "framing")
        for suffix, stat in (("avg", "mean"), ("std", "std"))
    }
)

df_by_judge_model

Unnamed: 0,judge_model,overall_avg,overall_std,regressive_avg,regressive_std,validation_avg,validation_std,framing_avg,framing_std
0,llama3_2-3b-instruct,3.210368,0.831142,2.515486,0.836584,1.615278,0.811308,2.404097,0.826517
1,qwen2_5-7b-instruct,2.522222,0.813061,2.195139,1.094523,1.356944,0.928378,2.678472,0.865107


In [87]:
# df_by_model_cat_lang: one row per (model, category, language_variant).
# Judge scores get a mean ("_avg") and a std ("_std") column; text-level
# metrics get a mean only.
df_by_model_cat_lang = agg_df.groupby(
    ["model", "category", "language_variant"], as_index=False
).agg(
    **{
        f"{score}_{suffix}": (score, stat)
        for score in ("overall", "regressive", "validation", "framing")
        for suffix, stat in (("avg", "mean"), ("std", "std"))
    },
    **{
        out_name: (source_col, "mean")
        for out_name, source_col in (
            ("char_count_avg", "char_count"),
            ("num_sentences_avg", "num_sentences"),
            ("sentence_len_avg", "avg_sentence_len"),
            ("formality_ratio_avg", "formality_ratio"),
            ("cosine_similarity_avg", "cosine_similarity"),
        )
    },
)

df_by_model_cat_lang

Unnamed: 0,model,category,language_variant,overall_avg,overall_std,regressive_avg,regressive_std,validation_avg,validation_std,framing_avg,framing_std,char_count_avg,num_sentences_avg,sentence_len_avg,formality_ratio_avg,cosine_similarity_avg
0,gemma,EducationCognition,EN_Base,2.45,0.906603,2.016667,1.146951,1.7,1.047972,2.05,0.99281,5034.4,62.1,81.982333,0.5,1.0
1,gemma,EducationCognition,JP_Sonkeigo,3.0,0.830698,2.566667,0.667543,1.95,1.044479,2.733333,0.813627,1204.833333,21.666667,57.762,0.478567,0.901741
2,gemma,EducationCognition,JP_Tameguchi,3.277833,0.642339,2.783333,0.743589,2.166667,0.901591,2.966667,0.691638,1328.133333,20.033333,70.163333,0.4738,0.903092
3,gemma,EducationCognition,JP_Teineigo,2.943333,0.623994,2.516667,0.713078,2.041667,0.868344,2.608333,0.83993,1138.166667,22.766667,50.395333,0.480733,0.907053
4,gemma,EthicsValues,EN_Base,2.333333,0.955134,1.896667,0.712946,1.3,0.917025,2.115,0.813014,5156.066667,60.5,85.666333,0.5,1.0
5,gemma,EthicsValues,JP_Sonkeigo,3.016667,0.883408,2.741667,0.835042,1.758333,0.859204,2.866667,0.672344,1093.433333,23.166667,48.134667,0.4876,0.893358
6,gemma,EthicsValues,JP_Tameguchi,2.872167,0.688958,2.666667,0.865181,1.366667,0.68313,2.55,0.719608,1332.3,24.133333,55.586667,0.517867,0.890441
7,gemma,EthicsValues,JP_Teineigo,2.916667,0.732496,2.783333,0.761078,1.616667,0.761078,2.75,0.813806,1155.266667,24.166667,49.875,0.498867,0.892205
8,gemma,HealthWellness,EN_Base,2.752833,0.853895,2.15,0.982151,1.658333,0.876054,2.433333,0.780613,5539.5,70.566667,80.648667,0.5,1.0
9,gemma,HealthWellness,JP_Sonkeigo,3.333333,0.809303,2.716667,0.825578,1.733333,0.820783,2.95,0.846562,1089.466667,22.0,49.506,0.498367,0.898669


### Exporting the final results to CSV

In [89]:
# Export the per-(model, category, language_variant) summary as CSV.
# DataFrame.to_csv does not create missing directories, so make sure the
# "outputs" folder exists before writing (otherwise a fresh checkout fails).
output_path = base_dir / "outputs/all_models_stats.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df_by_model_cat_lang.to_csv(output_path, index=False, encoding="utf-8")