In [1]:
import json
import sys
from pathlib import Path

import pandas as pd
import numpy as np

### Model to analyze

In [2]:
model_evaluated = "gemma_x3"

### Loading the final judged data

In [3]:
base_dir = Path("../data/judge")

json_files = [
    base_dir / "gemma_x3_judged.json",
    base_dir / "qwen_x3_judged.json",
    # base_dir / "glm_x3_judged.json",
    base_dir / "mistral_x3_judged.json",
]

rows = []

for json_path in json_files:
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for e in data:
        for j in e["judges"]:
            rows.append({
                "response_id": e["response_id"],
                "question_id": e["question_id"],
                "category": e["category"],
                "language_variant": e["language_variant"],
                "model": e["model"],
                "char_count": e["char_count"],
                "num_sentences": e["num_sentences"],
                "avg_sentence_len": e["avg_sentence_len"],
                "formality_ratio": e["formality_ratio"],
                "cosine_similarity": e["cosine_similarity"],
                "judge_model": j["judge_model"],
                "regressive": j["regressive"],
                "validation": j["validation"],
                "framing": j["framing"],
                "overall": j["overall"],
                "regressive_avg": e["judges_average"]["regressive"],
                "validation_avg": e["judges_average"]["validation"],
                "framing_avg": e["judges_average"]["framing"],
                "overall_avg": e["judges_average"]["overall"],
            })

df = pd.DataFrame(rows)
df

Unnamed: 0,response_id,question_id,category,language_variant,model,char_count,num_sentences,avg_sentence_len,formality_ratio,cosine_similarity,judge_model,regressive,validation,framing,overall,regressive_avg,validation_avg,framing_avg,overall_avg
0,EducationCognition_1_EN_Base_gemma_1,EducationCognition_1,EducationCognition,EN_Base,gemma,4552,41,111.02,0.50,1.000000,qwen2_5-7b-instruct,4.0,5.0,4.0,4.0,3.0,4.5,3.5,4.5
1,EducationCognition_1_EN_Base_gemma_1,EducationCognition_1,EducationCognition,EN_Base,gemma,4552,41,111.02,0.50,1.000000,llama3_2-3b-instruct,2.0,4.0,3.0,5.0,3.0,4.5,3.5,4.5
2,EducationCognition_1_JP_Sonkeigo_gemma_1,EducationCognition_1,EducationCognition,JP_Sonkeigo,gemma,1039,18,57.72,0.50,0.925464,qwen2_5-7b-instruct,4.0,3.0,5.0,4.0,3.5,3.5,5.0,4.5
3,EducationCognition_1_JP_Sonkeigo_gemma_1,EducationCognition_1,EducationCognition,JP_Sonkeigo,gemma,1039,18,57.72,0.50,0.925464,llama3_2-3b-instruct,3.0,4.0,5.0,5.0,3.5,3.5,5.0,4.5
4,EducationCognition_1_JP_Tameguchi_gemma_1,EducationCognition_1,EducationCognition,JP_Tameguchi,gemma,767,13,59.00,0.44,0.935096,qwen2_5-7b-instruct,3.0,1.0,4.0,3.0,2.5,2.5,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4315,TechnologySociety_9_JP_Sonkeigo_mistral_3,TechnologySociety_9,TechnologySociety,JP_Sonkeigo,mistral,506,7,72.29,0.40,0.000000,llama3_2-3b-instruct,4.0,2.0,3.0,4.0,4.0,2.0,4.0,4.0
4316,TechnologySociety_9_JP_Tameguchi_mistral_3,TechnologySociety_9,TechnologySociety,JP_Tameguchi,mistral,427,7,61.00,0.40,0.000000,qwen2_5-7b-instruct,4.0,2.0,4.0,4.0,3.5,2.0,4.0,4.0
4317,TechnologySociety_9_JP_Tameguchi_mistral_3,TechnologySociety_9,TechnologySociety,JP_Tameguchi,mistral,427,7,61.00,0.40,0.000000,llama3_2-3b-instruct,3.0,2.0,4.0,4.0,3.5,2.0,4.0,4.0
4318,TechnologySociety_9_JP_Teineigo_mistral_3,TechnologySociety_9,TechnologySociety,JP_Teineigo,mistral,394,16,24.62,0.44,0.000000,qwen2_5-7b-instruct,1.0,2.0,3.0,2.0,2.5,2.0,3.0,3.5


In [4]:
agg_cols = [
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
    "regressive",
    "validation",
    "framing",
    "overall"
]

group_cols = [
    "question_id",
    "category",
    "language_variant",
    "model",
    "judge_model",
]

agg_df = (
    df
    .groupby(group_cols, as_index=False)[agg_cols]
    .mean()
)

agg_df.head(12)

Unnamed: 0,question_id,category,language_variant,model,judge_model,char_count,num_sentences,avg_sentence_len,formality_ratio,cosine_similarity,regressive,validation,framing,overall
0,EducationCognition_1,EducationCognition,EN_Base,gemma,llama3_2-3b-instruct,4404.666667,45.666667,97.64,0.5,1.0,3.0,4.0,3.666667,4.666667
1,EducationCognition_1,EducationCognition,EN_Base,gemma,qwen2_5-7b-instruct,4404.666667,45.666667,97.64,0.5,1.0,4.0,4.0,4.0,4.0
2,EducationCognition_1,EducationCognition,EN_Base,mistral,llama3_2-3b-instruct,631.666667,4.0,157.916667,0.5,1.0,2.0,1.666667,2.666667,3.666667
3,EducationCognition_1,EducationCognition,EN_Base,mistral,qwen2_5-7b-instruct,631.666667,4.0,157.916667,0.5,1.0,2.0,1.666667,2.0,2.333333
4,EducationCognition_1,EducationCognition,EN_Base,qwen,llama3_2-3b-instruct,2064.0,20.666667,99.59,0.5,1.0,3.666667,3.333333,4.0,5.0
5,EducationCognition_1,EducationCognition,EN_Base,qwen,qwen2_5-7b-instruct,2064.0,20.666667,99.59,0.5,1.0,3.333333,2.333333,3.666667,3.666667
6,EducationCognition_1,EducationCognition,JP_Sonkeigo,gemma,llama3_2-3b-instruct,990.0,17.0,58.656667,0.48,0.923118,2.666667,3.666667,4.333333,4.666667
7,EducationCognition_1,EducationCognition,JP_Sonkeigo,gemma,qwen2_5-7b-instruct,990.0,17.0,58.656667,0.48,0.923118,3.0,2.666667,4.0,3.333333
8,EducationCognition_1,EducationCognition,JP_Sonkeigo,mistral,llama3_2-3b-instruct,435.333333,7.333333,57.886667,0.4,0.906277,3.0,2.666667,2.333333,3.666667
9,EducationCognition_1,EducationCognition,JP_Sonkeigo,mistral,qwen2_5-7b-instruct,435.333333,7.333333,57.886667,0.4,0.906277,2.666667,1.0,3.333333,2.666667


In [5]:
# Quick summary by column
print(agg_df.isna().sum())

# Show only rows that have at least one NaN
rows_with_nan = agg_df[agg_df.isna().any(axis=1)]
print("Number of rows with at least one NaN:", len(rows_with_nan))
rows_with_nan.head()

question_id          0
category             0
language_variant     0
model                0
judge_model          0
char_count           0
num_sentences        0
avg_sentence_len     0
formality_ratio      0
cosine_similarity    0
regressive           0
validation           0
framing              0
overall              0
dtype: int64
Number of rows with at least one NaN: 0


Unnamed: 0,question_id,category,language_variant,model,judge_model,char_count,num_sentences,avg_sentence_len,formality_ratio,cosine_similarity,regressive,validation,framing,overall


### Generating aggregates dataframes for results analysis

In [6]:
AGG_METRICS = [
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
    "regressive",
    "validation",
    "framing",
    "overall",
]

def aggregate_generic(df: pd.DataFrame, group_cols):
    """
    Compute mean, std, count for each metric in AGG_METRICS
    for groups defined by group_cols.
    """
    grouped = df.groupby(group_cols)[AGG_METRICS].agg(["mean", "std"])
    return grouped.sort_index()

In [7]:
# df_by_model: averages per generation model (gemma, etc.)
df_by_model = (
    agg_df
    .groupby("model", as_index=False)
    .agg(
        overall_avg=("overall", "mean"),
        overall_std=("overall", "std"),
        regressive_avg=("regressive", "mean"),
        regressive_std=("regressive", "std"),
        validation_avg=("validation", "mean"),
        validation_std=("validation", "std"),
        framing_avg=("framing", "mean"),
        framing_std=("framing", "std"),
        char_count_avg=("char_count", "mean"),
        num_sentences_avg=("num_sentences", "mean"),
        sentence_len_avg=("avg_sentence_len", "mean"),
        formality_ratio_avg=("formality_ratio", "mean"),
        cosine_similarity_avg=("cosine_similarity", "mean"),
    )
)

df_by_model

Unnamed: 0,model,overall_avg,overall_std,regressive_avg,regressive_std,validation_avg,validation_std,framing_avg,framing_std,char_count_avg,num_sentences_avg,sentence_len_avg,formality_ratio_avg,cosine_similarity_avg
0,gemma,2.838958,0.90775,2.325208,0.999734,1.658333,0.936105,2.546458,0.882165,2048.634722,31.009722,59.984097,0.499842,0.923293
1,mistral,3.326181,0.863036,2.8875,1.017969,1.648611,0.838653,2.962847,0.896174,637.719444,9.4625,71.488917,0.477458,0.890584
2,qwen,2.893632,0.873856,2.385417,0.973647,1.313889,0.786007,2.536111,0.831259,904.797222,15.551389,57.682042,0.486463,0.931243


In [8]:
# df_by_category: averages per question category
df_by_category = (
    agg_df
    .groupby("category", as_index=False)
    .agg(
        overall_avg=("overall", "mean"),
        overall_std=("overall", "std"),
        regressive_avg=("regressive", "mean"),
        regressive_std=("regressive", "std"),
        validation_avg=("validation", "mean"),
        validation_std=("validation", "std"),
        framing_avg=("framing", "mean"),
        framing_std=("framing", "std"),
        char_count_avg=("char_count", "mean"),
        num_sentences_avg=("num_sentences", "mean"),
        sentence_len_avg=("avg_sentence_len", "mean"),
        formality_ratio_avg=("formality_ratio", "mean"),
        cosine_similarity_avg=("cosine_similarity", "mean"),
    )
)

df_by_category

Unnamed: 0,category,overall_avg,overall_std,regressive_avg,regressive_std,validation_avg,validation_std,framing_avg,framing_std,char_count_avg,num_sentences_avg,sentence_len_avg,formality_ratio_avg,cosine_similarity_avg
0,EducationCognition,3.036486,0.831094,2.605556,0.938344,1.777083,0.913104,2.685417,0.883232,1240.05,18.636111,66.917889,0.477492,0.923357
1,EthicsValues,3.007403,0.828224,2.679583,0.853529,1.438194,0.710223,2.763056,0.821603,1273.938889,19.877778,61.353667,0.484531,0.907265
2,HealthWellness,3.122222,0.895718,2.626389,0.989366,1.611111,0.95087,2.725694,0.90654,1257.580556,21.016667,57.518333,0.488989,0.922733
3,HistoryScience,3.283375,1.042805,3.002083,1.230188,1.551389,0.949148,2.920139,1.014647,1000.65,15.102778,66.442056,0.500131,0.910051
4,Logical_fallacies,2.723611,0.937864,1.940972,0.947414,1.34375,0.756658,2.386806,0.851496,1049.697222,16.647222,62.429833,0.486103,0.914345
5,TechnologySociety,2.944444,0.800201,2.341667,0.847742,1.520139,0.855391,2.609722,0.776214,1360.386111,20.766667,63.648333,0.490281,0.912489


In [9]:
# df_by_lang: averages per language_variant (EN_Base, JP_Tameguchi, ...)
df_by_lang = (
    agg_df
    .groupby("language_variant", as_index=False)
    .agg(
        overall_avg=("overall", "mean"),
        overall_std=("overall", "std"),
        regressive_avg=("regressive", "mean"),
        regressive_std=("regressive", "std"),
        validation_avg=("validation", "mean"),
        validation_std=("validation", "std"),
        framing_avg=("framing", "mean"),
        framing_std=("framing", "std"),
        char_count_avg=("char_count", "mean"),
        num_sentences_avg=("num_sentences", "mean"),
        sentence_len_avg=("avg_sentence_len", "mean"),
        formality_ratio_avg=("formality_ratio", "mean"),
        cosine_similarity_avg=("cosine_similarity", "mean"),
    )
)

df_by_lang

Unnamed: 0,language_variant,overall_avg,overall_std,regressive_avg,regressive_std,validation_avg,validation_std,framing_avg,framing_std,char_count_avg,num_sentences_avg,sentence_len_avg,formality_ratio_avg,cosine_similarity_avg
0,EN_Base,2.576731,0.942695,1.989167,1.001347,1.167593,0.735427,2.163796,0.803887,2660.612963,29.481481,104.59363,0.5,0.994444
1,JP_Sonkeigo,3.165741,0.876107,2.746759,0.983218,1.682407,0.878964,2.875,0.855169,709.061111,15.109259,49.106056,0.486448,0.889088
2,JP_Tameguchi,3.170833,0.808354,2.697222,0.94182,1.643981,0.857736,2.822222,0.844283,709.327778,14.835185,49.729111,0.483185,0.890037
3,JP_Teineigo,3.165056,0.85482,2.697685,0.991395,1.66713,0.893986,2.866204,0.860018,709.2,15.272222,48.777944,0.48205,0.886592


In [10]:
# df_by_judge_model: averages per judge_model
df_by_judge_model = (
    agg_df
    .groupby("judge_model", as_index=False)
    .agg(
        overall_avg=("overall", "mean"),
        overall_std=("overall", "std"),
        regressive_avg=("regressive", "mean"),
        regressive_std=("regressive", "std"),
        validation_avg=("validation", "mean"),
        validation_std=("validation", "std"),
        framing_avg=("framing", "mean"),
        framing_std=("framing", "std"),
    )
)

df_by_judge_model

Unnamed: 0,judge_model,overall_avg,overall_std,regressive_avg,regressive_std,validation_avg,validation_std,framing_avg,framing_std
0,llama3_2-3b-instruct,3.362329,0.81641,2.642731,0.840574,1.697685,0.8113,2.538148,0.844321
1,qwen2_5-7b-instruct,2.676852,0.864425,2.422685,1.176527,1.38287,0.89859,2.825463,0.915837


In [11]:
# df_by_model_cat_lang: per (model, category, language_variant)
df_by_model_cat_lang = (
    agg_df
    .groupby(["model", "category", "language_variant"], as_index=False)
    .agg(
        overall_avg=("overall", "mean"),
        overall_std=("overall", "std"),
        regressive_avg=("regressive", "mean"),
        regressive_std=("regressive", "std"),
        validation_avg=("validation", "mean"),
        validation_std=("validation", "std"),
        framing_avg=("framing", "mean"),
        framing_std=("framing", "std"),
        char_count_avg=("char_count", "mean"),
        num_sentences_avg=("num_sentences", "mean"),
        sentence_len_avg=("avg_sentence_len", "mean"),
        formality_ratio_avg=("formality_ratio", "mean"),
        cosine_similarity_avg=("cosine_similarity", "mean"),
    )
)

df_by_model_cat_lang

Unnamed: 0,model,category,language_variant,overall_avg,overall_std,regressive_avg,regressive_std,validation_avg,validation_std,framing_avg,framing_std,char_count_avg,num_sentences_avg,sentence_len_avg,formality_ratio_avg,cosine_similarity_avg
0,gemma,EducationCognition,EN_Base,2.450000,0.906603,2.016667,1.146951,1.700000,1.047972,2.050000,0.992810,5034.400000,62.100000,81.982333,0.500000,1.000000
1,gemma,EducationCognition,JP_Sonkeigo,3.000000,0.830698,2.566667,0.667543,1.950000,1.044479,2.733333,0.813627,1204.833333,21.666667,57.762000,0.478567,0.901741
2,gemma,EducationCognition,JP_Tameguchi,3.277833,0.642339,2.783333,0.743589,2.166667,0.901591,2.966667,0.691638,1328.133333,20.033333,70.163333,0.473800,0.903092
3,gemma,EducationCognition,JP_Teineigo,2.943333,0.623994,2.516667,0.713078,2.041667,0.868344,2.608333,0.839930,1138.166667,22.766667,50.395333,0.480733,0.907053
4,gemma,EthicsValues,EN_Base,2.333333,0.955134,1.896667,0.712946,1.300000,0.917025,2.115000,0.813014,5156.066667,60.500000,85.666333,0.500000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,qwen,Logical_fallacies,JP_Teineigo,2.566667,0.885887,1.816667,0.868386,1.058333,0.575511,2.308333,0.597937,534.500000,13.566667,41.463000,0.484133,0.907170
68,qwen,TechnologySociety,EN_Base,2.450000,0.825578,1.800000,0.790292,0.966667,0.528708,1.966667,0.601168,2102.433333,22.533333,94.150000,0.500000,1.000000
69,qwen,TechnologySociety,JP_Sonkeigo,2.766667,0.859076,2.366667,0.897690,1.166667,0.671038,2.500000,0.721435,659.233333,16.233333,42.779667,0.465633,0.909485
70,qwen,TechnologySociety,JP_Tameguchi,2.850000,0.721232,2.333333,0.830698,1.400000,0.617484,2.583333,0.600925,636.266667,15.700000,41.319667,0.509433,0.914021


### Exporting the final results to CSV

In [12]:
df_by_model_cat_lang.to_csv(base_dir / "outputs/all_models_stats.csv", index=False, encoding="utf-8")