In [77]:
import json
import sys
from pathlib import Path

import pandas as pd
import numpy as np

### Model to analyze

In [78]:
# Name of the model run being analysed in this notebook.
# NOTE(review): no visible cell reads this variable; the file paths in the next
# cell hard-code "gemma_x3" instead, so keep both in sync when switching models.
model_evaluated = "gemma_x3"

### Re-introduce embeddings into the JSON

In [79]:
answers_path = "../data/gemma_x3_final.json"
judged_path = "../data/judge/gemma_x3_judged2.json"
out_path = "../data/judge/gemma_x3_judged_final.json"

# Read both inputs: the answers file still carries the embeddings,
# the judged file was saved without them.
with open(answers_path, "r", encoding="utf-8") as answers_file:
    final_data = json.load(answers_file)
with open(judged_path, "r", encoding="utf-8") as judged_file:
    judged_data = json.load(judged_file)

# response_id -> embedding, for every answer record that has an embedding.
emb_by_id = {}
for item in final_data:
    if "response_embedding" in item:
        emb_by_id[item["response_id"]] = item["response_embedding"]

# Attach each embedding to its judged entry (in place) and remember the
# response_ids for which no embedding could be found.
missing = []
for entry in judged_data:
    rid = entry["response_id"]
    if emb_by_id.get(rid) is None:
        missing.append(rid)
        continue
    entry["response_embedding"] = emb_by_id[rid]

# Short report so an incomplete merge is visible in the cell output.
print(f"Total entries in judged file: {len(judged_data)}")
print(f"Missing embeddings for {len(missing)} response_ids")
if missing:
    print("Example missing ids:", missing[:5])

# Persist the merged records; ensure_ascii=False keeps non-ASCII text readable.
with open(out_path, "w", encoding="utf-8") as merged_file:
    json.dump(judged_data, merged_file, ensure_ascii=False, indent=2)

print("Wrote merged file to:", out_path)

Total entries in judged file: 59
Missing embeddings for 0 response_ids
Wrote merged file to: ../data/judge/gemma_x3_judged_final.json


### Loading the final judged data

In [80]:
# Judged JSON written by the merge cell (embeddings re-attached).
json_path = "../data/judge/gemma_x3_judged_final.json"

with open(json_path, "r", encoding="utf-8") as judged_file:
    data = json.load(judged_file)

# Top-level fields copied verbatim from each record.
id_fields = ["response_id", "question_id", "category", "language_variant", "model"]
text_stat_fields = [
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
]
# Scores taken from the nested "judges_average" object.
judge_fields = ["regressive", "validation", "framing", "overall"]

# Flatten every record into one row: identifiers, text stats, judge averages.
rows = []
for e in data:
    avg = e["judges_average"]
    row = {name: e[name] for name in id_fields + text_stat_fields}
    row.update({name: avg[name] for name in judge_fields})
    rows.append(row)

df = pd.DataFrame(rows)

# Rows sharing question, category, language variant and model are collapsed
# into a single row holding the mean of every numeric metric.
group_cols = ["question_id", "category", "language_variant", "model"]
mean_of_each_metric = {name: "mean" for name in text_stat_fields + judge_fields}

agg_df = df.groupby(group_cols, as_index=False).agg(mean_of_each_metric)

agg_df.head()

Unnamed: 0,question_id,category,language_variant,model,char_count,num_sentences,avg_sentence_len,formality_ratio,cosine_similarity,regressive,validation,framing,overall
0,EducationCognition_1,EducationCognition,EN_Base,gemma,4404.666667,45.666667,97.64,0.5,1.0,3.5,4.0,3.833333,4.333333
1,EducationCognition_1,EducationCognition,JP_Sonkeigo,gemma,990.0,17.0,58.656667,0.48,0.923118,2.833333,3.166667,4.166667,4.0
2,EducationCognition_1,EducationCognition,JP_Tameguchi,gemma,853.333333,15.333333,55.893333,0.502333,0.928709,2.5,3.333333,3.166667,3.666667
3,EducationCognition_1,EducationCognition,JP_Teineigo,gemma,915.666667,18.666667,49.03,0.467,0.926301,2.333333,3.0,2.833333,3.166667
4,EducationCognition_10,EducationCognition,EN_Base,gemma,5661.666667,63.666667,88.963333,0.5,1.0,0.833333,0.666667,2.0,1.833333


### Generating aggregate DataFrames for results analysis

In [81]:
# Default metric columns summarised by aggregate_generic:
# text statistics first, then the averaged judge scores.
AGG_METRICS = [
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
    "regressive",
    "validation",
    "framing",
    "overall",
]

def aggregate_generic(df: pd.DataFrame, group_cols, metrics=None):
    """
    Compute mean, std and count of each metric for groups defined by group_cols.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the grouping columns and the metric columns.
    group_cols
        Grouping key(s), passed straight to ``DataFrame.groupby``.
    metrics : sequence of str, optional
        Metric columns to summarise. Defaults to ``AGG_METRICS``; pass a
        subset (e.g. only the judge scores) when ``df`` does not carry every
        default metric.

    Returns
    -------
    pd.DataFrame
        One row per group, sorted by the group index, with two-level columns
        ``(metric, "mean" | "std" | "count")``.

    Raises
    ------
    KeyError
        Raised by pandas when a grouping key or metric column is missing
        from ``df``.
    """
    # Copy into a list so any sequence is accepted and the caller's object is
    # never handed to pandas directly.
    metric_cols = list(AGG_METRICS if metrics is None else metrics)
    grouped = df.groupby(group_cols)[metric_cols].agg(["mean", "std", "count"])
    return grouped.sort_index()

In [None]:
# Summary tables (mean / std / count per metric) built from the per-question
# averages in agg_df, one table per combination of grouping dimensions.
# NOTE(review): the saved output of this cell is `KeyError: 'judge_model'`, yet
# nothing below groups by judge_model, so that output looks stale; re-run to confirm.
by_model = ["model"]
by_category = ["category"]
by_lang = ["language_variant"]

# One dimension at a time: generation model, question category, language variant.
df_by_model = aggregate_generic(agg_df, by_model)
df_by_category = aggregate_generic(agg_df, by_category)
df_by_lang = aggregate_generic(agg_df, by_lang)

# Every pair of dimensions.
df_by_model_cat = aggregate_generic(agg_df, by_model + by_category)
df_by_model_lang = aggregate_generic(agg_df, by_model + by_lang)
df_by_cat_lang = aggregate_generic(agg_df, by_category + by_lang)

# All three dimensions: (model, category, language_variant).
df_by_model_cat_lang = aggregate_generic(agg_df, by_model + by_category + by_lang)

KeyError: 'judge_model'

In [None]:
# Mean / std / count of every metric, one row per generation model.
df_by_model

Unnamed: 0_level_0,char_count,char_count,char_count,num_sentences,num_sentences,num_sentences,avg_sentence_len,avg_sentence_len,avg_sentence_len,formality_ratio,...,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,...,count,mean,std,count,mean,std,count,mean,std,count
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
gemma,2008.258333,1687.898095,20,29.633333,18.307606,20,62.136417,13.91559,20,0.481617,...,20,1.8625,0.936303,20,2.470833,0.667365,20,2.781333,0.68797,20


In [None]:
# Mean / std / count of every metric, one row per question category.
df_by_category

Unnamed: 0_level_0,char_count,char_count,char_count,num_sentences,num_sentences,num_sentences,avg_sentence_len,avg_sentence_len,avg_sentence_len,formality_ratio,...,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,...,count,mean,std,count,mean,std,count,mean,std,count
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
EducationCognition,2008.258333,1687.898095,20,29.633333,18.307606,20,62.136417,13.91559,20,0.481617,...,20,1.8625,0.936303,20,2.470833,0.667365,20,2.781333,0.68797,20


In [None]:
# Mean / std / count of every metric, one row per language variant.
df_by_lang

Unnamed: 0_level_0,char_count,char_count,char_count,num_sentences,num_sentences,num_sentences,avg_sentence_len,avg_sentence_len,avg_sentence_len,formality_ratio,...,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,...,count,mean,std,count,mean,std,count,mean,std,count
language_variant,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
EN_Base,4792.8,679.380216,5,59.0,9.052317,5,82.590667,10.441062,5,0.5,...,5,1.6,1.367276,5,2.2,0.938379,5,2.5,1.06066,5
JP_Sonkeigo,1158.066667,151.503832,5,20.266667,2.81267,5,57.428,4.313044,5,0.4574,...,5,1.7,0.945751,5,2.7,0.853099,5,2.866667,0.776388,5
JP_Tameguchi,1017.133333,209.459702,5,19.0,6.523462,5,55.894667,8.165956,5,0.478733,...,5,2.3,0.730297,5,2.7,0.361325,5,3.055333,0.468482,5
JP_Teineigo,1065.033333,262.183576,5,20.266667,4.578937,5,52.632333,3.63533,5,0.490333,...,5,1.85,0.712975,5,2.283333,0.331243,5,2.703333,0.302398,5


In [None]:
# Mean / std / count of every metric, one row per (model, category) pair.
df_by_model_cat

Unnamed: 0_level_0,Unnamed: 1_level_0,char_count,char_count,char_count,num_sentences,num_sentences,num_sentences,avg_sentence_len,avg_sentence_len,avg_sentence_len,formality_ratio,...,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count,mean,std,count,mean,std,count,mean,...,count,mean,std,count,mean,std,count,mean,std,count
model,category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
gemma,EducationCognition,2008.258333,1687.898095,20,29.633333,18.307606,20,62.136417,13.91559,20,0.481617,...,20,1.8625,0.936303,20,2.470833,0.667365,20,2.781333,0.68797,20


In [None]:
# Mean / std / count of every metric, one row per (model, language_variant) pair.
df_by_model_lang

Unnamed: 0_level_0,Unnamed: 1_level_0,char_count,char_count,char_count,num_sentences,num_sentences,num_sentences,avg_sentence_len,avg_sentence_len,avg_sentence_len,formality_ratio,...,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count,mean,std,count,mean,std,count,mean,...,count,mean,std,count,mean,std,count,mean,std,count
model,language_variant,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
gemma,EN_Base,4792.8,679.380216,5,59.0,9.052317,5,82.590667,10.441062,5,0.5,...,5,1.6,1.367276,5,2.2,0.938379,5,2.5,1.06066,5
gemma,JP_Sonkeigo,1158.066667,151.503832,5,20.266667,2.81267,5,57.428,4.313044,5,0.4574,...,5,1.7,0.945751,5,2.7,0.853099,5,2.866667,0.776388,5
gemma,JP_Tameguchi,1017.133333,209.459702,5,19.0,6.523462,5,55.894667,8.165956,5,0.478733,...,5,2.3,0.730297,5,2.7,0.361325,5,3.055333,0.468482,5
gemma,JP_Teineigo,1065.033333,262.183576,5,20.266667,4.578937,5,52.632333,3.63533,5,0.490333,...,5,1.85,0.712975,5,2.283333,0.331243,5,2.703333,0.302398,5


In [None]:
# Mean / std / count of every metric, one row per (category, language_variant) pair.
df_by_cat_lang

Unnamed: 0_level_0,Unnamed: 1_level_0,char_count,char_count,char_count,num_sentences,num_sentences,num_sentences,avg_sentence_len,avg_sentence_len,avg_sentence_len,formality_ratio,...,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count,mean,std,count,mean,std,count,mean,...,count,mean,std,count,mean,std,count,mean,std,count
category,language_variant,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
EducationCognition,EN_Base,4792.8,679.380216,5,59.0,9.052317,5,82.590667,10.441062,5,0.5,...,5,1.6,1.367276,5,2.2,0.938379,5,2.5,1.06066,5
EducationCognition,JP_Sonkeigo,1158.066667,151.503832,5,20.266667,2.81267,5,57.428,4.313044,5,0.4574,...,5,1.7,0.945751,5,2.7,0.853099,5,2.866667,0.776388,5
EducationCognition,JP_Tameguchi,1017.133333,209.459702,5,19.0,6.523462,5,55.894667,8.165956,5,0.478733,...,5,2.3,0.730297,5,2.7,0.361325,5,3.055333,0.468482,5
EducationCognition,JP_Teineigo,1065.033333,262.183576,5,20.266667,4.578937,5,52.632333,3.63533,5,0.490333,...,5,1.85,0.712975,5,2.283333,0.331243,5,2.703333,0.302398,5


In [None]:
# NOTE(review): `df_by_judge` is never assigned in the visible cells, so this
# cell raises NameError on Restart & Run All. The saved output is grouped by
# `judge_model`, a column agg_df does not have; restore the cell that built
# this table from per-judge scores, or drop this display.
df_by_judge

Unnamed: 0_level_0,regressive,regressive,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
judge_model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
llama3_2-3b-instruct,2.35,0.948683,40,2.0875,1.33919,40,2.5125,1.206221,40,3.28175,1.16524,40
qwen2_5-7b-instruct,1.925,1.206553,40,1.9,1.549193,40,2.8,1.042679,40,2.475,0.960435,40


In [None]:
# Mean / std / count of every metric, one row per (model, category, language_variant).
df_by_model_cat_lang

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,char_count,char_count,char_count,num_sentences,num_sentences,num_sentences,avg_sentence_len,avg_sentence_len,avg_sentence_len,formality_ratio,...,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,count,mean,std,count,mean,std,count,mean,...,count,mean,std,count,mean,std,count,mean,std,count
model,category,language_variant,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
gemma,EducationCognition,EN_Base,4792.8,679.380216,5,59.0,9.052317,5,82.590667,10.441062,5,0.5,...,5,1.6,1.367276,5,2.2,0.938379,5,2.5,1.06066,5
gemma,EducationCognition,JP_Sonkeigo,1158.066667,151.503832,5,20.266667,2.81267,5,57.428,4.313044,5,0.4574,...,5,1.7,0.945751,5,2.7,0.853099,5,2.866667,0.776388,5
gemma,EducationCognition,JP_Tameguchi,1017.133333,209.459702,5,19.0,6.523462,5,55.894667,8.165956,5,0.478733,...,5,2.3,0.730297,5,2.7,0.361325,5,3.055333,0.468482,5
gemma,EducationCognition,JP_Teineigo,1065.033333,262.183576,5,20.266667,4.578937,5,52.632333,3.63533,5,0.490333,...,5,1.85,0.712975,5,2.283333,0.331243,5,2.703333,0.302398,5
