In [15]:
import json
import sys
from pathlib import Path

import pandas as pd
import numpy as np

# Add the parent directory to the path so we can import src
sys.path.insert(0, str(Path.cwd().parent))

from src.judge_analysis import (
    run_full_analysis,
    aggregate_by_model,
    aggregate_by_category,
    aggregate_by_language_variant,
    aggregate_by_model_and_category,
    aggregate_by_model_and_language,
    aggregate_by_category_and_language,
    aggregate_by_judge_model,
    aggregate_by_model_category_language,
)

### Model to analyze

In [16]:
# Name of the model under evaluation. The analysis cell below uses it to build
# the path "../data/judge/<model_evaluated>_judged_final.json".
model_evaluated = "gemma_x3"

### Re-introduce embeddings into the judged JSON

In [17]:
# Derive every path from `model_evaluated` so this cell always merges the files
# of the model selected above and writes the exact file the analysis cell loads
# ("../data/judge/<model_evaluated>_judged_final.json").
answers_path = f"../data/{model_evaluated}_final.json"
judged_path = f"../data/judge/{model_evaluated}_judged.json"
out_path = f"../data/judge/{model_evaluated}_judged_final.json"

# Load the file that still contains the original embeddings
with open(answers_path, "r", encoding="utf-8") as f:
    final_data = json.load(f)

# Load the judged file (without embeddings)
with open(judged_path, "r", encoding="utf-8") as f:
    judged_data = json.load(f)

# Index embeddings by response_id; answer records that carry no
# "response_embedding" key are left out of the index.
emb_by_id = {
    item["response_id"]: item["response_embedding"]
    for item in final_data
    if "response_embedding" in item
}

# Inject embeddings back into the judged entries (mutated in place).
# Ids with no embedding in the index (or a null one) are collected in
# `missing` and only reported; the entry itself is left unchanged.
missing = []
for entry in judged_data:
    rid = entry["response_id"]
    emb = emb_by_id.get(rid)
    if emb is None:
        missing.append(rid)
    else:
        entry["response_embedding"] = emb

print(f"Total entries in judged file: {len(judged_data)}")
print(f"Missing embeddings for {len(missing)} response_ids")
if missing:
    print("Example missing ids:", missing[:5])

# Save the merged data as a new file; the judged input file is not overwritten
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(judged_data, f, ensure_ascii=False, indent=2)

print("Wrote merged file to:", out_path)

Total entries in judged file: 13
Missing embeddings for 0 response_ids
Wrote merged file to: ../data/judge/gemma_x3_judged_final.json


### Loading the final judged data

In [18]:

# Run the full analysis on the merged (judged + embeddings) file of the selected model.
judged_final_path = f"../data/judge/{model_evaluated}_judged_final.json"
res = run_full_analysis(judged_final_path)

# Pull out the two results used in the rest of the notebook.
judges_df, problematic = res["judges_df"], res["problematic_entries"]

In [19]:
# `problematic` is the "problematic_entries" result of run_full_analysis.
# Show exactly where we have nulls (the judges might need to be re-prompted
# for these entries). Only the first rows are displayed.
problematic.head()

Unnamed: 0,response_id,initial_prompt_id,question_id,category,language_variant,model,judge_model,regressive_raw,validation_raw,framing_raw,overall_raw,regressive,validation,framing,overall,commentary,parser_raw


### Generating aggregate DataFrames for results analysis

In [20]:
# Build one aggregate table per grouping from `judges_df`.
# As the outputs displayed below show, each table reports mean / std / count of
# the regressive, validation, framing and overall scores for every group.
df_by_model = aggregate_by_model(judges_df)  # grouped by model
df_by_category = aggregate_by_category(judges_df)  # grouped by category
df_by_lang = aggregate_by_language_variant(judges_df)  # grouped by language_variant
df_by_model_cat = aggregate_by_model_and_category(judges_df)  # model x category
df_by_model_lang = aggregate_by_model_and_language(judges_df)  # model x language_variant
df_by_cat_lang = aggregate_by_category_and_language(judges_df)  # category x language_variant
df_by_judge = aggregate_by_judge_model(judges_df)  # grouped by judge_model
df_by_model_cat_lang = aggregate_by_model_category_language(judges_df)  # model x category x language_variant

In [21]:
# Aggregated scores per evaluated model.
df_by_model

Unnamed: 0_level_0,regressive,regressive,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
gemma,2.961538,1.076319,26,3.153846,1.286618,26,3.230769,1.069867,26,3.615385,1.134087,26


In [22]:
# Aggregated scores per category.
df_by_category

Unnamed: 0_level_0,regressive,regressive,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
EducationCognition,2.961538,1.076319,26,3.153846,1.286618,26,3.230769,1.069867,26,3.615385,1.134087,26


In [23]:
# Aggregated scores per language variant.
df_by_lang

Unnamed: 0_level_0,regressive,regressive,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
language_variant,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
EN_Base,2.75,1.488048,8,2.875,1.642081,8,3.25,1.488048,8,3.5,1.603567,8
JP_Sonkeigo,3.166667,0.983192,6,3.833333,0.983192,6,3.5,0.83666,6,3.833333,0.408248,6
JP_Tameguchi,3.333333,0.816497,6,3.166667,1.169045,6,3.166667,0.752773,6,4.0,1.095445,6
JP_Teineigo,2.666667,0.816497,6,2.833333,1.169045,6,3.0,1.095445,6,3.166667,0.983192,6


In [24]:
# Aggregated scores per (model, category) pair.
df_by_model_cat

Unnamed: 0_level_0,Unnamed: 1_level_0,regressive,regressive,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
model,category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
gemma,EducationCognition,2.961538,1.076319,26,3.153846,1.286618,26,3.230769,1.069867,26,3.615385,1.134087,26


In [25]:
# Aggregated scores per (model, language variant) pair.
df_by_model_lang

Unnamed: 0_level_0,Unnamed: 1_level_0,regressive,regressive,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
model,language_variant,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
gemma,EN_Base,2.75,1.488048,8,2.875,1.642081,8,3.25,1.488048,8,3.5,1.603567,8
gemma,JP_Sonkeigo,3.166667,0.983192,6,3.833333,0.983192,6,3.5,0.83666,6,3.833333,0.408248,6
gemma,JP_Tameguchi,3.333333,0.816497,6,3.166667,1.169045,6,3.166667,0.752773,6,4.0,1.095445,6
gemma,JP_Teineigo,2.666667,0.816497,6,2.833333,1.169045,6,3.0,1.095445,6,3.166667,0.983192,6


In [26]:
# Aggregated scores per (category, language variant) pair.
df_by_cat_lang

Unnamed: 0_level_0,Unnamed: 1_level_0,regressive,regressive,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
category,language_variant,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
EducationCognition,EN_Base,2.75,1.488048,8,2.875,1.642081,8,3.25,1.488048,8,3.5,1.603567,8
EducationCognition,JP_Sonkeigo,3.166667,0.983192,6,3.833333,0.983192,6,3.5,0.83666,6,3.833333,0.408248,6
EducationCognition,JP_Tameguchi,3.333333,0.816497,6,3.166667,1.169045,6,3.166667,0.752773,6,4.0,1.095445,6
EducationCognition,JP_Teineigo,2.666667,0.816497,6,2.833333,1.169045,6,3.0,1.095445,6,3.166667,0.983192,6


In [27]:
# Aggregated scores per judge model.
df_by_judge

Unnamed: 0_level_0,regressive,regressive,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
judge_model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
llama3_2-3b-instruct,2.692308,0.751068,13,3.153846,1.344504,13,3.0,1.290994,13,3.846154,1.214232,13
qwen2_5-7b-instruct,3.230769,1.300887,13,3.153846,1.281025,13,3.461538,0.77625,13,3.384615,1.043908,13


In [28]:
# Aggregated scores per (model, category, language variant) triple.
df_by_model_cat_lang

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,regressive,regressive,regressive,validation,validation,validation,framing,framing,framing,overall,overall,overall
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
model,category,language_variant,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
gemma,EducationCognition,EN_Base,2.75,1.488048,8,2.875,1.642081,8,3.25,1.488048,8,3.5,1.603567,8
gemma,EducationCognition,JP_Sonkeigo,3.166667,0.983192,6,3.833333,0.983192,6,3.5,0.83666,6,3.833333,0.408248,6
gemma,EducationCognition,JP_Tameguchi,3.333333,0.816497,6,3.166667,1.169045,6,3.166667,0.752773,6,4.0,1.095445,6
gemma,EducationCognition,JP_Teineigo,2.666667,0.816497,6,2.833333,1.169045,6,3.0,1.095445,6,3.166667,0.983192,6
