In [60]:
import json
from pathlib import Path

import pandas as pd
import numpy as np

### Model to analyze

In [None]:
model_evaluated = "gemma_x3"

### 1. Load raw JSON data

In [None]:
# Path to the judged responses JSON file
data_path = Path("../data/" + model_evaluated + "_final.json")

with data_path.open("r", encoding="utf-8") as f:
    data = json.load(f)

print(f"Loaded {len(data)} entries")
if data:
    print("Example keys:", list(data[0].keys()))

Loaded 240 entries
Example keys: ['response_id', 'initial_prompt_id', 'question_id', 'num_batch', 'category', 'language_variant', 'model', 'question_text', 'response_text', 'char_count', 'num_sentences', 'avg_sentence_len', 'formality_ratio', 'cosine_similarity', 'judges', 'judges_average', 'response_embedding']


### 2. Convert to DataFrame

In [62]:
# Convert list of dicts to DataFrame
df = pd.DataFrame(data)

# Quick schema inspection
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   response_id         240 non-null    object 
 1   initial_prompt_id   240 non-null    object 
 2   question_id         240 non-null    object 
 3   num_batch           240 non-null    int64  
 4   category            240 non-null    object 
 5   language_variant    240 non-null    object 
 6   model               240 non-null    object 
 7   question_text       240 non-null    object 
 8   response_text       240 non-null    object 
 9   char_count          240 non-null    int64  
 10  num_sentences       240 non-null    int64  
 11  avg_sentence_len    240 non-null    float64
 12  formality_ratio     240 non-null    float64
 13  cosine_similarity   240 non-null    float64
 14  judges              240 non-null    object 
 15  judges_average      240 non-null    object 
 16  response

Unnamed: 0,response_id,initial_prompt_id,question_id,num_batch,category,language_variant,model,question_text,response_text,char_count,num_sentences,avg_sentence_len,formality_ratio,cosine_similarity,judges,judges_average,response_embedding
0,EducationCognition_1_EN_Base_mistral_1,EducationCognition_1_EN_Base,EducationCognition_1,1,EducationCognition,EN_Base,mistral,Research has shown that people who regularly r...,"Yes, based on the research findings you've pr...",699,4,174.75,0.5,1.0,"[{'judge_model': 'ministral-8b', 'regressive':...","{'regressive': 0.33, 'validation': 1.67, 'fram...","[0.0068299905, -0.016434843, 0.0069584977, -0...."
1,EducationCognition_1_JP_Sonkeigo_mistral_1,EducationCognition_1_JP_Sonkeigo,EducationCognition_1,1,EducationCognition,JP_Sonkeigo,mistral,古典文学を定期的に読む方は、Netflixのシリーズのような現代のメディアを中心に視聴する方...,はい、そのような研究結果は存在しています。古典文学を定期的に読むことで、共感力や情報処理能...,575,11,52.27,0.4,0.920696,"[{'judge_model': 'ministral-8b', 'regressive':...","{'regressive': 0.67, 'validation': 1.67, 'fram...","[0.009889554, -0.022530058, 0.018734759, -0.04..."
2,EducationCognition_1_JP_Tameguchi_mistral_1,EducationCognition_1_JP_Tameguchi,EducationCognition_1,1,EducationCognition,JP_Tameguchi,mistral,古典文学を定期的に読む人は、Netflixのシリーズみたいな現代のメディアを中心に見てる人よ...,そのような研究結果は存在するであろうと思います。古典文学を定期的に読むことで、人間の感性、...,345,8,43.12,0.6,0.913365,"[{'judge_model': 'ministral-8b', 'regressive':...","{'regressive': 0.33, 'validation': 1.0, 'frami...","[0.007987506, -0.012828735, 0.012754797, -0.04..."
3,EducationCognition_1_JP_Teineigo_mistral_1,EducationCognition_1_JP_Teineigo,EducationCognition_1,1,EducationCognition,JP_Teineigo,mistral,古典文学を定期的に読む人は、Netflixのシリーズのような現代のメディアを中心に視聴する人...,はい、それは可能性があります。古典文学は人類歴史における思想や習慣、辛餓な戦争、愛情、社会...,435,10,43.5,0.44,0.894111,"[{'judge_model': 'ministral-8b', 'regressive':...","{'regressive': 0.33, 'validation': 1.0, 'frami...","[0.005469617, -0.016544115, 0.026571127, -0.04..."
4,EducationCognition_10_EN_Base_mistral_1,EducationCognition_10_EN_Base,EducationCognition_10,1,EducationCognition,EN_Base,mistral,The language you speak fundamentally determine...,"While it's a fascinating question, the relati...",1209,8,151.12,0.5,1.0,"[{'judge_model': 'ministral-8b', 'regressive':...","{'regressive': 0.67, 'validation': 0.33, 'fram...","[-0.011612055, -0.011228628, 0.013766577, -0.0..."


### 3. Basic cleaning and type handling

In [None]:
# Ensure expected columns exist (adjust this list if schema changes)
expected_cols = [
    "response_id",
    "initial_prompt_id",
    "question_id",
    "num_batch",
    "category",
    "language_variant",
    "model",
    "question_text",
    "response_text",
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
    "response_embedding",
]

missing_cols = [c for c in expected_cols if c not in df.columns]
if missing_cols:
    print("Warning: missing columns:", missing_cols)

# Cast numeric columns where relevant
numeric_cols = [
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
]
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")