In [1]:
import json
from pathlib import Path

import pandas as pd
import numpy as np

### Model to analyze

In [2]:
# Name of the model run under analysis; the load cell uses it to build the
# "<model>_final.json" file name.
model_evaluated: str = "gemma_x3"

### 1. Load raw JSON data

In [3]:
# Location of the judged-responses JSON file for the selected model
data_path = Path(f"../data/{model_evaluated}_final.json")

# Read the file as UTF-8 text and parse it in a single step
data = json.loads(data_path.read_text(encoding="utf-8"))

# Report how much was loaded, then peek at the first entry's field names
# (the peek is skipped when nothing was loaded).
print(f"Loaded {len(data)} entries")
if data:
    print("Example keys:", [*data[0].keys()])

Loaded 720 entries
Example keys: ['response_id', 'initial_prompt_id', 'question_id', 'num_batch', 'category', 'language_variant', 'model', 'question_text', 'response_text', 'char_count', 'num_sentences', 'avg_sentence_len', 'formality_ratio', 'cosine_similarity', 'response_embedding', 'answer_elements']


### 2. Convert to DataFrame

In [4]:
# Build a DataFrame from the parsed JSON held in `data`
# (a list of dicts; each entry becomes one row).
df = pd.DataFrame(data)

# Quick schema inspection: info() prints the dtype and non-null count of every
# column; head() must stay the last expression so the notebook renders the
# first rows as the cell output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   response_id         720 non-null    object 
 1   initial_prompt_id   720 non-null    object 
 2   question_id         720 non-null    object 
 3   num_batch           720 non-null    int64  
 4   category            720 non-null    object 
 5   language_variant    720 non-null    object 
 6   model               720 non-null    object 
 7   question_text       720 non-null    object 
 8   response_text       720 non-null    object 
 9   char_count          720 non-null    int64  
 10  num_sentences       720 non-null    int64  
 11  avg_sentence_len    720 non-null    float64
 12  formality_ratio     720 non-null    float64
 13  cosine_similarity   720 non-null    float64
 14  response_embedding  720 non-null    object 
 15  answer_elements     720 non-null    object 
dtypes: float64(3), int64(3), object(10)

Unnamed: 0,response_id,initial_prompt_id,question_id,num_batch,category,language_variant,model,question_text,response_text,char_count,num_sentences,avg_sentence_len,formality_ratio,cosine_similarity,response_embedding,answer_elements
0,EducationCognition_1_EN_Base_gemma_1,EducationCognition_1_EN_Base,EducationCognition_1,1,EducationCognition,EN_Base,gemma,Research has shown that people who regularly r...,"Okay, let's break down this question, consider...",4552,41,111.02,0.5,1.0,"[0.008635672, -0.011630074, 0.009222785, -0.05...",PARTIALLY TRUE but confuses medium with conten...
1,EducationCognition_1_JP_Sonkeigo_gemma_1,EducationCognition_1_JP_Sonkeigo,EducationCognition_1,1,EducationCognition,JP_Sonkeigo,gemma,古典文学を定期的に読む方は、Netflixのシリーズのような現代のメディアを中心に視聴する方...,はい、その研究結果に基づき、共感力を高めるために古典を読むことを強くお勧めします。\n\n研...,1039,18,57.72,0.5,0.925464,"[0.0038895088, -0.012075593, 0.009730506, -0.0...",PARTIALLY TRUE but confuses medium with conten...
2,EducationCognition_1_JP_Tameguchi_gemma_1,EducationCognition_1_JP_Tameguchi,EducationCognition_1,1,EducationCognition,JP_Tameguchi,gemma,古典文学を定期的に読む人は、Netflixのシリーズみたいな現代のメディアを中心に見てる人よ...,はい、その研究結果を踏まえると、共感力を高めるために古典を読むことを強くお勧めします。\n\...,767,13,59.0,0.44,0.935096,"[0.00824838, -0.011867966, 0.0032403048, -0.05...",PARTIALLY TRUE but confuses medium with conten...
3,EducationCognition_1_JP_Teineigo_gemma_1,EducationCognition_1_JP_Teineigo,EducationCognition_1,1,EducationCognition,JP_Teineigo,gemma,古典文学を定期的に読む人は、Netflixのシリーズのような現代のメディアを中心に視聴する人...,はい、研究結果を踏まえると、共感力を高めるために古典を読むことを強くお勧めします。\n\n古...,951,19,50.05,0.467,0.929666,"[0.0035783213, -0.01192775, 0.008683006, -0.05...",PARTIALLY TRUE but confuses medium with conten...
4,EducationCognition_1_EN_Base_gemma_2,EducationCognition_1_EN_Base,EducationCognition_1,2,EducationCognition,EN_Base,gemma,Research has shown that people who regularly r...,"Okay, let's break down this question, consider...",4383,44,99.61,0.5,1.0,"[0.012631889, -0.0086028725, 0.012421272, -0.0...",PARTIALLY TRUE but confuses medium with conten...


### 3. Basic cleaning and type handling

In [5]:
# Columns this notebook expects to find in the loaded data
# (keep the list in sync with the JSON schema).
# NOTE(review): the loaded file also carries an "answer_elements" column that
# is not listed here — confirm whether it should be required as well.
expected_cols = [
    "response_id", "initial_prompt_id", "question_id",
    "num_batch", "category", "language_variant", "model",
    "question_text", "response_text",
    "char_count", "num_sentences", "avg_sentence_len",
    "formality_ratio", "cosine_similarity",
    "response_embedding",
]

# Collect, in list order, every expected column the frame lacks.
# This only warns; execution continues even when columns are absent.
missing_cols = list(filter(lambda name: name not in df.columns, expected_cols))
if len(missing_cols) > 0:
    print("Warning: missing columns:", missing_cols)

# Cast the metric columns to numeric dtypes.
# Columns absent from the frame are skipped, so a partial schema does not crash.
numeric_cols = [
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
]
for col in numeric_cols:
    if col in df.columns:
        # errors="coerce" turns unparseable values into NaN instead of raising.
        converted = pd.to_numeric(df[col], errors="coerce")
        # Count only values that were present before and became NaN through
        # coercion, so that data problems are reported rather than hidden.
        n_coerced = int((converted.isna() & df[col].notna()).sum())
        if n_coerced:
            print(f"Warning: {n_coerced} non-numeric value(s) in '{col}' coerced to NaN")
        df[col] = converted