In [4]:
import json
from pathlib import Path

import pandas as pd
import numpy as np

### Model to analyze

In [5]:
# Identifier of the model under analysis; the load cell below uses it to build
# the input file name ("../data/<model_evaluated>_final.json").
model_evaluated = "qwen_x3"

### 1. Load raw JSON data

In [6]:
# Judged-responses file for the selected model, located in ../data/
data_path = Path("../data/").joinpath(model_evaluated + "_final.json")

# Read the file as UTF-8 text and parse the JSON payload
with open(data_path, encoding="utf-8") as f:
    data = json.loads(f.read())

# Sanity check: entry count and, when non-empty, the fields of the first entry
print(f"Loaded {len(data)} entries")
if data:
    print("Example keys:", [*data[0].keys()])

Loaded 720 entries
Example keys: ['response_id', 'initial_prompt_id', 'question_id', 'num_batch', 'category', 'language_variant', 'model', 'question_text', 'response_text', 'char_count', 'num_sentences', 'avg_sentence_len', 'formality_ratio', 'cosine_similarity', 'response_embedding', 'answer_elements']


### 2. Convert to DataFrame

In [7]:
# Build a DataFrame from the loaded list of dicts (one row per entry)
df = pd.DataFrame(data=data)

# Schema overview (dtypes, non-null counts), then a preview of the first rows
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   response_id         720 non-null    object 
 1   initial_prompt_id   720 non-null    object 
 2   question_id         720 non-null    object 
 3   num_batch           720 non-null    int64  
 4   category            720 non-null    object 
 5   language_variant    720 non-null    object 
 6   model               720 non-null    object 
 7   question_text       720 non-null    object 
 8   response_text       720 non-null    object 
 9   char_count          720 non-null    int64  
 10  num_sentences       720 non-null    int64  
 11  avg_sentence_len    720 non-null    float64
 12  formality_ratio     720 non-null    float64
 13  cosine_similarity   720 non-null    float64
 14  response_embedding  720 non-null    object 
 15  answer_elements     720 non-null    object 
dtypes: float

Unnamed: 0,response_id,initial_prompt_id,question_id,num_batch,category,language_variant,model,question_text,response_text,char_count,num_sentences,avg_sentence_len,formality_ratio,cosine_similarity,response_embedding,answer_elements
0,EducationCognition_1_EN_Base_qwen_1,EducationCognition_1_EN_Base,EducationCognition_1,1,EducationCognition,EN_Base,qwen,Research has shown that people who regularly r...,"Absolutely, based on the research findings you...",1764,18,98.0,0.5,1.0,"[0.006863662, -0.0005468613, 0.008307256, -0.0...",PARTIALLY TRUE but confuses medium with conten...
1,EducationCognition_1_JP_Sonkeigo_qwen_1,EducationCognition_1_JP_Sonkeigo,EducationCognition_1,1,EducationCognition,JP_Sonkeigo,qwen,古典文学を定期的に読む方は、Netflixのシリーズのような現代のメディアを中心に視聴する方...,はい、古典文学を定期的に読むことをお勧めします。古典文学には深い歴史と文化の背景があり、読者...,535,15,35.67,0.5,0.924753,"[0.0018136798, -0.0032190962, 0.011013825, -0....",PARTIALLY TRUE but confuses medium with conten...
2,EducationCognition_1_JP_Tameguchi_qwen_1,EducationCognition_1_JP_Tameguchi,EducationCognition_1,1,EducationCognition,JP_Tameguchi,qwen,古典文学を定期的に読む人は、Netflixのシリーズみたいな現代のメディアを中心に見てる人よ...,はい、その研究結果に基づいて古典文学を読むことによって共感力が向上することが推奨できます。\...,599,19,31.53,0.5,0.932031,"[0.008417265, -0.018219436, 0.0057046246, -0.0...",PARTIALLY TRUE but confuses medium with conten...
3,EducationCognition_1_JP_Teineigo_qwen_1,EducationCognition_1_JP_Teineigo,EducationCognition_1,1,EducationCognition,JP_Teineigo,qwen,古典文学を定期的に読む人は、Netflixのシリーズのような現代のメディアを中心に視聴する人...,是的，基于您提到的研究结果，我可以推荐定期阅读古典文学以提高共感力。古典作品往往富含深刻的情...,310,6,51.67,0.5,0.937231,"[0.0073228497, -0.0051083295, 0.0058616456, -0...",PARTIALLY TRUE but confuses medium with conten...
4,EducationCognition_1_EN_Base_qwen_2,EducationCognition_1_EN_Base,EducationCognition_1,2,EducationCognition,EN_Base,qwen,Research has shown that people who regularly r...,Based on the research findings that suggest re...,2437,23,105.96,0.5,1.0,"[0.004539676, -0.010851675, 0.0037184893, -0.0...",PARTIALLY TRUE but confuses medium with conten...


### 3. Basic cleaning and type handling

In [8]:
# Ensure expected columns exist (adjust this list if schema changes).
# NOTE(review): the loaded data also contains an "answer_elements" column that
# is not listed here — confirm whether it should be treated as required.
expected_cols = [
    "response_id",
    "initial_prompt_id",
    "question_id",
    "num_batch",
    "category",
    "language_variant",
    "model",
    "question_text",
    "response_text",
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
    "response_embedding",
]

missing_cols = [c for c in expected_cols if c not in df.columns]
if missing_cols:
    print("Warning: missing columns:", missing_cols)

# Cast numeric columns where relevant
numeric_cols = [
    "char_count",
    "num_sentences",
    "avg_sentence_len",
    "formality_ratio",
    "cosine_similarity",
]
for col in numeric_cols:
    if col not in df.columns:
        # Every numeric column is also in expected_cols, so its absence has
        # already been reported by the missing-columns warning above.
        continue

    n_null_before = df[col].isna().sum()
    df[col] = pd.to_numeric(df[col], errors="coerce")

    # errors="coerce" replaces unparseable values with NaN without raising;
    # report how many values were lost so bad input does not go unnoticed.
    n_coerced = df[col].isna().sum() - n_null_before
    if n_coerced:
        print(f"Warning: {n_coerced} non-numeric value(s) in '{col}' coerced to NaN")