In [76]:
import os
import json
import pandas as pd
from sklearn.metrics import accuracy_score

def compute_accuracy_per_file(input_data):
    """Summarize how many predictions in ``input_data`` match their true label.

    For every item the true label is read with ``item.get("True Label")`` and
    the prediction with ``item.get("model_answer_number")``. Both values are
    converted to strings and stripped before comparison, so ``1`` and
    ``" 1 "`` are treated as equal.

    An item is counted as correct only when both fields are present and equal.
    A missing field (``None``) always counts as an incorrect answer; it is
    never stringified to ``"None"`` and compared, which would score an item
    missing both fields as a correct prediction.

    Args:
        input_data: Iterable of dict-like records exposing ``.get``.

    Returns:
        dict with the keys
            ``"accuracy"``: percentage of correct items, rounded to two
                decimals (``0.0`` when ``input_data`` is empty),
            ``"correct"``: number of correct items,
            ``"total"``: number of items seen.
    """
    correct = 0
    total = 0

    for item in input_data:
        total += 1
        true_label = item.get("True Label")
        model_answer = item.get("model_answer_number")

        # A missing label or prediction can never be a correct answer.
        if true_label is None or model_answer is None:
            continue

        if str(true_label).strip() == str(model_answer).strip():
            correct += 1

    # Guard the division so an empty file yields 0.0 instead of an error/NaN.
    accuracy = correct / total * 100 if total else 0.0

    return {
        "accuracy": round(accuracy, 2),
        "correct": correct,
        "total": total,
    }

In [77]:
import json
import re
from pathlib import Path

# Context configuration to evaluate; uncomment exactly one folder_name.
folder_name = "['no_context']"
# folder_name = "['birth']"
# folder_name = "['Nationality']"
# folder_name = "['Summary']"
# folder_name = "['birth', 'Nationality']"
# folder_name = "['birth', 'Summary']"
# folder_name = "['Nationality', 'Summary']"
# folder_name = "['birth', 'Nationality', 'Summary']"


# Model names used so far:
# "EXAONE-3.5-7.8B-Instruct"
# "EXAONE-3.0-7.8B-Instruct"
# "Llama-3.1-8B-Instruct"
# "Mistral-Nemo-Instruct-2407"
# "gpt-4o"
# gpt-3.5-turbo-0125
model_name = "gpt-4o"
mode = "version3"

# Alternative prediction directories. Only the last assignment ever took
# effect, so the earlier ones are kept as commented-out options instead of
# being assigned and immediately overwritten.
# base_path = Path('../../data/prediction_data/gpt-4o') / folder_name
# base_path = Path(f'../../data/prediction_data/{model_name}_test_data_sample_{mode}') / folder_name
base_path = Path(f'../../data/prediction_data/{model_name}_test_data_fin_korea') / folder_name

# Extract the question-type keyword from each JSON file name.
keywords = []
pattern = re.compile(r'(cultural|cross|fact|temporal)')

# Maps question-type keyword -> parsed JSON content of the matching file.
data_dict = {}
# sorted() makes the load order (and therefore the dict order) reproducible;
# Path.glob() yields files in arbitrary order.
for json_file in sorted(base_path.glob('*.json')):
    match = pattern.search(json_file.name)
    if match is None:
        # A file without a known keyword cannot be assigned to a question
        # type; skip it instead of failing on match.group(1).
        print(f"Skipping {json_file.name}: no question-type keyword in file name")
        continue
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    data_dict[match.group(1)] = data



In [78]:
# Read the character metadata JSON into memory as `character_info`.
with open('../../data/source_data/meta_character.json', encoding='utf-8') as meta_file:
    character_info = json.load(meta_file)

In [79]:
# Build result_dict[question_type][country][character] -> accuracy summary,
# mirroring the nesting of data_dict.
result_dict = {}

for question_type, per_country in data_dict.items():
    result_dict[question_type] = {
        country: {
            character: compute_accuracy_per_file(items)
            for character, items in per_character.items()
        }
        for country, per_character in per_country.items()
    }

In [80]:
# Accuracy calculation (same as the existing code): for each question type and
# country, take the unweighted mean of the per-character accuracies.
average_accuracies = {}
for q_type, countries in result_dict.items():
    per_country_avg = {}
    for country, characters in countries.items():
        scores = [summary['accuracy'] for summary in characters.values()]
        print(f"{q_type} : {country} : {characters}")
        # Fall back to 0 when a country has no characters.
        mean_score = 0 if not scores else sum(scores) / len(scores)
        per_country_avg[country] = round(mean_score, 2)
    average_accuracies[q_type] = per_country_avg

# Convert to a DataFrame (rows = question types, columns = countries), then
# append an 'Average' column followed by an 'Average' row.
df = pd.DataFrame(average_accuracies).transpose()
df['Average'] = df.mean(axis=1).round(2)
df.loc['Average'] = df.mean(axis=0).round(2)

cultural : korea : {'Oh Ae-sun': {'accuracy': 67.5, 'correct': 27, 'total': 40}, 'Lee Gi-yeong': {'accuracy': 65.0, 'correct': 26, 'total': 40}}
cross : korea : {'Lee Gi-yeong': {'accuracy': 25.0, 'correct': 1, 'total': 4}, 'Sejong': {'accuracy': 75.0, 'correct': 3, 'total': 4}, 'Yi Sun-sin': {'accuracy': 25.0, 'correct': 1, 'total': 4}, 'Heungbu': {'accuracy': 25.0, 'correct': 1, 'total': 4}}


In [81]:
# Sorted list of every country that appears under any question type.
all_countries = sorted({country for q in average_accuracies.values() for country in q})

# Compute the averages for each question type.
rows = []
country_averages = {c: [] for c in all_countries}
type_averages = {}

for q_type in sorted(average_accuracies):
    row = [q_type]
    total = 0
    count = 0
    for country in all_countries:
        # None marks a country that has no value for this question type.
        acc = average_accuracies[q_type].get(country)
        if acc is not None:
            country_averages[country].append(acc)
            total += acc
            count += 1
            row.append(f"{acc:.2f}")
        else:
            # Leave the cell blank rather than reporting a misleading 0.
            row.append("")
    type_avg = total / count if count else 0
    type_averages[q_type] = round(type_avg, 2)
    row.append(f"{type_avg:.2f}")
    rows.append(row)

# Extra row holding the per-country averages.
avg_row = ["Average"]
for country in all_countries:
    vals = country_averages[country]
    avg = sum(vals) / len(vals) if vals else 0
    avg_row.append(f"{avg:.2f}")
# Overall average = mean of the (rounded) per-type averages. Guarded so that
# an empty average_accuracies (e.g. no prediction files were loaded) yields
# 0.00 instead of a ZeroDivisionError, like the other averages above.
overall_avg = sum(type_averages.values()) / len(type_averages) if type_averages else 0
avg_row.append(f"{overall_avg:.2f}")
rows.append(avg_row)

# Assemble the markdown table.
header = ["Question Type"] + all_countries + ["Average"]
md_lines = [
    "| " + " | ".join(header) + " |",
    "|---" * len(header) + "|"
]
for row in rows:
    md_lines.append("| " + " | ".join(row) + " |")

markdown_table = "\n".join(md_lines)

# Output
print(markdown_table)


| Question Type | korea | Average |
|---|---|---|
| cross | 37.50 | 37.50 |
| cultural | 66.25 | 66.25 |
| Average | 51.88 | 51.88 |
