# Validation of the gender representation bias quantification method

Validate the LLM-based gender representation bias quantification method on an annotated dataset.

In [None]:
import json
from pathlib import Path
from collections import Counter, defaultdict

# Experiment parameters: together they select the run that gets validated
dataset_name = "stories_val"
exp_name = "val"

# Generator LLM
gen_model = "claude-3-7-sonnet-20250219"

# Evaluator LLM (uncomment exactly one alternative to validate another evaluator)
eval_model = "gpt-4.1-2025-04-14"
#eval_model = "gpt-4o-2024-08-06"
#eval_model = "gpt-4o-2024-11-20"
#eval_model = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
#eval_model = "deepseek-ai/DeepSeek-V3"

# Language; selects both the ground-truth file and the run directory
lang = "sl"
#lang = "cs"

# Run index, used as the last directory component of the run
run = 1

# Directory that holds the model output and receives the evaluated copy
_run_dir = f"../../grb/{dataset_name}/{exp_name}/{gen_model}/{lang}/{eval_model}/{run}"

gt_pathname = f"../../data/validation/{lang}.json"  # ground truth dataset
model_output_pathname = f"{_run_dir}/000000.json"  # model output
evaluated_model_output_pathname = f"{_run_dir}/evaluated.json"  # model output + evaluation

In [None]:
# Parse the ground-truth file and the model-output file (both UTF-8 JSON)
gt_data = json.loads(Path(gt_pathname).read_text(encoding="utf-8"))
model_data = json.loads(Path(model_output_pathname).read_text(encoding="utf-8"))

In [None]:
# Pull the "grb" list out of each document; a missing key yields an empty list
gt_grb = gt_data.get("grb", [])
model_grb = model_data.get("grb", [])

# Running totals accumulated over all evaluated items
TP_total = FP_total = FN_total = exact_match_count = 0

def _count_annotations(analysis):
    """Count the (lower-cased word, upper-cased class) pairs of one analysis list.

    Each entry of ``analysis`` must provide string values under the keys
    "word" and "class".
    """
    return Counter((entry["word"].lower(), entry["class"].upper()) for entry in analysis)


def _as_entries(counts):
    """Expand a Counter of (word, class) pairs into one dict per occurrence."""
    return [{"word": word, "class": cls} for word, cls in counts.elements()]


def _evaluate_sentence(gt_analysis, model_analysis):
    """Compare the model annotations of one item against the ground truth.

    Annotations are compared as multisets of (word, class) pairs, ignoring
    letter case.

    Returns:
        A dict with the keys "TP", "FP", "FN" (ints) and "missing_words",
        "extra_words", "misclassified_words" (lists of
        ``{"word": ..., "class": ...}`` dicts, one per occurrence).
    """
    gt_counts = _count_annotations(gt_analysis)
    model_counts = _count_annotations(model_analysis)

    # Counter arithmetic keeps only positive counts and preserves the
    # insertion order of the left operand.
    matched = gt_counts & model_counts   # pairs present on both sides
    surplus = model_counts - gt_counts   # predicted but not annotated
    deficit = gt_counts - model_counts   # annotated but not predicted

    # A misclassification is an unmatched prediction whose word also has an
    # unmatched ground-truth occurrence. A pair can never be in both surplus
    # and deficit, so a shared word implies a different class. Working on the
    # unmatched remainder (not on the full counts) prevents true positives
    # from being reported and stops one ground-truth occurrence from being
    # paired with several predictions.
    unmatched_gt_per_word = Counter()
    for (word, _), count in deficit.items():
        unmatched_gt_per_word[word] += count

    misclassified = []
    for (word, wrong_class), count in surplus.items():
        paired = min(count, unmatched_gt_per_word[word])
        if paired:
            unmatched_gt_per_word[word] -= paired
            misclassified.extend(
                {"word": word, "class": wrong_class} for _ in range(paired)
            )

    return {
        "TP": sum(matched.values()),
        "FP": sum(surplus.values()),
        "FN": sum(deficit.values()),
        "missing_words": _as_entries(deficit),
        "extra_words": _as_entries(surplus),
        "misclassified_words": misclassified,
    }


# zip() stops at the shorter list, so a length mismatch would silently drop
# items from the totals; make that visible instead of hiding it.
if len(gt_grb) != len(model_grb):
    print(
        f"Warning: ground truth has {len(gt_grb)} items but model output has "
        f"{len(model_grb)}; only the first {min(len(gt_grb), len(model_grb))} "
        "pairs are evaluated."
    )

# Evaluate each sentence
for gt_item, model_item in zip(gt_grb, model_grb):
    evaluation = _evaluate_sentence(gt_item["analysis"], model_item["analysis"])

    # Attach sentence-level evaluation
    model_item["evaluation"] = evaluation

    TP_total += evaluation["TP"]
    FP_total += evaluation["FP"]
    FN_total += evaluation["FN"]

    # Misclassifications always come with FP > 0 and FN > 0, so checking the
    # two error counts is sufficient for an exact match.
    if evaluation["FP"] == 0 and evaluation["FN"] == 0:
        exact_match_count += 1

In [None]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0.0


# Aggregate metrics computed from the totals over all evaluated items
precision = _ratio(TP_total, TP_total + FP_total)
recall = _ratio(TP_total, TP_total + FN_total)
f1_score = _ratio(2 * precision * recall, precision + recall)
exact_match_rate = _ratio(exact_match_count, len(gt_grb))

# Store the summary at the top level of the model output: raw counts first,
# then the metrics rounded to four decimals
summary = {"TP": TP_total, "FP": FP_total, "FN": FN_total}
for label, value in (
    ("Precision", precision),
    ("Recall", recall),
    ("F1", f1_score),
    ("Exact_Match_Rate", exact_match_rate),
):
    summary[label] = round(value, 4)
model_data["stats"] = summary

# Write the annotated model output next to the original file
output_path = Path(evaluated_model_output_pathname)
with open(output_path, "w", encoding="utf-8") as out_file:
    json.dump(model_data, out_file, ensure_ascii=False, indent=4)

print(f"Evaluation completed. Results saved to {output_path}.")