# Calculate statistics of an annotated dataset

For a given dataset folder with annotated texts, calculate the gender representation bias (ratio of masculine to feminine references).

In [None]:
import json
from pathlib import Path
from tqdm.notebook import tqdm

# Model whose generated texts are analysed; keep exactly one line active.
gen_model = "gpt-4o-mini"
#gen_model = "google/gemma-2-27b-it"
#gen_model = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
#gen_model = "mistralai/Mistral-7B-Instruct-v0.2"
#gen_model = "Qwen/Qwen2.5-7B-Instruct-Turbo"
#gen_model = "cjvt/GaMS-1B-Chat"
#gen_model = "utter-project/EuroLLM-1.7B-Instruct"

# Dataset and experiment identifiers; both become path components below.
dataset_name = "stories"
exp_name = "exp1"

# Language selector (also a path component); keep exactly one line active.
#lang = "cs"
lang = "sl"

# The annotated input lives under ".../grb/..."; results are written to the
# same relative location under ".../grb_eval/...".
input_folder_string = "/".join(
    ("../../grb", dataset_name, exp_name, gen_model, lang)
)
input_folder = Path(input_folder_string)  # dataset to be analyzed
output_folder = Path(input_folder_string.replace("/grb/", "/grb_eval/"))
summary_file = output_folder.joinpath("_stats.json")  # aggregated M/F counts

In [None]:
# Create the output folder (and any missing parents); a folder that already
# exists is left untouched.
output_folder.mkdir(exist_ok=True, parents=True)

In [None]:
def compute_statistics(grb_data):
    """Count masculine ("M") and feminine ("F") labels in annotated items.

    Args:
        grb_data: Iterable of dicts. Each dict may carry an "analysis" list
            whose entries are dicts with a "class" label. ``None`` (e.g. a
            JSON ``null``) is treated as an empty collection, both for
            ``grb_data`` itself and for an item's "analysis" value.

    Returns:
        A dict ``{"M": <count>, "F": <count>}``. Entries whose "class" is
        missing, is not a string, or is any other label are ignored.
    """
    stats = {"M": 0, "F": 0}
    for item in grb_data or ():
        # ``or ()`` also covers an explicit null stored under "analysis".
        for entry in item.get("analysis") or ():
            label = entry.get("class")
            # The isinstance guard keeps unhashable labels (lists, dicts)
            # from raising TypeError in the membership test.
            if isinstance(label, str) and label in stats:
                stats[label] += 1
    return stats

In [None]:
# Annotate every input file with its M/F counts and write the enriched copy
# under the same file name in the output folder.
json_files = list(input_folder.glob("*.json"))  # a list lets tqdm show a total
for json_file in tqdm(json_files):
    data = json.loads(json_file.read_text(encoding="utf-8"))

    # Counts are stored next to the original content under "statistics".
    data["statistics"] = compute_statistics(data.get("grb", []))

    output_path = output_folder / json_file.name
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# Summary statistics

# Running totals over the per-file "statistics" written to the output folder.
total_M = 0
total_F = 0

# Go through all JSON files and accumulate stats
for json_file in output_folder.glob("*.json"):
    # The summary file lives in the same folder; never count it as input.
    if json_file.name == summary_file.name:
        continue

    with json_file.open("r", encoding="utf-8") as f:
        data = json.load(f)

    # Files without a "statistics" entry contribute nothing.
    stats = data.get("statistics", {})
    total_M += stats.get("M", 0)
    total_F += stats.get("F", 0)

# Masculine-to-feminine ratio; infinite in memory when there are no feminine
# references at all.
ratio = total_M / total_F if total_F > 0 else float('inf')

# Save summary.
# json.dump would serialise float('inf') as the bare token `Infinity`, which
# is not valid JSON and is rejected by strict parsers, so an undefined ratio
# is stored as null instead.
summary_data = {
    "M": total_M,
    "F": total_F,
    "M_F_ratio": round(ratio, 2) if total_F > 0 else None,
}

with summary_file.open("w", encoding="utf-8") as f:
    json.dump(summary_data, f, ensure_ascii=False, indent=4)