# Gender representation bias analysis

Annotate a given dataset for gender representation using API inference.

To annotate data for validation, uncomment the lines ending with `# Validation`.

In [None]:
import os
import json
from pathlib import Path
from openai import OpenAI
from together import Together
from tqdm.notebook import tqdm
import nltk
from nltk.tokenize import sent_tokenize

# Sentence-tokenizer models required by sent_tokenize. Recent NLTK releases
# (3.9+) load the "punkt_tab" resource instead of the pickled "punkt" models,
# so both are fetched to keep the notebook working across NLTK versions.
# On older NLTK versions the unknown "punkt_tab" id only reports an error and
# returns False; it does not raise.
nltk.download("punkt")
nltk.download("punkt_tab")

# Generator LLM: the model whose stored outputs are analyzed. The name is only
# used here to build the input/output folder paths below. Exactly one
# assignment should be active; the others are kept as ready-made alternatives.
#gen_model = "gpt-4o-mini"
#gen_model = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
#gen_model = "google/gemma-2-27b-it"
#gen_model = "mistralai/Mistral-7B-Instruct-v0.2"
#gen_model = "Qwen/Qwen2.5-7B-Instruct-Turbo"
#gen_model = "cjvt/GaMS-1B-Chat"
gen_model = "utter-project/EuroLLM-1.7B-Instruct"
#gen_model = "claude-3-7-sonnet-20250219"  # Validation

# Evaluator LLM: the model that receives the annotation prompts. Names starting
# with "gpt" are sent through the OpenAI client, all others through Together.
#eval_model = "gpt-4o-2024-08-06"
#eval_model = "gpt-4o-2024-11-20"
eval_model = "gpt-4.1-2025-04-14"
#eval_model = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
#eval_model = "deepseek-ai/DeepSeek-V3"

# Dataset and experiment identifiers; both are path components of the folders below.
dataset_name = "stories"
exp_name = "exp1"

# Language code: selects the prompt/examples files, the input sub-folder and
# the sentence-tokenizer language (see tokenizer_lang_dict).
lang = "cs"
#lang = "sl"

# Run index; only used by the validation variant of output_folder.
#run = 0  # Validation

prompt_pathname = Path(f"../../data/dataset-analysis/prompt-{lang}.txt") # prompt pattern containing the <EXAMPLES> and <SENTENCE> placeholders
examples_pathname = Path(f"../../data/dataset-analysis/examples-{lang}.txt") # few-shot examples substituted for <EXAMPLES>
input_folder_string = f"../../results/{dataset_name}/{exp_name}/{gen_model}/{lang}"
input_folder = Path(input_folder_string) # dataset to be analyzed (its *.json files)
# Annotated copies go to the same relative path with "/results/" swapped for "/grb/".
output_folder = Path(input_folder_string.replace("/results/", "/grb/"))
# Validation variant: additionally nests the output by evaluator model and run index.
#output_folder = Path(os.path.join(input_folder_string.replace("/results/", "/grb/"), f"{eval_model}", f"{run}"))  # Validation

# Maps the language code in `lang` to the language name passed to NLTK's sent_tokenize.
tokenizer_lang_dict = {"cs": "czech", "sl": "slovene"}

In [None]:
# Read the prompt pattern and the few-shot examples into memory as UTF-8 text.
prompt_template = prompt_pathname.read_text(encoding="utf-8")
examples = examples_pathname.read_text(encoding="utf-8")

In [None]:
def format_prompt(sentence):
    """Build the evaluator prompt for a single sentence.

    The "<EXAMPLES>" placeholder of the prompt template is filled with the
    few-shot examples first, then every "<SENTENCE>" placeholder in the
    resulting text (including any that came from the examples) is replaced
    with ``sentence``.
    """
    with_examples = prompt_template.replace("<EXAMPLES>", examples)
    return with_examples.replace("<SENTENCE>", sentence)

In [None]:
def parse_analysis_response(response):
    """Parse the evaluator's reply into a list of gendered-word annotations.

    The reply is expected to be either the single token ``0`` (nothing to
    annotate) or one ``word - M`` / ``word - F`` entry per line.

    Args:
        response: Raw text returned by the evaluator model. ``None`` is
            accepted and treated as an empty reply, since chat-completion
            message content can be missing.

    Returns:
        A list of ``{"word": ..., "class": "M" | "F"}`` dicts in the order the
        entries appear. Lines that do not consist of exactly two " - "
        separated parts, or whose label is not ``M``/``F``, are ignored.
    """
    if response is None:
        return []

    text = response.strip()
    if text == "0":
        return []

    results = []
    for line in text.split("\n"):
        parts = line.strip().split(" - ")
        if len(parts) != 2:
            continue
        # Strip both fields so extra padding around the separator
        # (e.g. "word -  M") does not cause a valid label to be rejected.
        word, gender = (part.strip() for part in parts)
        if gender in {"M", "F"}:
            results.append({"word": word, "class": gender})
    return results

In [None]:
def analyze_text(client, text):
    """Annotate every sentence of ``text`` with the evaluator model.

    The text is split into sentences with NLTK using the tokenizer language
    configured for ``lang``. Each sentence is sent to ``eval_model`` through
    ``client`` in its own chat-completion request, and the reply is parsed
    with ``parse_analysis_response``.

    Args:
        client: API client exposing ``chat.completions.create``.
        text: The text to analyze.

    Returns:
        A list with one ``{"sentence": ..., "analysis": ...}`` dict per
        sentence, in sentence order.
    """
    tokenizer_language = tokenizer_lang_dict[lang]
    annotated = []

    for sentence in tqdm(sent_tokenize(text, language=tokenizer_language)):
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": format_prompt(sentence)},
        ]
        completion = client.chat.completions.create(model=eval_model, messages=messages)
        # Only the first choice of the completion is used.
        reply = completion.choices[0].message.content
        annotated.append(
            {"sentence": sentence, "analysis": parse_analysis_response(reply)}
        )

    return annotated

In [None]:
# Choose the API client from the evaluator model name: OpenAI for names
# starting with "gpt", Together for everything else.
client = OpenAI() if eval_model.startswith("gpt") else Together()

# Make sure the destination directory (and any missing parents) exists.
output_folder.mkdir(parents=True, exist_ok=True)

# Annotate each JSON file of the input folder and store an enriched copy under
# the same file name in the output folder.
for file in tqdm(list(input_folder.glob("*.json"))):
    with file.open("r", encoding="utf-8") as f:
        data = json.load(f)

    # Only the first assistant message of the conversation is analyzed; an
    # empty string is used when there is none.
    assistant_contents = (
        turn["content"] for turn in data["conversation"] if turn["role"] == "assistant"
    )
    assistant_msg = next(assistant_contents, "")

    grb_data = analyze_text(client, assistant_msg)
    data["grb"] = grb_data

    output_path = output_folder / file.name
    with output_path.open("w", encoding="utf-8") as out:
        json.dump(data, out, ensure_ascii=False, indent=4)