# Gender representation bias quantification

Analysis of gender representation bias in a given dataset using Groq API inference.

In [None]:
import os
import openpyxl
from groq import Groq
from tqdm.notebook import tqdm_notebook

# Model identifier sent with every chat-completion request.
model = "llama-3.3-70b-versatile"
#model = "qwen-2.5-32b"

# Language code that selects the prompt, examples, skip list and dataset files.
lang = "es"
#lang = "va"

# Language-specific inputs for building the analysis prompt.
analysis_dir = "../../data/dataset-analysis"
prompt_pathname = f"{analysis_dir}/prompt-{lang}.txt"  # prompt pattern
examples_pathname = f"{analysis_dir}/examples-{lang}.txt"  # few-shot examples
skiplist_pathname = f"{analysis_dir}/skiplist-{lang}.txt"  # skip list (words to be ignored)

# Dataset to be analyzed (one sentence per line).
dataset_pathname = f"../../data/corpora-en-es/Europarl-v7.en-es.sample.01.{lang}.txt"

# Results file: same relative location under /results/, tagged with the model name.
results_suffix = f"_{model}_grbresults.xlsx"
results_pathname = dataset_pathname.replace("/data/", "/results/").replace(".txt", results_suffix)

In [None]:
# Inference using Groq API
def inference(client, prompt, model_name=None):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        client: Object exposing ``chat.completions.create`` (a ``Groq``
            client in this notebook).
        prompt: Full prompt text; sent as the only message, with role
            ``"user"``.
        model_name: Model identifier to request. When ``None`` (default) the
            module-level ``model`` setting is used.

    Returns:
        The message content of the first returned choice. If that content is
        ``None`` an empty string is returned instead, so callers can always
        treat the result as text (split it, filter it, store it).
    """
    response = client.chat.completions.create(
        model=model if model_name is None else model_name,
        messages=[{"role": "user", "content": prompt}],
    )
    content = response.choices[0].message.content
    # Normalize a missing reply to "" rather than leaking None to callers
    # that immediately call string methods on the result.
    return content if content is not None else ""

In [None]:
# Drop every response line whose leading token (the text before ' - ') is in the skip list
def filter_skiplist(response_text, skip_words):
    """Return ``response_text`` without the lines whose first token is skipped.

    The first token of a line is the text before the first ``" - "``; it is
    stripped and upper-cased before being looked up in ``skip_words``. Lines
    that do not contain the delimiter are always kept. The remaining lines are
    joined with ``"\\n"``.
    """
    delimiter = " - "

    def is_skipped(entry):
        head, found, _ = entry.partition(delimiter)
        # `found` is empty when the delimiter is absent: such lines are kept.
        return bool(found) and head.strip().upper() in skip_words

    kept = (entry for entry in response_text.splitlines() if not is_skipped(entry))
    return "\n".join(kept)

In [None]:
def create_excel_with_analysis(client, prompt_pattern, examples, sentences, results_pathname, skiplist=None):
    """Analyze every sentence with the model and store the results in an Excel file.

    The workbook has one header row (``ID``, ``Sentence``, ``Analysis``)
    followed by one row per sentence, with IDs starting at 1.

    Args:
        client: API client handed to ``inference``.
        prompt_pattern: Prompt template containing the ``<EXAMPLES>`` and
            ``<SENTENCE>`` placeholders.
        examples: Text substituted for ``<EXAMPLES>``.
        sentences: Sequence of sentences (must support ``len``); a trailing
            line terminator on each item is removed.
        results_pathname: Path of the ``.xlsx`` file to write.
        skiplist: Optional collection of upper-cased words; when given, the
            response is passed through ``filter_skiplist`` before storing.
    """
    # Create a new workbook and select the active worksheet
    workbook = openpyxl.Workbook()
    worksheet = workbook.active

    # Add headers to the worksheet
    worksheet.append(["ID", "Sentence", "Analysis"])

    # Save right away so the results file exists even when `sentences` is empty.
    workbook.save(results_pathname)

    # The examples are the same for every sentence, so substitute them once.
    prompt_with_examples = prompt_pattern.replace("<EXAMPLES>", examples)

    # Iterate through the sentences and perform analysis
    for i, sentence in tqdm_notebook(enumerate(sentences, start=1), total=len(sentences)):
        # Remove only the line terminator. Slicing off the last character
        # would truncate a final line that has no trailing newline.
        text = sentence.rstrip("\r\n")
        prompt = prompt_with_examples.replace("<SENTENCE>", text)
        response = inference(client, prompt)

        # Filter out skiplist words from the response
        filtered_response = filter_skiplist(response, skiplist) if skiplist is not None else response

        # Append the sentence and its analysis to the worksheet
        worksheet.append([i, text, filtered_response])

        # Save the workbook after every sentence (presumably so that partial
        # results are kept if the run is interrupted)
        workbook.save(results_pathname)

In [None]:
# Initialize statistics
def initialize_counters():
    """Return a new statistics dictionary with every counter set to zero."""
    counter_names = (
        'total_words',
        'count_M',
        'count_F',
        'count_N',
        'count_P',
        'count_P_M',
        'count_P_F',
    )
    return dict.fromkeys(counter_names, 0)

In [None]:
# Update statistics with a single sentence analysis
def update_counters(counters, sentence_analysis):
    """Add the tags found in one sentence analysis to ``counters`` (in place).

    Args:
        counters: Dictionary with the keys produced by
            ``initialize_counters``; it is modified in place.
        sentence_analysis: Analysis text with one entry per line, or
            ``None``/empty when there is nothing to count (for example an
            empty spreadsheet cell).

    Each non-blank line counts as one identified word. A line increments a
    counter whenever it contains that counter's marker substring, so one line
    can increment several counters (``'P, M'`` also matches ``', M'`` and
    ``'P, '``).
    """
    # An empty or missing analysis contributes nothing (and must not crash).
    if not sentence_analysis:
        return

    # NOTE(review): markers are matched anywhere in the line, including the
    # word itself — confirm this is acceptable for the response format.
    marker_to_counter = (
        (', M', 'count_M'),
        (', F', 'count_F'),
        ('N, ', 'count_N'),
        ('P, ', 'count_P'),
        ('P, M', 'count_P_M'),
        ('P, F', 'count_P_F'),
    )

    # Blank lines are not words, so they are left out of the total.
    entries = [line for line in sentence_analysis.split('\n') if line.strip()]
    counters['total_words'] += len(entries)

    for entry in entries:
        for marker, key in marker_to_counter:
            if marker in entry:
                counters[key] += 1

In [None]:
# Analyze the saved Excel file
def analyze_excel_file(pathname):
    """Load the results workbook at ``pathname`` and return aggregated counters.

    Every row after the first one is read from the active worksheet, and the
    value of its third column is passed to ``update_counters``.
    """
    sheet = openpyxl.load_workbook(pathname).active
    totals = initialize_counters()

    # min_row=2 skips the first row; values_only yields plain cell values.
    data_rows = sheet.iter_rows(min_row=2, values_only=True)
    for record in data_rows:
        update_counters(totals, record[2])

    return totals

In [None]:
def print_results(counters, pathname=None):
    """Print the statistics summary and optionally write it to a text file.

    Args:
        counters: Dictionary with the keys ``total_words``, ``count_M``,
            ``count_F``, ``count_N``, ``count_P``, ``count_P_M`` and
            ``count_P_F``.
        pathname: When not ``None``, the summary is also written to this path
            (UTF-8, overwriting any existing file).

    The last line reports the (P, M) : (P, F) ratio normalized to ``x : 1``.
    With no (P, F) words the ratio is ``inf`` when there are (P, M) words and
    ``nan`` when there are none (0 : 0 is undefined, not infinite).
    """
    p_m = counters['count_P_M']
    p_f = counters['count_P_F']
    if p_f != 0:
        ratio = p_m / p_f
    elif p_m != 0:
        ratio = float('inf')
    else:
        ratio = float('nan')

    # Prepare statistics output
    summary = (f"Total number of identified words: {counters['total_words']}\n"
               f"Number of words that are *, M: {counters['count_M']}\n"
               f"Number of words that are *, F: {counters['count_F']}\n"
               f"Number of words that are N, *: {counters['count_N']}\n"
               f"Number of words that are P, *: {counters['count_P']}\n"
               f"Number of words that are P, M: {p_m}\n"
               f"Number of words that are P, F: {p_f}\n"
               f"Ratio (P, M) : (P, F): {ratio:.2f} : 1")

    print(summary)  # Print to terminal
    if pathname is not None:
        # Explicit encoding, consistent with the other file accesses in this file.
        with open(pathname, 'w', encoding="utf-8") as f:
            print(summary, file=f)  # Print to file

In [None]:
# Read files
with open(examples_pathname, 'r', encoding="utf-8") as examples_file:
    examples = examples_file.read()

with open(dataset_pathname, 'r', encoding="utf-8") as dataset_file:
    sentences = dataset_file.readlines()

with open(prompt_pathname, 'r', encoding="utf-8") as prompt_file:
    prompt_pattern = prompt_file.read()

# The skip list is optional: set skiplist_pathname to None to disable it.
# The check has to happen before open(), and `skiplist` must always be
# defined because it is passed on to the analysis step.
skiplist = None
if skiplist_pathname is not None:
    with open(skiplist_pathname, 'r', encoding='utf-8') as skiplist_file:
        # One word per line; blank lines are ignored and words are upper-cased.
        skiplist = {line.strip().upper() for line in skiplist_file if line.strip()}

In [None]:
# Run inference
# The API key is read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Make sure the output directory exists before the results file is written.
results_dir = os.path.dirname(results_pathname)
os.makedirs(results_dir, exist_ok=True)

create_excel_with_analysis(
    client,
    prompt_pattern,
    examples,
    sentences,
    results_pathname,
    skiplist=skiplist,
)

In [None]:
# Analysis summary (statistics)
# The text summary is written next to the workbook, with a .txt extension.
summary_pathname = results_pathname.replace(".xlsx", ".txt")
stats = analyze_excel_file(results_pathname)
print_results(stats, pathname=summary_pathname)

# Kept as the final bare expression so its value (the results file name)
# is the cell's result.
os.path.basename(results_pathname)