In [14]:
import os
from IPython.display import display
import pandas as pd
import json
from datasets import load_dataset
import numpy as np
from comet import download_model, load_from_checkpoint
# from comet.metrics import COMET


In [15]:
# Resolve the checkpoint for the COMET model first, then load the model object
# that the evaluation cells below use for scoring.
comet_checkpoint_path = download_model("Unbabel/wmt22-comet-da")
comet_metric = load_from_checkpoint(comet_checkpoint_path)


Fetching 5 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 10727.12it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../home/manos/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [16]:
# https://github.com/amazon-science/machine-translation-gender-eval/blob/main/accuracy_metric.py
def get_words(line):
    """
    Tokenize a line on whitespace and keep only the distinct tokens.

    :param line: Text to split into words.
    :return: Set of the whitespace-separated tokens found in the line
        (empty set for an empty or whitespace-only line).
    """
    tokens = line.strip().split()
    return set(tokens)


def get_trg_correct_incorrect(trg_line, orig_ref, ctf_ref):
    """
    Find which reference-specific words appear in the translation.

    Each reference is reduced to the words it does not share with the other
    reference; the translation's words are then intersected with each of
    those two sets.

    :param trg_line: Translation output to inspect.
    :param orig_ref: Original reference.
    :param ctf_ref: Counterfactual reference.
    :return: Tuple ``(trg_correct, trg_incorrect)``: the translation words found
        only in the original reference, and those found only in the
        counterfactual reference.
    """
    hyp_tokens = get_words(trg_line)
    orig_tokens = get_words(orig_ref)
    ctf_tokens = get_words(ctf_ref)

    # Words that distinguish one reference from the other.
    only_in_orig = orig_tokens.difference(ctf_tokens)
    only_in_ctf = ctf_tokens.difference(orig_tokens)

    return hyp_tokens.intersection(only_in_orig), hyp_tokens.intersection(only_in_ctf)


def gender_decision(trg_line, orig_ref, ctf_ref):
    """
    Classify a translation by its overlap with the two references.

    The decision uses the words unique to each reference (see
    ``get_trg_correct_incorrect``). Any overlap with the counterfactual-only
    words takes precedence over overlap with the original-only words.

    :param trg_line: Sentence from translation output for which to check gender.
    :param orig_ref: Original (Correct) reference.
    :param ctf_ref: Counterfactual reference.
    :return: The string "Incorrect" if the translation contains a
        counterfactual-only word, otherwise "Correct" if it contains an
        original-only word, otherwise the string "None" (not the ``None`` object).
    """
    matched_orig, matched_ctf = get_trg_correct_incorrect(trg_line, orig_ref, ctf_ref)

    if matched_ctf:
        return "Incorrect"
    if matched_orig:
        return "Correct"
    return "None"


In [17]:
def read_gender_data(language):
    """
    Load the per-segment gender annotations for one English->`language` pair.

    :param language: Target-language code inserted into the file name
        (``geneval-context-en{language}-genders.txt``).
    :return: DataFrame parsed from the tab-separated annotation file.
    """
    genders_path = f"./data/mtgeneval/context_genders/geneval-context-en{language}-genders.txt"
    return pd.read_csv(genders_path, sep="\t")

    

def read_translations(language:str, metric_name=None):
    """
    Load source sentences and their selected translation from a generations file.

    :param language: Language code used as a directory name in the results path.
    :param metric_name: When ``None`` the greedy generations are read; otherwise
        the generations stored under ``reranking/{metric_name}``.
    :return: Tuple ``(sources, outs)`` of equal-length lists: each record's
        ``'input'`` and the first entry of its ``'outputs'`` list.
    """
    if metric_name is not None:
        generations_file = f"./results-copied/translations-qad-ambiguous/{language}/Unbabel--TowerInstruct-7B-v0.2/reranking/{metric_name}/generations.json"
    else:
        generations_file = f"./results-copied/translations-qad-ambiguous/{language}/Unbabel--TowerInstruct-7B-v0.2/greedy/generations.json"

    with open(generations_file, "r", encoding='utf-8') as handle:
        records = json.load(handle)

    # Two separate passes, one per field, over the same record list.
    sources = [record['input'] for record in records]
    outs = [record['outputs'][0] for record in records]
    return sources, outs

    

In [32]:
# Target languages whose MT-GenEval contextual test sets are evaluated.
langs = ["it", "de", "es", "pt", "fr", "ru"]
# Decoding setups to compare: None selects the greedy generations, any other
# value selects the generations reranked with that metric (see read_translations).
metric_names = [None, "Unbabel--wmt22-cometkiwi-da", "Unbabel--wmt23-cometkiwi-da-xxl"]


def _comet_scores(sources, translations, references):
    """Score each (source, translation, reference) triple with `comet_metric`.

    :param sources: Source sentences.
    :param translations: System translations, aligned with `sources`.
    :param references: Reference translations, aligned with `sources`.
    :return: The per-segment ``'scores'`` entry of the COMET prediction output.
    """
    samples = [
        {"mt": mt, "ref": ref, "src": src}
        for src, mt, ref in zip(sources, translations, references)
    ]
    return comet_metric.predict(
        samples, batch_size=256, gpus=1, progress_bar=True, devices=[0]
    )["scores"]


# Not populated by this cell; kept so that the name stays defined as before.
scores = []
# One frame per (language, decoding setup); concatenated once after the loop.
per_run_frames = []
for lang in langs:
    print(lang)
    mydata = load_dataset("gsarti/mt_geneval", f"context_en_{lang}", split="test").to_pandas()
    gender_info = read_gender_data(lang)

    # Columns shared by every decoding setup of this language.
    base_df = mydata.copy()
    base_df["language"] = lang
    base_df["gender"] = gender_info["gender"]
    # reference_original carries the annotated gender, reference_flipped the
    # other one; route them into fixed male / female columns.
    is_male = base_df["gender"] == "male"
    is_female = base_df["gender"] == "female"
    base_df["reference_male"] = base_df["reference_original"].where(is_male, base_df["reference_flipped"])
    base_df["reference_female"] = base_df["reference_original"].where(is_female, base_df["reference_flipped"])

    for metric in metric_names:
        print(metric)
        df = base_df.copy()
        sources, translations = read_translations(lang, metric_name=metric)
        # Fail before the expensive COMET passes: zip() below would otherwise
        # silently truncate and the mismatch would only surface on assignment.
        # NOTE(review): only lengths are checked; row order of the generations
        # file is assumed to match the dataset -- confirm.
        if not (len(sources) == len(translations) == len(df)):
            raise ValueError(
                f"Length mismatch for lang={lang!r}, metric={metric!r}: "
                f"{len(sources)} sources, {len(translations)} translations, {len(df)} dataset rows"
            )
        df["type"] = "greedy" if metric is None else f"reranking-{metric}"

        comet_f = _comet_scores(sources, translations, df["reference_female"])
        comet_m = _comet_scores(sources, translations, df["reference_male"])
        # The male reference is passed as the "original" reference, so "Correct"
        # means male-specific overlap and "Incorrect" female-specific overlap.
        gender_decisions = [
            gender_decision(trg_line, orig_ref, ctf_ref)
            for trg_line, orig_ref, ctf_ref in zip(translations, df["reference_male"], df["reference_female"])
        ]
        df["gender_decision"] = gender_decisions
        df["comet_f"] = comet_f
        df["comet_m"] = comet_m

        per_run_frames.append(df)

all_data_df = pd.concat(per_run_frames)
all_data_df.head(10)
    

it


/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


None


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.08s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt22-cometkiwi-da


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.11s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt23-cometkiwi-da-xxl


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.11s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

de


/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


None


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.10s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt22-cometkiwi-da


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.12s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt23-cometkiwi-da-xxl


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.11s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

es


/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


None


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.10s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt22-cometkiwi-da


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.12s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt23-cometkiwi-da-xxl


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.11s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

pt


/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


None


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.06s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt22-cometkiwi-da


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.08s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt23-cometkiwi-da-xxl


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.06s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

fr


/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


None


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.19s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt22-cometkiwi-da


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.20s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt23-cometkiwi-da-xxl


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.19s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

ru


/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


None


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.16s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt22-cometkiwi-da


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.17s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unbabel--wmt23-cometkiwi-da-xxl


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.16s/it]
/mnt/data-poseidon/manos/gender_bias_qe/venvs/venv-gender-bias-qe-final/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /mnt/data-poseidon/manos/gender_bias_qe/venvs/venv- ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████

Unnamed: 0,orig_id,context,source,reference_original,reference_flipped,language,gender,reference_male,reference_female,type,gender_decision,comet_f,comet_m
0,0,"During the 1870s, several incidents made Alexa...",The first incident involved Seligman's declini...,Il primo incidente fu dovuto al rifiuto da par...,Il primo incidente fu dovuto al rifiuto da par...,it,male,Il primo incidente fu dovuto al rifiuto da par...,Il primo incidente fu dovuto al rifiuto da par...,greedy,Correct,0.820915,0.84802
1,1,He then went to Sydney and then Melbourne hold...,"Evergood was a capable artist, who mostly pain...","Evergood fu un artista capace, che dipinse pri...","Evergood fu un'artista capace, che dipinse pri...",it,male,"Evergood fu un artista capace, che dipinse pri...","Evergood fu un'artista capace, che dipinse pri...",greedy,Correct,0.939206,0.940351
2,2,His work was originally designed for the gate ...,"The first design was ready in 1970 but, due to...","La prima versione era pronta nel 1970 ma, a ca...","La prima versione era pronta nel 1970 ma, a ca...",it,male,"La prima versione era pronta nel 1970 ma, a ca...","La prima versione era pronta nel 1970 ma, a ca...",greedy,,0.880402,0.916238
3,3,Motl is described as having a natural talent f...,Some episodes indicate that the author intende...,Alcuni episodi sembrano indicare che l'autore ...,Alcuni episodi sembrano indicare che l'autrice...,it,male,Alcuni episodi sembrano indicare che l'autore ...,Alcuni episodi sembrano indicare che l'autrice...,greedy,Correct,0.898696,0.89323
4,4,In 1866 he became an instructor in palaeontolo...,"Although an excellent teacher, and especially ...","Nonostante fosse un professore eccellente, un ...","Nonostante fosse una professoressa eccellente,...",it,male,"Nonostante fosse un professore eccellente, un ...","Nonostante fosse una professoressa eccellente,...",greedy,Correct,0.844036,0.895334
5,5,It was found that three cooks had prepared the...,The lesions on the cook’s fingers were found t...,Si scoprì che le lesioni sulle dita del cuoco ...,Si scoprì che le lesioni sulle dita della cuoc...,it,male,Si scoprì che le lesioni sulle dita del cuoco ...,Si scoprì che le lesioni sulle dita della cuoc...,greedy,Incorrect,0.942081,0.935867
6,6,Since a journal is written from the perspectiv...,"The form lends itself to plotlessness, since t...",La forma tende a mettere in secondo piano la t...,La forma tende a mettere in secondo piano la t...,it,male,La forma tende a mettere in secondo piano la t...,La forma tende a mettere in secondo piano la t...,greedy,Correct,0.854829,0.869448
7,7,Various attempts have been made to clear up th...,Methodius had a comprehensive philosophical ed...,Metodio aveva una formazione filosofica comple...,Metodio aveva una formazione filosofica comple...,it,male,Metodio aveva una formazione filosofica comple...,Metodio aveva una formazione filosofica comple...,greedy,Correct,0.894835,0.950736
8,8,"In 1849, he was appointed to a teaching post i...","Fabre was a popular teacher, physicist, chemis...","Fabre era un insegnante popolare, fisico, chim...","Fabre era un'insegnante popolare, fisica, chim...",it,male,"Fabre era un insegnante popolare, fisico, chim...","Fabre era un'insegnante popolare, fisica, chim...",greedy,Correct,0.88323,0.938811
9,9,Though baptised as Emilio Stanley by Italian m...,Kibaki turned out to be an exemplary student.,Kibaki si rivelò uno studente esemplare.,Kibaki si rivelò una studentessa esemplare.,it,male,Kibaki si rivelò uno studente esemplare.,Kibaki si rivelò una studentessa esemplare.,greedy,Correct,0.881943,0.910661


In [33]:
# Sanity check: list the distinct decoding setups present in the combined frame.
pd.unique(all_data_df["type"])

array(['greedy', 'reranking-Unbabel--wmt22-cometkiwi-da',
       'reranking-Unbabel--wmt23-cometkiwi-da-xxl'], dtype=object)

In [35]:
# Summary per decoding setup ("type"), pooled over all languages.
# "Accuracy_M" is the share of rows labelled "Correct" and "Accuracy_F" the
# share labelled "Incorrect" by gender_decision.
for type_key, type_rows in all_data_df.groupby(["type"]):
    n_rows = type_rows.shape[0]
    n_correct = type_rows[type_rows["gender_decision"] == "Correct"].shape[0]
    n_incorrect = type_rows[type_rows["gender_decision"] == "Incorrect"].shape[0]
    comet_f_pct = type_rows["comet_f"] * 100
    comet_m_pct = type_rows["comet_m"] * 100

    print(type_key)
    print("comet_f: ", np.mean(comet_f_pct))
    print("comet_m: ", np.mean(comet_m_pct))
    print(
        "Accuracy_M: ", round(n_correct / n_rows, 2),
        "Accuracy_F: ", round(n_incorrect / n_rows, 2),
    )

('greedy',)
comet_f:  86.7316002191001
comet_m:  88.45331172390019
Accuracy_M:  0.6 Accuracy_F:  0.17
('reranking-Unbabel--wmt22-cometkiwi-da',)
comet_f:  87.66178641911154
comet_m:  89.38722326767122
Accuracy_M:  0.6 Accuracy_F:  0.15
('reranking-Unbabel--wmt23-cometkiwi-da-xxl',)
comet_f:  87.46386991514193
comet_m:  89.10848779740323
Accuracy_M:  0.58 Accuracy_F:  0.17


In [36]:
# Same summary, broken down by (language, decoding setup).
# "Accuracy_M" is the share of rows labelled "Correct" and "Accuracy_F" the
# share labelled "Incorrect" by gender_decision.
for lang_type_key, lang_type_rows in all_data_df.groupby(["language", "type"]):
    n_rows = lang_type_rows.shape[0]
    n_correct = lang_type_rows[lang_type_rows["gender_decision"] == "Correct"].shape[0]
    n_incorrect = lang_type_rows[lang_type_rows["gender_decision"] == "Incorrect"].shape[0]
    comet_f_pct = lang_type_rows["comet_f"] * 100
    comet_m_pct = lang_type_rows["comet_m"] * 100

    print(lang_type_key)
    print("comet_f: ", np.mean(comet_f_pct))
    print("comet_m: ", np.mean(comet_m_pct))
    print(
        "Accuracy_M: ", round(n_correct / n_rows, 2),
        "Accuracy_F: ", round(n_incorrect / n_rows, 2),
    )


('de', 'greedy')
comet_f:  85.79945681311868
comet_m:  87.55245886065744
Accuracy_M:  0.6 Accuracy_F:  0.14
('de', 'reranking-Unbabel--wmt22-cometkiwi-da')
comet_f:  87.09587809172544
comet_m:  88.69929933006114
Accuracy_M:  0.61 Accuracy_F:  0.13
('de', 'reranking-Unbabel--wmt23-cometkiwi-da-xxl')
comet_f:  86.7828196070411
comet_m:  88.32224016297947
Accuracy_M:  0.58 Accuracy_F:  0.16
('es', 'greedy')
comet_f:  87.5860335074202
comet_m:  89.38067698456945
Accuracy_M:  0.65 Accuracy_F:  0.19
('es', 'reranking-Unbabel--wmt22-cometkiwi-da')
comet_f:  87.66792303454267
comet_m:  89.50606274561291
Accuracy_M:  0.66 Accuracy_F:  0.17
('es', 'reranking-Unbabel--wmt23-cometkiwi-da-xxl')
comet_f:  87.66287116564973
comet_m:  89.27949787708965
Accuracy_M:  0.63 Accuracy_F:  0.21
('fr', 'greedy')
comet_f:  85.73130448641183
comet_m:  87.79363691047499
Accuracy_M:  0.63 Accuracy_F:  0.15
('fr', 'reranking-Unbabel--wmt22-cometkiwi-da')
comet_f:  86.35298172054343
comet_m:  88.42315870160076
Accu

In [None]:
# `scores_df` was never created by any executed cell (the `scores` list is never
# filled), so this cell raised NameError on a fresh kernel. Build the summary
# here from `all_data_df`: one row per (language, decoding setup) with mean
# COMET scores and the shares of "Correct" / "Incorrect" gender decisions.
# NOTE(review): the column set is inferred from the commented-out
# `scores.append(...)` line in the evaluation cell -- confirm it is what was intended.
scores_df = (
    all_data_df
    .assign(
        is_correct=lambda d: d["gender_decision"].eq("Correct"),
        is_incorrect=lambda d: d["gender_decision"].eq("Incorrect"),
    )
    .groupby(["language", "type"], as_index=False)
    .agg(
        comet_f=("comet_f", "mean"),
        comet_m=("comet_m", "mean"),
        correct_ratio=("is_correct", "mean"),
        incorrect_ratio=("is_incorrect", "mean"),
    )
    .rename(columns={"language": "lang"})
)

# One table per language, comparing the decoding setups side by side.
for _, lang_scores in scores_df.groupby("lang"):
    display(lang_scores)