In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import os
# os.chdir("../")

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# from notebooks.utils import gini
from scipy.stats import mannwhitneyu, pearsonr, wilcoxon
from pprint import pprint
import numpy as np
import matplotlib as mpl
from src import eval_metrics

# Global plot styling: seaborn "paper" context on a white grid, with a light
# dashed grid drawn on every axes.
sns.set_theme(context="paper", style="whitegrid")
plt.rcParams.update(
    {
        "axes.grid": True,
        "grid.color": "lightgray",
        "grid.linestyle": "--",
        "grid.linewidth": 0.5,
    }
)

# Do not route text rendering through LaTeX.
plt.rcParams["text.usetex"] = False


In [5]:
import seaborn as sns
# Switch matplotlib to the "seaborn-v0_8-whitegrid" style sheet.
# NOTE(review): this runs after the rcParams tweaks made in the previous cell and
# may override some of them (e.g. the grid settings) — confirm the intended order.
plt.style.use("seaborn-v0_8-whitegrid")


In [25]:
# Load the raw per-sample scores for every (language, model, context variant)
# and stack them into one long DataFrame, `data_all`.
DATASET = "MTGenEval"

# Display name -> result-folder id and model family ("neural_metric" or "LLM").
MODELS = {
    "Kiwi 22": {"model_id": "Unbabel--wmt22-cometkiwi-da", "type": "neural_metric"},
    "Kiwi 23 XL": {"model_id": "Unbabel--wmt23-cometkiwi-da-xl", "type": "neural_metric"},
    "Kiwi 23 XXL": {"model_id": "Unbabel--wmt23-cometkiwi-da-xxl", "type": "neural_metric"},
    "xCOMET XL": {"model_id": "Unbabel--XCOMET-XL", "type": "neural_metric"},
    "xCOMET XXL": {"model_id": "Unbabel--XCOMET-XXL", "type": "neural_metric"},
    "MetricX23 LARGE": {"model_id": "google--metricx-23-qe-large-v2p0", "type": "neural_metric"},
    "MetricX23 XL": {"model_id": "google--metricx-23-qe-xl-v2p0", "type": "neural_metric"},
    "Llama 3.1 70B": {"model_id": "meta-llama--Meta-Llama-3.1-70B-Instruct", "type": "LLM"},
    "Mistral 7B": {"model_id": "mistralai--Mistral-7B-Instruct-v0.2", "type": "LLM"},
    "Gemma 2 9B": {"model_id": "google--gemma-2-9b-it", "type": "LLM"},
    "GPT 4": {"model_id": "gpt-4o-2024-05-13", "type": "LLM"},
}

LANGS = ["it", "es", "de", "pt", "ar", "fr", "hi", "ru"]

tot = list()

# How the context is used in the hypothesis (neural metrics):
# - with-original-contexts   -- the source context is used (prepended) in the hypothesis
# - with-translated-contexts -- the source context translated into the target
#                               language is used (prepended) in the hypothesis
metrics_context_options = {"with-original-contexts": "original ctx", "with-translated-contexts": "translated ctx"}
# MT system that produced the translated context: folder name -> display name.
translation_model_dict = {"facebook--nllb-200-3.3B": "NLLB 3.3B", "google-translate": "GT"}
# Prompt variants available for LLMs: folder name -> display name.
llms_ctx_options = {"standard": "standard ctx", "paraphrased": "paraphrased ctx"}

for lang in LANGS:
    # Per-sample source gender for this language pair (tab-separated file).
    gender_info = pd.read_csv(f"./data/mtgeneval/context_genders/geneval-context-en{lang}-genders.txt", sep="\t")
    for m, info in MODELS.items():
        # Collect (display label, sub-directory) for every score file of this model,
        # in the order in which the files are loaded.
        if info["type"] == "neural_metric":
            variants = []
            for ctx_v, ctx_n in metrics_context_options.items():
                if ctx_v == "with-translated-contexts":
                    # One score file per translation system.
                    for trans_v, trans_m in translation_model_dict.items():
                        variants.append((m + "--" + ctx_n + "--" + trans_m, f"{ctx_v}/{trans_v}"))
                else:
                    variants.append((m + "--" + ctx_n, ctx_v))
        elif info["type"] == "LLM":
            variants = [
                (m + "--" + ctx_n, ctx_v)
                for ctx_v, ctx_n in llms_ctx_options.items()
                # The paraphrased variant was not run for GPT 4, so it is skipped.
                if not (m == "GPT 4" and ctx_n == "paraphrased ctx")
            ]
        else:
            # An explicit exception (instead of `assert False`) still fires when
            # Python runs with assertions disabled (-O).
            raise ValueError(f"Unknown model type {info['type']!r} for model {m!r}")

        for label, subdir in variants:
            res = pd.read_csv(f"./results-copied/scores/nonambiguous-contextual/mtgeneval/{lang}/{info['model_id']}/{subdir}/scores.csv")
            res["model"] = label
            res["lang"] = lang
            # Assigned by index alignment: assumes row i of the score file and row i
            # of the gender file describe the same sample — TODO confirm.
            res["src_gender"] = gender_info["gender"]
            res["type"] = info["type"]
            tot.append(res)

# `res` keeps the original column names; `data_all` uses lower-case score columns.
res = pd.concat(tot)
data_all = res.rename(columns={"score_Original": "score_original", "score_Flipped": "score_flipped"})



In [None]:

# Clean the raw scores, bring every model onto a [0, 1] scale, and keep only the
# samples that survive the cleaning for *every* model of a language.
#
# NOTE: this cell overwrites `data_all`. Re-run the loading cell before re-running
# this one, otherwise the LLM / MetricX scores are rescaled a second time.

final_df = list()
original_shapes = {}

for (lang, model), subdf in data_all.groupby(["lang", "model"]):
    # The shape of the first model seen for a language is used as the reference
    # size of that language (before any filtering).
    if lang not in original_shapes:
        original_shapes[lang] = subdf.shape

    filtered = subdf.dropna(subset=["score_original", "score_flipped"]).copy()

    # Read the model family from the unfiltered group: it is never empty, whereas
    # `filtered` can be empty when every score of a group is missing.
    model_type = subdf["type"].iloc[0]
    if model_type == "LLM":
        # Rescale scores of LLMs from 0-100 to 0-1.
        filtered["score_original"] = filtered["score_original"] / 100
        filtered["score_flipped"] = filtered["score_flipped"] / 100

        # Keep only rows whose two rescaled scores both lie in (0.1, 1].
        # NOTE(review): the lower bound is 0.1 (raw score 10), not 0, so low but
        # in-range scores are dropped as well — confirm this is intended.
        filtered = filtered.loc[
            (filtered["score_original"] > 0.1)
            & (filtered["score_original"] <= 1)
            & (filtered["score_flipped"] > 0.1)
            & (filtered["score_flipped"] <= 1)
        ]
    elif "MetricX" in model:
        # Map s -> 1 - s / 25. Presumably MetricX scores are errors in [0, 25]
        # (lower is better), so this yields higher-is-better values in [0, 1] —
        # TODO confirm.
        filtered["score_original"] = 1 - filtered["score_original"] / 25
        filtered["score_flipped"] = 1 - filtered["score_flipped"] / 25
    # No rescaling or filtering is applied to the other neural metrics.
    final_df.append(filtered)

data_all = pd.concat(final_df).copy()


# Compute the intersection of rows to be kept for each language: a row index is
# kept only if it is still present for every model of that language.
keep_rows_inter = {}
for lang, lang_df in data_all.groupby("lang"):
    common_rows = set(lang_df.index)
    for _, model_df in lang_df.groupby("model"):
        common_rows.intersection_update(set(model_df.index))
    # Sorted so the row order does not depend on set iteration order.
    keep_rows_inter[lang] = sorted(common_rows)


for lang in keep_rows_inter:
    print(f"Language: {lang}  # {original_shapes[lang][0]}  samples before taking intersection of models. ")
    print(f"Language: {lang} kept # {len(keep_rows_inter[lang])}  samples (after intersection between all models). ")

# Second pass: restrict every (language, model) group to the common rows and
# record the percentage of samples removed relative to the reference size.
final_df = []
removed_info = {lang: {} for lang in LANGS}

for (lang, model), subdf in data_all.groupby(["lang", "model"]):
    filtered = subdf.loc[keep_rows_inter[lang]]
    removed_info[lang][model] = 100 * (1 - filtered.shape[0] / original_shapes[lang][0])

    final_df.append(filtered)
data_all = pd.concat(final_df).copy()

# score_M / score_F: the original score when the source gender is "male" /
# "female" respectively, the flipped score otherwise (vectorised, no row-wise apply).
data_all["score_M"] = np.where(data_all["src_gender"] == "male", data_all["score_original"], data_all["score_flipped"])
data_all["score_F"] = np.where(data_all["src_gender"] == "female", data_all["score_original"], data_all["score_flipped"])

# Remove the GT-based models (the ones that use Google Translate for translating
# the context). This is done only for this part of the analysis.
data_all = data_all[~data_all['model'].str.contains('GT', case=True, na=False)]
data_all.head(3)


Language: ar  # 1100  samples before taking intersection of models. 
Language: ar kept # 904  samples (after intersection between all models). 
Language: de  # 1100  samples before taking intersection of models. 
Language: de kept # 913  samples (after intersection between all models). 
Language: es  # 1096  samples before taking intersection of models. 
Language: es kept # 1002  samples (after intersection between all models). 
Language: fr  # 1099  samples before taking intersection of models. 
Language: fr kept # 968  samples (after intersection between all models). 
Language: hi  # 1098  samples before taking intersection of models. 
Language: hi kept # 969  samples (after intersection between all models). 
Language: it  # 1094  samples before taking intersection of models. 
Language: it kept # 954  samples (after intersection between all models). 
Language: pt  # 1089  samples before taking intersection of models. 
Language: pt kept # 985  samples (after intersection between all m

Unnamed: 0,ID,context,source,reference_original,reference_flipped,score_original,score_flipped,model,lang,src_gender,type,score_M,score_F
2,2,"Slap-guitarist Miyavi, who has his roots in vi...",It was the most successful international tour ...,كانت من أنجح الرحلات الدولية التي نفذها فنان ي...,كانت من أنجح الرحلات الدولية التي نفذتها فنانة...,0.95,0.7,GPT 4--standard ctx,ar,male,LLM,0.95,0.7
3,3,Whittier was heavily influenced by the doctrin...,Whittier was first introduced to poetry by a t...,تعرّف ويتري على الشعر لأول مرة عن طريق معلم.,تعرّفت ويتري على الشعر لأول مرة عن طريق معلم.,0.95,0.85,GPT 4--standard ctx,ar,male,LLM,0.95,0.85
5,5,"He was later appointed as a Civil, Commercial ...",Guinle entered politics as Government Secretar...,دخل Guinle السياسة كسكرتير المحافظة لبلدية Com...,دخلت Guinle السياسة كسكرتير المحافظة لبلدية Co...,0.75,0.6,GPT 4--standard ctx,ar,male,LLM,0.75,0.6


In [28]:
# Turn the {lang: {model: percentage of samples removed}} mapping into a table
# with one row per model and one column per language, then display it.
removed_info = pd.DataFrame(removed_info)
removed_info

Unnamed: 0,it,es,de,pt,ar,fr,hi,ru
GPT 4--standard ctx,12.797075,8.576642,17.0,9.550046,17.818182,11.919927,11.748634,12.272727
Gemma 2 9B--paraphrased ctx,12.797075,8.576642,17.0,9.550046,17.818182,11.919927,11.748634,12.272727
Gemma 2 9B--standard ctx,12.797075,8.576642,17.0,9.550046,17.818182,11.919927,11.748634,12.272727
Kiwi 22--original ctx,12.797075,8.576642,17.0,9.550046,17.818182,11.919927,11.748634,12.272727
Kiwi 22--translated ctx--GT,12.797075,8.576642,17.0,9.550046,17.818182,11.919927,11.748634,12.272727
Kiwi 22--translated ctx--NLLB 3.3B,12.797075,8.576642,17.0,9.550046,17.818182,11.919927,11.748634,12.272727
Kiwi 23 XL--original ctx,12.797075,8.576642,17.0,9.550046,17.818182,11.919927,11.748634,12.272727
Kiwi 23 XL--translated ctx--GT,12.797075,8.576642,17.0,9.550046,17.818182,11.919927,11.748634,12.272727
Kiwi 23 XL--translated ctx--NLLB 3.3B,12.797075,8.576642,17.0,9.550046,17.818182,11.919927,11.748634,12.272727
Kiwi 23 XXL--original ctx,12.797075,8.576642,17.0,9.550046,17.818182,11.919927,11.748634,12.272727


Probability Analysis - Now we compute the continuous metrics along with their statistical significance!

In [31]:


# Per-(model, language) summary of the continuous scores: mean/std of the
# masculine and feminine scores, plus the ratio statistics returned by
# eval_metrics.Ratio.
stats_list = list()
for (model, lang), subdf in data_all.groupby(["model", "lang"]):

    # Ratio(...).score() returns the per-sample ratios, a p-value, and a flag.
    # Presumably the last two come from a one-sample test on the ratios — see
    # eval_metrics.Ratio for the exact definition.
    # NOTE(review): the null-check cell further down shows ratio_mean == inf for
    # some groups, which suggests zero-valued scores in a denominator — confirm.
    ratios, p_val_1samp_ratio, ratio_one_sample_test_less = eval_metrics.Ratio(
        {"F": subdf["score_F"], "M": subdf["score_M"]}
    ).score()

    d = dict(
        model=model,
        lang=lang,
        score_M_mean=subdf["score_M"].mean(),
        score_M_std=subdf["score_M"].std(),
        score_F_mean=subdf["score_F"].mean(),
        score_F_std=subdf["score_F"].std(),
        ratio_mean=ratios.mean(),
        ratio_std=ratios.std(),
        # 1-sample test results -- for ratio
        p_val_1samp_ratio=p_val_1samp_ratio,
        sign_1samp_ratio=ratio_one_sample_test_less,
    )

    stats_list.append(d)

# A list of dicts already yields a default 0..n-1 index, so no reset is needed.
stats_df = pd.DataFrame(stats_list)


  sqr = _ensure_numeric((avg - values) ** 2)


In [32]:
# Wide view of the statistics: one row per model, one column per
# (statistic, language) pair.
stat_columns = [
    "score_M_mean",
    "score_M_std",
    "score_F_mean",
    "score_F_std",
    "ratio_mean",
    "ratio_std",
    "p_val_1samp_ratio",
    "sign_1samp_ratio",
]
stats_df_pivot = stats_df.pivot_table(
    index=["model"],
    columns=["lang"],
    values=stat_columns,
)

In [33]:
# Show every row of stats_df that has at least one missing (None/NaN) value.
has_missing = stats_df.isnull().any(axis=1)
stats_df[has_missing]


Unnamed: 0,model,lang,score_M_mean,score_M_std,score_F_mean,score_F_std,ratio_mean,ratio_std,p_val_1samp_ratio,sign_1samp_ratio
96,MetricX23 LARGE--translated ctx--NLLB 3.3B,ar,0.389957,0.213707,0.387581,0.211442,inf,,,False
112,MetricX23 XL--translated ctx--NLLB 3.3B,ar,0.05337,0.176147,0.055079,0.17907,inf,,,False
114,MetricX23 XL--translated ctx--NLLB 3.3B,es,0.923873,0.069902,0.919592,0.071475,0.995855,0.033781,,False


In [35]:
# Write the probability-analysis statistics to CSV, creating the target folder
# first if it does not exist yet (`os` is imported in the first import cell).
stats_path = './results-copied/stats/nonambiguous-contextual/probability-analysis/results.csv'
os.makedirs(os.path.dirname(stats_path), exist_ok=True)
stats_df.to_csv(stats_path, index=False)


#### (Prediction-view) Hard binary selection analysis 
We compute the following for each model and language:
- Total error rate ER
- Error Rate for feminine and masculine sources; ER(S^F), ER(S^M)
- Error Rate Ratio Φ
- Statistical significance of Φ using bootstrap resampling

For the results aggregated across languages and for the tables of the paper, see make_figs_contextual_v2.ipynb.


In [36]:

# For every (model, language) group: overall accuracy plus the group-wise error
# rates and their bootstrap significance, as returned by eval_metrics.
# NOTE(review): no random seed is set in this notebook — check whether
# eval_metrics seeds the bootstrap itself.
metric_input_columns = {"F": "score_F", "M": "score_M", "y_true": "src_gender"}

acc_list = []
for (model, lang), subdf in data_all.groupby(["model", "lang"]):
    print(model, lang)

    # Each metric gets its own freshly built dict of numpy arrays.
    acc = eval_metrics.Accuracy(
        {key: np.array(subdf[col]) for key, col in metric_input_columns.items()}
    ).score()
    bootstrap = eval_metrics.GroupMetricsBootstraping(
        {key: np.array(subdf[col]) for key, col in metric_input_columns.items()},
        alternative="greater",
    )
    group_metrics = bootstrap.stat_significance_with_paired_bootstrap()
    results = group_metrics["results"]

    acc_list.append(
        {
            "model": model,
            "lang": lang,
            "acc_total": acc,
            "error_rate_total": results["total_error_rate"],
            "error_rate_male": results["male"]["fnr"],
            "error_rate_fem": results["female"]["fnr"],
            "error_rate_diff": results["fnr_diff"],
            "error_rate_ratio": results["fnr_ratio"],
            "stat_significance": group_metrics["stat_significance"],
            "p_value": group_metrics["p_value"],
        }
    )

acc_df = pd.DataFrame(acc_list)



GPT 4--standard ctx ar
GPT 4--standard ctx de
GPT 4--standard ctx es
GPT 4--standard ctx fr
GPT 4--standard ctx hi
GPT 4--standard ctx it
GPT 4--standard ctx pt
GPT 4--standard ctx ru
Gemma 2 9B--paraphrased ctx ar
Gemma 2 9B--paraphrased ctx de
Gemma 2 9B--paraphrased ctx es
Gemma 2 9B--paraphrased ctx fr
Gemma 2 9B--paraphrased ctx hi
Gemma 2 9B--paraphrased ctx it
Gemma 2 9B--paraphrased ctx pt
Gemma 2 9B--paraphrased ctx ru
Gemma 2 9B--standard ctx ar
Gemma 2 9B--standard ctx de
Gemma 2 9B--standard ctx es
Gemma 2 9B--standard ctx fr
Gemma 2 9B--standard ctx hi
Gemma 2 9B--standard ctx it
Gemma 2 9B--standard ctx pt
Gemma 2 9B--standard ctx ru
Kiwi 22--original ctx ar
Kiwi 22--original ctx de
Kiwi 22--original ctx es
Kiwi 22--original ctx fr
Kiwi 22--original ctx hi
Kiwi 22--original ctx it
Kiwi 22--original ctx pt
Kiwi 22--original ctx ru
Kiwi 22--translated ctx--NLLB 3.3B ar
Kiwi 22--translated ctx--NLLB 3.3B de
Kiwi 22--translated ctx--NLLB 3.3B es
Kiwi 22--translated ctx--NLLB 

In [37]:
# Per column: does acc_df contain any missing value?
acc_df.isna().any()

model                False
lang                 False
acc_total            False
error_rate_total     False
error_rate_male      False
error_rate_fem       False
error_rate_diff      False
error_rate_ratio     False
stat_significance    False
p_value              False
dtype: bool

In [38]:
# Wide view of the prediction-analysis results: one row per model, one column
# per (statistic, language) pair.
acc_value_columns = [
    "acc_total",
    "error_rate_total",
    "error_rate_male",
    "error_rate_fem",
    "error_rate_diff",
    "error_rate_ratio",
    "stat_significance",
    "p_value",
]
acc_df_pivot = acc_df.pivot_table(
    index=["model"],
    columns=["lang"],
    values=acc_value_columns,
)

# Save the long-format results, creating the target folder if needed.
acc_df_path = './results-copied/stats/nonambiguous-contextual/prediction-analysis/results.csv'
os.makedirs(os.path.dirname(acc_df_path), exist_ok=True)
acc_df.to_csv(acc_df_path, index=False)


### Measure ties
Measure the percentage of ties (samples whose masculine and feminine scores are identical) for the contextual non-ambiguous case. These results are used to create Table 10 of the paper; for the table creation, see make_figs_contextual_v2.ipynb.

In [39]:
# Percentage of samples, per (language, model), whose masculine and feminine
# scores are exactly equal.
tie_rows = []
for (lang, model), subdf in data_all.groupby(["lang", "model"]):
    n_ties = (subdf["score_M"] == subdf["score_F"]).sum()
    tie_rows.append(
        {
            "lang": lang,
            "model": model,
            "type": subdf.iloc[0]["type"],
            "ties": 100 * n_ties / len(subdf),
        }
    )
ties = pd.DataFrame(tie_rows)
ties

Unnamed: 0,lang,model,type,ties
0,ar,GPT 4--standard ctx,LLM,44.800885
1,ar,Gemma 2 9B--paraphrased ctx,LLM,55.420354
2,ar,Gemma 2 9B--standard ctx,LLM,53.429204
3,ar,Kiwi 22--original ctx,neural_metric,0.000000
4,ar,Kiwi 22--translated ctx--NLLB 3.3B,neural_metric,0.000000
...,...,...,...,...
163,ru,Mistral 7B--standard ctx,LLM,85.492228
164,ru,xCOMET XL--original ctx,neural_metric,0.207254
165,ru,xCOMET XL--translated ctx--NLLB 3.3B,neural_metric,0.103627
166,ru,xCOMET XXL--original ctx,neural_metric,0.518135


In [40]:
# Write the tie percentages to CSV, creating the target folder first if it does
# not exist yet (`os` is imported in the first import cell).
ties_path_non_amb = './results-copied/stats/nonambiguous-contextual/ties/results.csv'
os.makedirs(os.path.dirname(ties_path_non_amb), exist_ok=True)
ties.to_csv(ties_path_non_amb, index=False)

#### Comparison Between BIL models:
This section creates the results behind Figure 9 of the paper, which compares the models whose context was translated by NLLB with those whose context was translated by Google Translate (GT). For the figure itself, see make_figs_contextual_v2.ipynb.

In [41]:
# Reload the raw per-sample scores for every (language, model, context variant).
# This repeats the first loading cell because `data_all` was cleaned and filtered
# in place by the cells in between (the GT variants were dropped there).

# Display name -> result-folder id and model family ("neural_metric" or "LLM").
MODELS = {
    "Kiwi 22": {"model_id": "Unbabel--wmt22-cometkiwi-da", "type": "neural_metric"},
    "Kiwi 23 XL": {"model_id": "Unbabel--wmt23-cometkiwi-da-xl", "type": "neural_metric"},
    "Kiwi 23 XXL": {"model_id": "Unbabel--wmt23-cometkiwi-da-xxl", "type": "neural_metric"},
    "xCOMET XL": {"model_id": "Unbabel--XCOMET-XL", "type": "neural_metric"},
    "xCOMET XXL": {"model_id": "Unbabel--XCOMET-XXL", "type": "neural_metric"},
    "MetricX23 LARGE": {"model_id": "google--metricx-23-qe-large-v2p0", "type": "neural_metric"},
    "MetricX23 XL": {"model_id": "google--metricx-23-qe-xl-v2p0", "type": "neural_metric"},
    "Llama 3.1 70B": {"model_id": "meta-llama--Meta-Llama-3.1-70B-Instruct", "type": "LLM"},
    "Mistral 7B": {"model_id": "mistralai--Mistral-7B-Instruct-v0.2", "type": "LLM"},
    "Gemma 2 9B": {"model_id": "google--gemma-2-9b-it", "type": "LLM"},
    "GPT 4": {"model_id": "gpt-4o-2024-05-13", "type": "LLM"},
}

LANGS = ["it", "es", "de", "pt", "ar", "fr", "hi", "ru"]

tot = list()

# How the context is used in the hypothesis (neural metrics):
# - with-original-contexts   -- the source context is used (prepended) in the hypothesis
# - with-translated-contexts -- the source context translated into the target
#                               language is used (prepended) in the hypothesis
metrics_context_options = {"with-original-contexts": "original ctx", "with-translated-contexts": "translated ctx"}
# MT system that produced the translated context: folder name -> display name.
translation_model_dict = {"facebook--nllb-200-3.3B": "NLLB 3.3B", "google-translate": "GT"}
# Prompt variants available for LLMs: folder name -> display name.
llms_ctx_options = {"standard": "standard ctx", "paraphrased": "paraphrased ctx"}

for lang in LANGS:
    # Per-sample source gender for this language pair (tab-separated file).
    gender_info = pd.read_csv(f"./data/mtgeneval/context_genders/geneval-context-en{lang}-genders.txt", sep="\t")
    for m, info in MODELS.items():
        # Collect (display label, sub-directory) for every score file of this model,
        # in the order in which the files are loaded.
        if info["type"] == "neural_metric":
            variants = []
            for ctx_v, ctx_n in metrics_context_options.items():
                if ctx_v == "with-translated-contexts":
                    # One score file per translation system.
                    for trans_v, trans_m in translation_model_dict.items():
                        variants.append((m + "--" + ctx_n + "--" + trans_m, f"{ctx_v}/{trans_v}"))
                else:
                    variants.append((m + "--" + ctx_n, ctx_v))
        elif info["type"] == "LLM":
            variants = [
                (m + "--" + ctx_n, ctx_v)
                for ctx_v, ctx_n in llms_ctx_options.items()
                # The paraphrased variant was not run for GPT 4, so it is skipped.
                if not (m == "GPT 4" and ctx_n == "paraphrased ctx")
            ]
        else:
            # An explicit exception (instead of `assert False`) still fires when
            # Python runs with assertions disabled (-O).
            raise ValueError(f"Unknown model type {info['type']!r} for model {m!r}")

        for label, subdir in variants:
            res = pd.read_csv(f"./results-copied/scores/nonambiguous-contextual/mtgeneval/{lang}/{info['model_id']}/{subdir}/scores.csv")
            res["model"] = label
            res["lang"] = lang
            # Assigned by index alignment: assumes row i of the score file and row i
            # of the gender file describe the same sample — TODO confirm.
            res["src_gender"] = gender_info["gender"]
            res["type"] = info["type"]
            tot.append(res)

# `res` keeps the original column names; `data_all` uses lower-case score columns.
res = pd.concat(tot)
data_all = res.rename(columns={"score_Original": "score_original", "score_Flipped": "score_flipped"})



In [42]:

# Clean the reloaded scores, bring every model onto a [0, 1] scale, keep only the
# samples that survive the cleaning for *every* model of a language, and finally
# keep only the neural metrics that use a translated context.
#
# NOTE: this cell overwrites `data_all`. Re-run the loading cell before re-running
# this one, otherwise the LLM / MetricX scores are rescaled a second time.

final_df = list()
original_shapes = {}

for (lang, model), subdf in data_all.groupby(["lang", "model"]):
    # The shape of the first model seen for a language is used as the reference
    # size of that language (before any filtering).
    if lang not in original_shapes:
        original_shapes[lang] = subdf.shape

    filtered = subdf.dropna(subset=["score_original", "score_flipped"]).copy()

    # Read the model family from the unfiltered group: it is never empty, whereas
    # `filtered` can be empty when every score of a group is missing.
    model_type = subdf["type"].iloc[0]
    if model_type == "LLM":
        # Rescale scores of LLMs from 0-100 to 0-1.
        filtered["score_original"] = filtered["score_original"] / 100
        filtered["score_flipped"] = filtered["score_flipped"] / 100

        # Keep only rows whose two rescaled scores both lie in (0.1, 1].
        # NOTE(review): the lower bound is 0.1 (raw score 10), not 0, so low but
        # in-range scores are dropped as well — confirm this is intended.
        filtered = filtered.loc[
            (filtered["score_original"] > 0.1)
            & (filtered["score_original"] <= 1)
            & (filtered["score_flipped"] > 0.1)
            & (filtered["score_flipped"] <= 1)
        ]
    elif "MetricX" in model:
        # Map s -> 1 - s / 25. Presumably MetricX scores are errors in [0, 25]
        # (lower is better), so this yields higher-is-better values in [0, 1] —
        # TODO confirm.
        filtered["score_original"] = 1 - filtered["score_original"] / 25
        filtered["score_flipped"] = 1 - filtered["score_flipped"] / 25
    # No rescaling or filtering is applied to the other neural metrics.
    final_df.append(filtered)

data_all = pd.concat(final_df).copy()


# Compute the intersection of rows to be kept for each language: a row index is
# kept only if it is still present for every model of that language.
keep_rows_inter = {}
for lang, lang_df in data_all.groupby("lang"):
    common_rows = set(lang_df.index)
    for _, model_df in lang_df.groupby("model"):
        common_rows.intersection_update(set(model_df.index))
    # Sorted so the row order does not depend on set iteration order.
    keep_rows_inter[lang] = sorted(common_rows)


for lang in keep_rows_inter:
    print(f"Language: {lang}  # {original_shapes[lang][0]}  samples before taking intersection of models. ")
    print(f"Language: {lang} kept # {len(keep_rows_inter[lang])}  samples (after intersection between all models). ")

# Second pass: restrict every (language, model) group to the common rows and
# record the percentage of samples removed relative to the reference size.
final_df = []
removed_info = {lang: {} for lang in LANGS}

for (lang, model), subdf in data_all.groupby(["lang", "model"]):
    filtered = subdf.loc[keep_rows_inter[lang]]
    removed_info[lang][model] = 100 * (1 - filtered.shape[0] / original_shapes[lang][0])

    final_df.append(filtered)
data_all = pd.concat(final_df).copy()

# score_M / score_F: the original score when the source gender is "male" /
# "female" respectively, the flipped score otherwise (vectorised, no row-wise apply).
data_all["score_M"] = np.where(data_all["src_gender"] == "male", data_all["score_original"], data_all["score_flipped"])
data_all["score_F"] = np.where(data_all["src_gender"] == "female", data_all["score_original"], data_all["score_flipped"])


# Drop the "original ctx" variants and all LLMs, so that only the neural metrics
# with a translated context (NLLB or GT) remain.
data_all = data_all[~data_all['model'].str.contains('original ctx', case=True, na=False)]
data_all = data_all[~data_all['type'].str.contains('LLM', case=True, na=False)]

data_all.head(3)


Language: ar  # 1100  samples before taking intersection of models. 
Language: ar kept # 904  samples (after intersection between all models). 
Language: de  # 1100  samples before taking intersection of models. 
Language: de kept # 913  samples (after intersection between all models). 
Language: es  # 1096  samples before taking intersection of models. 
Language: es kept # 1002  samples (after intersection between all models). 
Language: fr  # 1099  samples before taking intersection of models. 
Language: fr kept # 968  samples (after intersection between all models). 
Language: hi  # 1098  samples before taking intersection of models. 
Language: hi kept # 969  samples (after intersection between all models). 
Language: it  # 1094  samples before taking intersection of models. 
Language: it kept # 954  samples (after intersection between all models). 
Language: pt  # 1089  samples before taking intersection of models. 
Language: pt kept # 985  samples (after intersection between all m

Unnamed: 0,ID,context,source,reference_original,reference_flipped,score_original,score_flipped,model,lang,src_gender,type,score_M,score_F
2,2,"Slap-guitarist Miyavi, who has his roots in vi...",It was the most successful international tour ...,كانت من أنجح الرحلات الدولية التي نفذها فنان ي...,كانت من أنجح الرحلات الدولية التي نفذتها فنانة...,0.807939,0.806999,Kiwi 22--translated ctx--GT,ar,male,neural_metric,0.807939,0.806999
3,3,Whittier was heavily influenced by the doctrin...,Whittier was first introduced to poetry by a t...,تعرّف ويتري على الشعر لأول مرة عن طريق معلم.,تعرّفت ويتري على الشعر لأول مرة عن طريق معلم.,0.844404,0.827781,Kiwi 22--translated ctx--GT,ar,male,neural_metric,0.844404,0.827781
5,5,"He was later appointed as a Civil, Commercial ...",Guinle entered politics as Government Secretar...,دخل Guinle السياسة كسكرتير المحافظة لبلدية Com...,دخلت Guinle السياسة كسكرتير المحافظة لبلدية Co...,0.776866,0.764903,Kiwi 22--translated ctx--GT,ar,male,neural_metric,0.776866,0.764903


In [43]:
# Sanity check: list the model variants that remain after the filtering above.
data_all["model"].unique()

array(['Kiwi 22--translated ctx--GT',
       'Kiwi 22--translated ctx--NLLB 3.3B',
       'Kiwi 23 XL--translated ctx--GT',
       'Kiwi 23 XL--translated ctx--NLLB 3.3B',
       'Kiwi 23 XXL--translated ctx--GT',
       'Kiwi 23 XXL--translated ctx--NLLB 3.3B',
       'MetricX23 LARGE--translated ctx--GT',
       'MetricX23 LARGE--translated ctx--NLLB 3.3B',
       'MetricX23 XL--translated ctx--GT',
       'MetricX23 XL--translated ctx--NLLB 3.3B',
       'xCOMET XL--translated ctx--GT',
       'xCOMET XL--translated ctx--NLLB 3.3B',
       'xCOMET XXL--translated ctx--GT',
       'xCOMET XXL--translated ctx--NLLB 3.3B'], dtype=object)

In [44]:

# Same prediction analysis as for the main results, restricted to the
# translated-context metrics: overall accuracy plus group-wise error rates and
# their bootstrap significance for every (model, language) group.
# NOTE(review): no random seed is set in this notebook — check whether
# eval_metrics seeds the bootstrap itself.
metric_input_columns = {"F": "score_F", "M": "score_M", "y_true": "src_gender"}

acc_list2 = []
for (model, lang), subdf in data_all.groupby(["model", "lang"]):
    print(model, lang)

    # Each metric gets its own freshly built dict of numpy arrays.
    acc = eval_metrics.Accuracy(
        {key: np.array(subdf[col]) for key, col in metric_input_columns.items()}
    ).score()
    bootstrap = eval_metrics.GroupMetricsBootstraping(
        {key: np.array(subdf[col]) for key, col in metric_input_columns.items()},
        alternative="greater",
    )
    group_metrics = bootstrap.stat_significance_with_paired_bootstrap()
    results = group_metrics["results"]

    acc_list2.append(
        {
            "model": model,
            "lang": lang,
            "acc_total": acc,
            "error_rate_total": results["total_error_rate"],
            "error_rate_male": results["male"]["fnr"],
            "error_rate_fem": results["female"]["fnr"],
            "error_rate_diff": results["fnr_diff"],
            "error_rate_ratio": results["fnr_ratio"],
            "stat_significance": group_metrics["stat_significance"],
            "p_value": group_metrics["p_value"],
        }
    )

acc_df2 = pd.DataFrame(acc_list2)



Kiwi 22--translated ctx--GT ar
Kiwi 22--translated ctx--GT de
Kiwi 22--translated ctx--GT es
Kiwi 22--translated ctx--GT fr
Kiwi 22--translated ctx--GT hi
Kiwi 22--translated ctx--GT it
Kiwi 22--translated ctx--GT pt
Kiwi 22--translated ctx--GT ru
Kiwi 22--translated ctx--NLLB 3.3B ar
Kiwi 22--translated ctx--NLLB 3.3B de
Kiwi 22--translated ctx--NLLB 3.3B es
Kiwi 22--translated ctx--NLLB 3.3B fr
Kiwi 22--translated ctx--NLLB 3.3B hi
Kiwi 22--translated ctx--NLLB 3.3B it
Kiwi 22--translated ctx--NLLB 3.3B pt
Kiwi 22--translated ctx--NLLB 3.3B ru
Kiwi 23 XL--translated ctx--GT ar
Kiwi 23 XL--translated ctx--GT de
Kiwi 23 XL--translated ctx--GT es
Kiwi 23 XL--translated ctx--GT fr
Kiwi 23 XL--translated ctx--GT hi
Kiwi 23 XL--translated ctx--GT it
Kiwi 23 XL--translated ctx--GT pt
Kiwi 23 XL--translated ctx--GT ru
Kiwi 23 XL--translated ctx--NLLB 3.3B ar
Kiwi 23 XL--translated ctx--NLLB 3.3B de
Kiwi 23 XL--translated ctx--NLLB 3.3B es
Kiwi 23 XL--translated ctx--NLLB 3.3B fr
Kiwi 23 XL--

In [45]:
# Write the translated-context comparison results to CSV, creating the target
# folder first if it does not exist yet.
acc_df_path = './results-copied/stats/nonambiguous-contextual/translated_ctx_comparison/results.csv'
os.makedirs(os.path.dirname(acc_df_path), exist_ok=True)
acc_df2.to_csv(acc_df_path, index=False)