In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from src import eval_metrics

# Global plotting look: seaborn "paper" context on a white-grid style.
sns.set_theme(context="paper", style="whitegrid")

# Draw a light, thin, dashed grid on every axes.
plt.rcParams.update(
    {
        "axes.grid": True,
        "grid.color": "lightgray",
        "grid.linestyle": "--",
        "grid.linewidth": 0.5,
    }
)

# Keep matplotlib's built-in text rendering (no external LaTeX needed).
plt.rcParams["text.usetex"] = False

### Analysis on Reference Translations
This part of the notebook computes and saves the score ratios and error rates on the original counterfactual subset, for all the metrics examined in the paper.


For the results aggregated over all languages, together with the figures and tables, see `make_figs_counterfactual.ipynb`.

In [11]:

# Display name -> {"model_id": name of the directory holding that model's scores,
#                  "type": "neural_metric" or "LLM"}.
MODELS = {
    "Kiwi 22": dict(model_id="Unbabel--wmt22-cometkiwi-da", type="neural_metric"),
    "Kiwi 23 XL": dict(model_id="Unbabel--wmt23-cometkiwi-da-xl", type="neural_metric"),
    "Kiwi 23 XXL": dict(model_id="Unbabel--wmt23-cometkiwi-da-xxl", type="neural_metric"),
    "xCOMET XL": dict(model_id="Unbabel--XCOMET-XL", type="neural_metric"),
    "xCOMET XXL": dict(model_id="Unbabel--XCOMET-XXL", type="neural_metric"),
    "MetricX23 LARGE": dict(model_id="google--metricx-23-qe-large-v2p0", type="neural_metric"),
    "MetricX23 XL": dict(model_id="google--metricx-23-qe-xl-v2p0", type="neural_metric"),
    "Llama 3.1 70B": dict(model_id="meta-llama--Meta-Llama-3.1-70B-Instruct", type="LLM"),
    "Mistral 7B": dict(model_id="mistralai--Mistral-7B-Instruct-v0.2", type="LLM"),
    "Gemma 2 9B": dict(model_id="google--gemma-2-9b-it", type="LLM"),
    "GPT 4": dict(model_id="gpt-4o-2024-05-13", type="LLM"),
}

# Target languages; one scores.csv is read per (language, model) pair.
LANGS = ["it", "es", "de", "pt", "ar", "fr", "hi", "ru"]

# Raw score columns -> short names used by the rest of the notebook.
ref_rename_dict = {
    "score_feminine_feminine": "ref_scores_ff",
    "score_feminine_masculine": "ref_scores_fm",
    "score_masculine_masculine": "ref_scores_mm",
    "score_masculine_feminine": "ref_scores_mf",
}


def _load_reference_scores(lang, model_name, info):
    """Read one model's reference scores for one language and tag the rows.

    Adds the constant columns ``model``, ``lang`` and ``type`` and renames the
    four raw score columns according to ``ref_rename_dict``.
    """
    scores = pd.read_csv(f"./results-copied/scores/nonambiguous-counterfactual/mtgeneval/references/{lang}/{info['model_id']}/scores.csv")
    scores["model"] = model_name
    scores["lang"] = lang
    scores["type"] = info["type"]
    return scores.rename(ref_rename_dict, axis=1).copy()


# Load every (language, model) file; the per-file row index is kept on purpose
# (no ignore_index) because later cells match rows across models by index label.
tot = [
    _load_reference_scores(lang, model_name, info)
    for lang in LANGS
    for model_name, info in MODELS.items()
]

res = pd.concat(tot)
data_all = res

data_all.head(3)



Unnamed: 0,ID,source_feminine,reference_feminine,source_masculine,reference_masculine,ref_scores_ff,ref_scores_fm,ref_scores_mf,ref_scores_mm,model,lang,type
0,0,Shelton became one of Fort Worth's first femal...,Shelton divenne una delle prime sviluppatrici ...,Shelton became one of Fort Worth's first male ...,Shelton divenne uno dei primi sviluppatori imm...,0.866541,0.759603,0.695609,0.844132,Kiwi 22,it,neural_metric
1,1,"My mom played for Leinster, my aunt played for...","Mia madre, mia zia, mio cugina hanno giocato n...","My dad played for Leinster, my uncle played fo...","Mio padre, mio zio, mio cugino hanno giocato n...",0.871331,0.865184,0.863381,0.882993,Kiwi 22,it,neural_metric
2,2,"In 1512, as papal legate for Italy and Germany...","Nel 1512, come legato pontificio per l'Italia ...","In 1512, as papal legate for Italy and Germany...","Nel 1512, come legato pontificio per l'Italia ...",0.879873,0.817564,0.823342,0.881579,Kiwi 22,it,neural_metric


In [12]:

# The four reference-score columns created by the loading cell.
# NOTE(review): the two-letter suffix presumably encodes (source gender,
# reference gender) — confirm against the scoring script.
ref_score_cols = ["ref_scores_ff", "ref_scores_fm", "ref_scores_mf", "ref_scores_mm"]

final_df = list()
original_shapes = {}

# ---------------------------------------------------------------------------
# Pass 1: clean and rescale every (language, model) group.
#
# We iterate over `res` — the untouched frame from the loading cell, i.e. the
# very object `data_all` is bound to when this cell first runs — instead of
# `data_all`, which this cell overwrites. That keeps the cell idempotent:
# re-running it no longer rescales already-rescaled scores.
# ---------------------------------------------------------------------------
for (lang, model), subdf in res.groupby(["lang", "model"]):

    # Remember the unfiltered size once per language (taken from the first
    # model encountered for that language).
    if lang not in original_shapes:
        original_shapes[lang] = subdf.shape

    # Drop rows where any of the four scores is missing.
    filtered = subdf.dropna(subset=ref_score_cols).copy()

    # Read the metric family from the unfiltered group: `filtered` may be
    # empty after dropna, in which case `filtered.iloc[0]` would raise.
    metric_type = subdf["type"].iloc[0]

    if metric_type == "LLM":
        # Rescale scores of LLMs from 0-100 to 0-1.
        filtered[ref_score_cols] = filtered[ref_score_cols] / 100

        # Keep only rows whose four rescaled scores all lie in (0.1, 1].
        # NOTE(review): the lower bound is 0.1, not 0, so very low scores are
        # discarded too — presumably on purpose; confirm.
        rescaled = filtered[ref_score_cols]
        filtered = filtered.loc[((rescaled > 0.1) & (rescaled <= 1)).all(axis=1)]
    elif "MetricX" in model:
        # Map s -> 1 - s / 25.
        # NOTE(review): presumably MetricX scores are errors in [0, 25] where
        # lower is better — confirm.
        filtered[ref_score_cols] = 1 - filtered[ref_score_cols] / 25
    # else: no rescaling or filtering needed for the other neural metrics.

    final_df.append(filtered)

cleaned = pd.concat(final_df).copy()


# ---------------------------------------------------------------------------
# Per language, keep only the row labels that survived for *every* model.
# Sorting makes the row order of the final frame deterministic.
# ---------------------------------------------------------------------------
keep_rows_inter = {}
for lang, lang_df in cleaned.groupby("lang"):
    rows_per_model = [set(model_df.index) for _, model_df in lang_df.groupby("model")]
    keep_rows_inter[lang] = sorted(set.intersection(*rows_per_model))

for lang in keep_rows_inter:
    print(f"Language: {lang}  # {original_shapes[lang][0]}  samples before taking intersection of models. ")
    print(f"Language: {lang} kept # {len(keep_rows_inter[lang])}  samples (after intersection between all models). ")


# ---------------------------------------------------------------------------
# Pass 2: restrict every (language, model) group to the shared rows and record
# the percentage of samples removed relative to the unfiltered size.
# ---------------------------------------------------------------------------
final_df = []
removed_info = {lang: {} for lang in LANGS}

for (lang, model), subdf in cleaned.groupby(["lang", "model"]):
    filtered = subdf.loc[keep_rows_inter[lang]].copy()
    removed_info[lang][model] = 100 * (1 - filtered.shape[0] / original_shapes[lang][0])

    final_df.append(filtered)
data_all = pd.concat(final_df).copy()


Language: ar  # 300  samples before taking intersection of models. 
Language: ar kept # 229  samples (after intersection between all models). 
Language: de  # 300  samples before taking intersection of models. 
Language: de kept # 222  samples (after intersection between all models). 
Language: es  # 300  samples before taking intersection of models. 
Language: es kept # 240  samples (after intersection between all models). 
Language: fr  # 300  samples before taking intersection of models. 
Language: fr kept # 255  samples (after intersection between all models). 
Language: hi  # 300  samples before taking intersection of models. 
Language: hi kept # 271  samples (after intersection between all models). 
Language: it  # 300  samples before taking intersection of models. 
Language: it kept # 243  samples (after intersection between all models). 
Language: pt  # 300  samples before taking intersection of models. 
Language: pt kept # 221  samples (after intersection between all models). 

In [13]:
# Turn the nested {lang: {model: percentage removed}} dict into a table
# (rows = models, columns = languages) and display it.
removed_info = pd.DataFrame(removed_info)
removed_info

Unnamed: 0,it,es,de,pt,ar,fr,hi,ru
GPT 4,19.0,20.0,26.0,26.333333,23.666667,15.0,9.666667,21.333333
Gemma 2 9B,19.0,20.0,26.0,26.333333,23.666667,15.0,9.666667,21.333333
Kiwi 22,19.0,20.0,26.0,26.333333,23.666667,15.0,9.666667,21.333333
Kiwi 23 XL,19.0,20.0,26.0,26.333333,23.666667,15.0,9.666667,21.333333
Kiwi 23 XXL,19.0,20.0,26.0,26.333333,23.666667,15.0,9.666667,21.333333
Llama 3.1 70B,19.0,20.0,26.0,26.333333,23.666667,15.0,9.666667,21.333333
MetricX23 LARGE,19.0,20.0,26.0,26.333333,23.666667,15.0,9.666667,21.333333
MetricX23 XL,19.0,20.0,26.0,26.333333,23.666667,15.0,9.666667,21.333333
Mistral 7B,19.0,20.0,26.0,26.333333,23.666667,15.0,9.666667,21.333333
xCOMET XL,19.0,20.0,26.0,26.333333,23.666667,15.0,9.666667,21.333333


Measuring Ties 

In [16]:
# Percentage of exactly equal score pairs, per (language, model).
tie_records = []
for (lang, model), group in data_all.groupby(["lang", "model"]):
    # Stack the scores so that position i compares ff[i] with fm[i] and
    # position n + i compares mf[i] with mm[i].
    F_scores = pd.concat((group["ref_scores_ff"], group["ref_scores_mf"]), axis=0).to_numpy()
    M_scores = pd.concat((group["ref_scores_fm"], group["ref_scores_mm"]), axis=0).to_numpy()

    n_equal = (F_scores == M_scores).sum()
    tie_records.append(
        {
            "lang": lang,
            "model": model,
            "type": group.iloc[0]["type"],
            "ties": 100 * n_equal / len(F_scores),
        }
    )

ties = pd.DataFrame(tie_records)

# Mean and standard deviation of the tie percentage across languages, one row per model.
ties_aggr = ties.pivot_table(index="model", values=["ties"], aggfunc=["mean", "std"])
ties_aggr

Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,ties,ties
model,Unnamed: 1_level_2,Unnamed: 2_level_2
GPT 4,13.750863,7.220292
Gemma 2 9B,25.84597,10.795979
Kiwi 22,2.955738,4.414397
Kiwi 23 XL,2.786005,4.1444
Kiwi 23 XXL,2.9755,4.512605
Llama 3.1 70B,29.601846,9.800344
MetricX23 LARGE,3.292546,4.98241
MetricX23 XL,4.604977,5.915331
Mistral 7B,73.142166,10.665658
xCOMET XL,3.377789,4.444919


#### Analysis using continuous metrics:
  - QE(s_F, h_F) / QE(s_M, h_M) (mean and standard deviation)
  - statistical significance of the ratio (using a one-sample ratio test)

In [17]:


# Continuous (score-based) comparison of the FF and MM reference scores,
# one summary row per (model, language).
stats_list = []
for (model, lang), group in data_all.groupby(["model", "lang"]):
    scores_ff = group["ref_scores_ff"]
    scores_mm = group["ref_scores_mm"]

    # `Ratio(...).score()` returns three values; the first is summarised with
    # mean/std below and the third is stored as the significance result.
    ratios, _, ratio_test = eval_metrics.Ratio({"F": scores_ff, "M": scores_mm}).score()
    print(model, lang, group.shape)

    stats_list.append(
        {
            "model": model,
            "lang": lang,
            "ref_score_M_mean": scores_mm.mean(),
            "ref_score_F_mean": scores_ff.mean(),
            "ref_score_M_std": scores_mm.std(),
            "ref_score_F_std": scores_ff.std(),
            "ref_ratio_mean": ratios.mean(),
            "ref_ratio_std": ratios.std(),
            "ref_significance_ratio": ratio_test,
        }
    )

stats_df = pd.DataFrame(stats_list)

# Report, per column, whether any value is missing.
print(stats_df.isnull().any())


GPT 4 ar (229, 12)
GPT 4 de (222, 12)
GPT 4 es (240, 12)
GPT 4 fr (255, 12)
GPT 4 hi (271, 12)
GPT 4 it (243, 12)
GPT 4 pt (221, 12)
GPT 4 ru (236, 12)
Gemma 2 9B ar (229, 12)
Gemma 2 9B de (222, 12)
Gemma 2 9B es (240, 12)
Gemma 2 9B fr (255, 12)
Gemma 2 9B hi (271, 12)
Gemma 2 9B it (243, 12)
Gemma 2 9B pt (221, 12)
Gemma 2 9B ru (236, 12)
Kiwi 22 ar (229, 12)
Kiwi 22 de (222, 12)
Kiwi 22 es (240, 12)
Kiwi 22 fr (255, 12)
Kiwi 22 hi (271, 12)
Kiwi 22 it (243, 12)
Kiwi 22 pt (221, 12)
Kiwi 22 ru (236, 12)
Kiwi 23 XL ar (229, 12)
Kiwi 23 XL de (222, 12)
Kiwi 23 XL es (240, 12)
Kiwi 23 XL fr (255, 12)
Kiwi 23 XL hi (271, 12)
Kiwi 23 XL it (243, 12)
Kiwi 23 XL pt (221, 12)
Kiwi 23 XL ru (236, 12)
Kiwi 23 XXL ar (229, 12)
Kiwi 23 XXL de (222, 12)
Kiwi 23 XXL es (240, 12)
Kiwi 23 XXL fr (255, 12)
Kiwi 23 XXL hi (271, 12)
Kiwi 23 XXL it (243, 12)
Kiwi 23 XXL pt (221, 12)
Kiwi 23 XXL ru (236, 12)
Llama 3.1 70B ar (229, 12)
Llama 3.1 70B de (222, 12)
Llama 3.1 70B es (240, 12)
Llama 3.1 70B f

Save the computed metrics together with the statistical-significance results.

In [19]:
# Persist the continuous-analysis results: the full per-(model, language)
# table and a model x language table of the ratio significance results.
stats_path = './results-copied/stats/nonambiguous-counterfactual/references/continuous-analysis/results.csv'
ratio_test_path= './results-copied/stats/nonambiguous-counterfactual/references/continuous-analysis/significance_test_ratio.csv'

# Make sure the output directories exist before writing.
for out_path in (stats_path, ratio_test_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

stats_df.to_csv(stats_path, index=False)

# One row per model, one column per language.
significance_test_ratio_QE = stats_df.pivot_table(
    index=["model"],
    columns=["lang"],
    values=["ref_significance_ratio"],
)
# NOTE(review): index=False drops the pivot's `model` index, so the rows of
# the written file carry no model label. Kept as-is because a downstream
# notebook may rely on this layout — confirm before changing.
significance_test_ratio_QE.to_csv(ratio_test_path, index=False)



### Prediction-based Analysis
In this part we compute the results of the prediction-based analysis (per language):
- Total error rate
- ER(S^F) and ER(S^M)
- Ratio Φ = ER(S^F) / ER(S^M)
- Statistical significance of ER(S^F) / ER(S^M), per language (using bootstrap resampling)

In [20]:
# Show the columns available in the filtered frame before the prediction-based analysis.
data_all.columns

Index(['ID', 'source_feminine', 'reference_feminine', 'source_masculine',
       'reference_masculine', 'ref_scores_ff', 'ref_scores_fm',
       'ref_scores_mf', 'ref_scores_mm', 'model', 'lang', 'type'],
      dtype='object')

In [21]:

# Prediction-based analysis: accuracy and group error rates per (model, language).
prediction_list = []
for (model, lang), group in data_all.groupby(["model", "lang"]):
    print(lang, model, group.shape)

    # Stack the scores so that position i pairs ff[i] with fm[i] and
    # position n + i pairs mf[i] with mm[i].
    F_scores = pd.concat((group["ref_scores_ff"], group["ref_scores_mf"]), axis=0).to_numpy()
    M_scores = pd.concat((group["ref_scores_fm"], group["ref_scores_mm"]), axis=0).to_numpy()

    # Labels aligned with the stacked scores: "female" for the first half,
    # "male" for the second half.
    n_female = group["ref_scores_ff"].shape[0]
    n_male = group["ref_scores_mm"].shape[0]
    ground_truth = np.array(["female"] * n_female + ["male"] * n_male)

    metric_inputs = {"F": F_scores, "M": M_scores, "y_true": ground_truth}
    accuracy = eval_metrics.Accuracy(metric_inputs).score()
    # NOTE(review): bootstrap resampling is stochastic; whether the helper
    # fixes a seed cannot be told from this notebook — confirm.
    bootstrap = eval_metrics.GroupMetricsBootstraping(
        {"F": F_scores, "M": M_scores, "y_true": ground_truth},
        alternative="greater",
    ).stat_significance_with_paired_bootstrap()
    group_results = bootstrap["results"]

    prediction_list.append(
        {
            "model": model,
            "lang": lang,
            "ref_acc_total": accuracy,
            "ref_error_rate_total": group_results["total_error_rate"],
            "ref_error_rate_male": group_results["male"]["fnr"],
            "ref_error_rate_fem": group_results["female"]["fnr"],
            "ref_error_rate_ratio": group_results["fnr_ratio"],
            "ref_stat_significance": bootstrap["stat_significance"],
        }
    )

prediction_df = pd.DataFrame(prediction_list)



ar GPT 4 (229, 12)
de GPT 4 (222, 12)
es GPT 4 (240, 12)
fr GPT 4 (255, 12)
hi GPT 4 (271, 12)
it GPT 4 (243, 12)
pt GPT 4 (221, 12)
ru GPT 4 (236, 12)
ar Gemma 2 9B (229, 12)
de Gemma 2 9B (222, 12)
es Gemma 2 9B (240, 12)
fr Gemma 2 9B (255, 12)
hi Gemma 2 9B (271, 12)
it Gemma 2 9B (243, 12)
pt Gemma 2 9B (221, 12)
ru Gemma 2 9B (236, 12)
ar Kiwi 22 (229, 12)
de Kiwi 22 (222, 12)
es Kiwi 22 (240, 12)
fr Kiwi 22 (255, 12)
hi Kiwi 22 (271, 12)
it Kiwi 22 (243, 12)
pt Kiwi 22 (221, 12)
ru Kiwi 22 (236, 12)
ar Kiwi 23 XL (229, 12)
de Kiwi 23 XL (222, 12)
es Kiwi 23 XL (240, 12)
fr Kiwi 23 XL (255, 12)
hi Kiwi 23 XL (271, 12)
it Kiwi 23 XL (243, 12)
pt Kiwi 23 XL (221, 12)
ru Kiwi 23 XL (236, 12)
ar Kiwi 23 XXL (229, 12)
de Kiwi 23 XXL (222, 12)
es Kiwi 23 XXL (240, 12)
fr Kiwi 23 XXL (255, 12)
hi Kiwi 23 XXL (271, 12)
it Kiwi 23 XXL (243, 12)
pt Kiwi 23 XXL (221, 12)
ru Kiwi 23 XXL (236, 12)
ar Llama 3.1 70B (229, 12)
de Llama 3.1 70B (222, 12)
es Llama 3.1 70B (240, 12)
fr Llama 3.1 70

Check for missing (None/NaN) values.

In [22]:
# Per column: True if at least one value is missing.
prediction_df.isnull().any()

model                    False
lang                     False
ref_acc_total            False
ref_error_rate_total     False
ref_error_rate_male      False
ref_error_rate_fem       False
ref_error_rate_ratio     False
ref_stat_significance    False
dtype: bool

We save all the corresponding metrics computed (per-language).

In [23]:
# Write the per-(model, language) prediction-analysis table to disk,
# creating the target directory first if needed.
acc_df_path = './results-copied/stats/nonambiguous-counterfactual/references/prediction-analysis/data.csv'
prediction_dir = os.path.dirname(acc_df_path)
os.makedirs(prediction_dir, exist_ok=True)
prediction_df.to_csv(acc_df_path, index=False)
