#### This notebook performs the analysis of the paper on GT translations (counterfactual subset) section 5.2
- Just generates and saves the results of the analysis (Error rates, Error rate ratios, QE ratios, statistical significance etc)
- Figures and Tables of the paper are generated using these results with notebook: {}

In [1]:
import os

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import  wilcoxon
from pprint import pprint
import numpy as np
from src import eval_metrics

# Plot styling for the notebook: seaborn "paper" context with the "whitegrid" style.
sns.set_theme("paper", style="whitegrid")
# Always draw the axes grid, as thin dashed light-gray lines.
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.color"] = "lightgray"
plt.rcParams["grid.linestyle"] = "--"
plt.rcParams["grid.linewidth"] = 0.5

# Do not route text rendering through LaTeX.
plt.rc("text", usetex=False)



In [2]:
# https://github.com/amazon-science/machine-translation-gender-eval/blob/main/accuracy_metric.py
def get_words(line):
    """
    Split a line into its distinct whitespace-separated tokens.

    :param line: Text to tokenize.
    :return: Set of the unique tokens occurring in the line.
    """
    tokens = line.strip().split()
    return set(tokens)


def get_trg_correct_incorrect(trg_line, orig_ref, ctf_ref):
    """
    Compute the overlap between a translation and each of the two references.

    The words that occur in only one of the two references are isolated first;
    the translation is then intersected with each of those two word sets.

    :param trg_line: Translation output to inspect.
    :param orig_ref: Original (correct) reference.
    :param ctf_ref: Counterfactual reference.
    :return: Tuple ``(trg_correct, trg_incorrect)``: the translation words found
             only in the original reference, and those found only in the
             counterfactual reference.
    """
    hyp_vocab = get_words(trg_line)
    orig_vocab = get_words(orig_ref)
    ctf_vocab = get_words(ctf_ref)

    # Words that distinguish one reference from the other.
    only_in_orig = orig_vocab.difference(ctf_vocab)
    only_in_ctf = ctf_vocab.difference(orig_vocab)

    trg_correct = hyp_vocab.intersection(only_in_orig)
    trg_incorrect = hyp_vocab.intersection(only_in_ctf)
    return trg_correct, trg_incorrect


def gender_decision(trg_line, orig_ref, ctf_ref):
    """
    Decide whether the gender of a translation is correct, given the correct
    reference and its counterfactual.

    The translation is labelled 'Incorrect' as soon as it contains at least one
    word that appears only in the counterfactual reference; otherwise it is
    labelled 'Correct' (even when it shares no reference-specific word at all).

    :param trg_line: Sentence from translation output for which to check gender.
    :param orig_ref: Original (correct) reference.
    :param ctf_ref: Counterfactual reference.
    :return: List ``[decision, overlap(hyp, original ref), overlap(hyp, counterfactual ref)]``.
    """
    trg_correct, trg_incorrect = get_trg_correct_incorrect(trg_line, orig_ref, ctf_ref)
    decision = 'Incorrect' if trg_incorrect else 'Correct'
    return [decision, trg_correct, trg_incorrect]


# def gender_decision_modified(trg_line, orig_ref, ctf_ref):
#     """
#     Check if gender of a sentence is correct based on corresponding correct and incorrect references.
#     Algorithm: We make decision based on whether hyp overlaps with original ref and counterfactual ref

#     :param trg_line: Sentence from translation output for which to check gender.
#     :param orig_ref: Original (Correct) reference.
#     :param ctf_ref: Counterfactual reference.
#     :return: a list of decision, overlap(hyp, original ref), overlap(hyp, counterfactual ref)
#     """
#     trg_correct, trg_incorrect = get_trg_correct_incorrect(trg_line, orig_ref, ctf_ref)

#     if trg_incorrect:
#         decision = "Incorrect"
#     else:
#         if trg_correct:
#             decision = "Correct"
#         else:
#             decision = "None"

#     return decision

In [9]:

# QE models under evaluation. Each key is the display name stored in the
# "model" column of the loaded data; "model_id" is the sub-folder name from
# which that model's scores.csv is read, and "type" is stored in the "type"
# column (every entry here is a "neural_metric").
MODELS = {
    "Kiwi 22": {
        "model_id": "Unbabel--wmt22-cometkiwi-da",
        "type": "neural_metric"
    },
    "Kiwi 23 XL": {
        "model_id": "Unbabel--wmt23-cometkiwi-da-xl",
        "type": "neural_metric"
    },
    "Kiwi 23 XXL": {
        "model_id": "Unbabel--wmt23-cometkiwi-da-xxl",
        "type": "neural_metric"
    },
    "xCOMET XL": {
        "model_id": "Unbabel--XCOMET-XL",
        "type": "neural_metric"
    },
    "xCOMET XXL": {
        "model_id": "Unbabel--XCOMET-XXL",
        "type": "neural_metric"
    },
    "MetricX23 LARGE": {
        "model_id": "google--metricx-23-qe-large-v2p0",
        "type": "neural_metric"
    },
    "MetricX23 XL": {
        "model_id": "google--metricx-23-qe-xl-v2p0",
        "type": "neural_metric"
    },
}
           
LANGS = ["de","ar","it","es","pt","fr","hi","ru"]

# Short names for the four QE score columns of scores.csv.
# NOTE(review): the two trailing letters presumably encode
# <source gender><translation gender> -- confirm against the scoring script.
hyp_rename_dict = {
    "score_feminine_feminine": "hyp_scores_ff",
    "score_feminine_masculine": "hyp_scores_fm",
    "score_masculine_masculine": "hyp_scores_mm",
    "score_masculine_feminine": "hyp_scores_mf",
}

# One DataFrame per (language, QE model) pair, concatenated after the loop.
tot = []

for lang in LANGS:
    for m, info in MODELS.items():
        # Load the GT translations, the references and the QE scores of model `m`.
        # The BLEU scores and quality bins stored in the same file are not used in this cell.
        res_hyp = pd.read_csv(f"./results-copied/scores/nonambiguous-counterfactual/mtgeneval/gt-translations-augmented/{lang}/{info['model_id']}/scores.csv")

        # Gender check of each translation: its own-gender reference is the
        # "original" one and the opposite-gender reference is the counterfactual.
        res_hyp["decision_f"] = res_hyp.apply(
            lambda row: gender_decision(
                row['translate_feminine'],
                row['reference_feminine'],
                row['reference_masculine'],
            )[0],
            axis=1,
        )
        res_hyp["decision_m"] = res_hyp.apply(
            lambda row: gender_decision(
                row['translate_masculine'],
                row['reference_masculine'],
                row['reference_feminine'],
            )[0],
            axis=1,
        )

        # Tag every row with the model / language it belongs to.
        res_hyp["model"] = m
        res_hyp["lang"] = lang
        res_hyp["type"] = info["type"]

        res_hyp = res_hyp.rename(hyp_rename_dict, axis=1)
        tot.append(res_hyp)

# Row labels are NOT reset here, so each file's own row labels are kept.
res = pd.concat(tot)
data_all = res
print(data_all.columns)


Index(['ID', 'source_feminine', 'translate_feminine', 'reference_feminine',
       'source_masculine', 'translate_masculine', 'reference_masculine',
       'hyp_scores_ff', 'hyp_scores_fm', 'hyp_scores_mf', 'hyp_scores_mm',
       'scores_f_bleu', 'scores_m_bleu', 'f_quality_bins', 'm_quality_bins',
       'decision_f', 'decision_m', 'model', 'lang', 'type'],
      dtype='object')


In [10]:

# The four QE score columns every retained sample must have.
SCORE_COLS = ["hyp_scores_ff", "hyp_scores_fm", "hyp_scores_mf", "hyp_scores_mm"]

final_df = []
original_shapes = {}

# Step 1: per (language, model), drop samples without scores and bring all
# scores to a common scale.
for (lang, model), subdf in data_all.groupby(["lang", "model"]):
    # Unfiltered shape, recorded once per language (from the first model seen);
    # it is the baseline for the removal percentages computed below.
    original_shapes.setdefault(lang, subdf.shape)

    # Drop samples for which any of the four scores is missing.
    filtered = subdf.dropna(subset=SCORE_COLS).copy()

    # "type" is constant within a (lang, model) group, so it is read from the
    # never-empty group instead of from `filtered`, which may have lost all its
    # rows to the NaN filter (indexing row 0 of an empty frame raises IndexError).
    model_type = subdf["type"].iloc[0]

    if model_type == "LLM":
        # Rescale scores of LLMs from 0-100 to 0-1.
        filtered[SCORE_COLS] = filtered[SCORE_COLS] / 100
        # Keep only rows whose four scores all lie in (0.1, 1].
        # NOTE(review): the original comment said "outside the range [0, 1]" but
        # the lower bound in the code is 0.1 -- confirm which one is intended.
        # No model in MODELS currently has type "LLM", so this branch is not hit.
        in_range = ((filtered[SCORE_COLS] > 0.1) & (filtered[SCORE_COLS] <= 1)).all(axis=1)
        filtered = filtered.loc[in_range]
    elif "MetricX" in model:
        # Map MetricX scores s to 1 - s / 25.
        # NOTE(review): presumably MetricX outputs an error score in [0, 25]
        # (lower is better), so this flips and rescales it to [0, 1] -- confirm.
        filtered[SCORE_COLS] = 1 - filtered[SCORE_COLS] / 25
    # Any other neural metric is used as is: no rescaling or filtering.

    final_df.append(filtered)

data_all = pd.concat(final_df).copy()


# Step 2: per language, keep only the rows that survived for every model.
# Assumes the index labels identify the same sample across models -- they are
# the row labels carried over from loading.
keep_rows_inter = {}
for lang, lang_df in data_all.groupby("lang"):
    common_rows = set(lang_df.index)
    for _, model_df in lang_df.groupby("model"):
        common_rows &= set(model_df.index)
    # Sorted so that the row order does not depend on set iteration order.
    keep_rows_inter[lang] = sorted(common_rows)


for lang in keep_rows_inter:
    print(f"Language: {lang}  # {original_shapes[lang][0]}  samples before taking intersection of models. ")
    print(f"Language: {lang} kept # {len(keep_rows_inter[lang])}  samples (after intersection between all models). ")

# Step 3: apply the intersection and record, per language and model, the
# percentage of samples removed relative to the unfiltered data.
final_df = []
removed_info = {lang: {} for lang in LANGS}

for (lang, model), subdf in data_all.groupby(["lang", "model"]):
    filtered = subdf.loc[keep_rows_inter[lang]]
    removed_info[lang][model] = 100 * (1 - filtered.shape[0] / original_shapes[lang][0])

    final_df.append(filtered)
data_all = pd.concat(final_df).copy()

Language: ar  # 300  samples before taking intersection of models. 
Language: ar kept # 300  samples (after intersection between all models). 
Language: de  # 300  samples before taking intersection of models. 
Language: de kept # 300  samples (after intersection between all models). 
Language: es  # 300  samples before taking intersection of models. 
Language: es kept # 300  samples (after intersection between all models). 
Language: fr  # 300  samples before taking intersection of models. 
Language: fr kept # 300  samples (after intersection between all models). 
Language: hi  # 300  samples before taking intersection of models. 
Language: hi kept # 300  samples (after intersection between all models). 
Language: it  # 300  samples before taking intersection of models. 
Language: it kept # 300  samples (after intersection between all models). 
Language: pt  # 300  samples before taking intersection of models. 
Language: pt kept # 300  samples (after intersection between all models). 

In [11]:
# Turn the nested {lang: {model: percentage removed}} dict into a table
# (one column per language, one row per model) and display it.
removed_info = pd.DataFrame(removed_info)
removed_info

Unnamed: 0,de,ar,it,es,pt,fr,hi,ru
Kiwi 22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kiwi 23 XL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kiwi 23 XXL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MetricX23 LARGE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MetricX23 XL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xCOMET XL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xCOMET XXL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


After filtering the data according to the received scores, we apply a second filter based on the gender accuracy and the quality of the GT translations. This involves:

- Gender Accuracy (step 1): we verify that both the masculine and feminine GT translations are correctly inflected in terms of gender by comparing them with the corresponding references. We keep samples that have both gender forms correctly inflected.

- Quality Match (step 2): We verify that the masculine and feminine GT translations are of the same quality. We use BLEU, a lexical-overlap metric, to measure the quality of each translation (computing both BLEU(GT_f, h_f) and BLEU(GT_m, h_m)). We then split the samples into quality buckets according to the obtained BLEU scores and keep the paired samples (s_f, GT_f) and (s_m, GT_m) that (a) fall into the same quality bucket according to BLEU, i.e. BLEU(GT_f, h_f) ~ BLEU(GT_m, h_m), and (b) belong to a bucket whose feminine and masculine BLEU scores are not significantly different (two-sided Wilcoxon test).


<!-- The following cell runs for each lang/QE model language pair and takes into account the samples that:
- have the same gender for both translations
- are of the same quality (quality bucket)
- the mean bleu scores of the bucket are not stat different (ensure good quality for both groups) (according to Wilcoxon Test!)
- drops quality buckets with very few samples (less than 10)!


We store two dfs:
- one with the results of the analysis on each quality bucket (quality_bucket_results_df),
- and another one (data_df) that holds all the instances before the analysis (but filtered as explained above), in case we want to run an overall analysis for each QE model/lp combination.  -->

In [12]:
# Bookkeeping of the quality buckets that are discarded, as [lang, model, quality].
buckets_with_small_sizes = []  # buckets with fewer than 10 samples
drop_buckets_bleu = []  # buckets whose feminine/masculine BLEU scores differ significantly

keep_instances = []

for (lang, model), subdf in data_all.groupby(['lang', 'model']):
    quality_buckets_kept = []

    # Step 1 (gender accuracy): keep samples whose feminine AND masculine
    # translations were both labelled "Correct".
    gender_corr = subdf[(subdf['decision_f'] == "Correct") & (subdf['decision_m'] == "Correct")]

    # Step 2a (quality match): keep pairs whose two translations fall into the
    # same BLEU quality bucket.
    samples_same_qual = gender_corr[gender_corr["f_quality_bins"] == gender_corr["m_quality_bins"]]

    # Step 2b: within each bucket, the paired BLEU scores must not differ
    # significantly; otherwise the whole bucket is thrown out.
    # Grouping by a one-element list is kept on purpose: it is what makes
    # `quality` a 1-tuple such as ('Excellent',) in the printed log.
    for quality, qual_df in samples_same_qual.groupby(['f_quality_bins']):

        if qual_df.shape[0] < 10:
            print("Very small samples size for:", lang,model,quality)
            buckets_with_small_sizes.append([lang, model, quality])
            continue

        _stat, p_value = wilcoxon(qual_df['scores_f_bleu'], qual_df['scores_m_bleu'], alternative='two-sided')
        if p_value < 0.05:
            # list.append takes exactly one argument: store the triple as a
            # single entry, consistent with buckets_with_small_sizes.
            drop_buckets_bleu.append([lang, model, quality])
        else:
            quality_buckets_kept.append(qual_df)
            print("Quality Bucket kept:  ",lang,model,quality,qual_df.shape)

    # pd.concat raises ValueError on an empty list, so a (lang, model) pair for
    # which every bucket was discarded is skipped instead of aborting the cell.
    if not quality_buckets_kept:
        print("No quality bucket kept for:", lang, model)
        continue

    merge_quality_buckets = pd.concat(quality_buckets_kept)
    keep_instances.append(merge_quality_buckets)

filtered_df = pd.concat(keep_instances)

Quality Bucket kept:   ar Kiwi 22 ('Excellent',) (40, 20)
Quality Bucket kept:   ar Kiwi 22 ('Fair',) (32, 20)
Quality Bucket kept:   ar Kiwi 22 ('Good',) (20, 20)
Quality Bucket kept:   ar Kiwi 22 ('Poor',) (27, 20)
Quality Bucket kept:   ar Kiwi 22 ('Useless',) (18, 20)
Quality Bucket kept:   ar Kiwi 22 ('Very good',) (17, 20)
Quality Bucket kept:   ar Kiwi 23 XL ('Excellent',) (40, 20)
Quality Bucket kept:   ar Kiwi 23 XL ('Fair',) (32, 20)
Quality Bucket kept:   ar Kiwi 23 XL ('Good',) (20, 20)
Quality Bucket kept:   ar Kiwi 23 XL ('Poor',) (27, 20)
Quality Bucket kept:   ar Kiwi 23 XL ('Useless',) (18, 20)
Quality Bucket kept:   ar Kiwi 23 XL ('Very good',) (17, 20)
Quality Bucket kept:   ar Kiwi 23 XXL ('Excellent',) (40, 20)
Quality Bucket kept:   ar Kiwi 23 XXL ('Fair',) (32, 20)
Quality Bucket kept:   ar Kiwi 23 XXL ('Good',) (20, 20)
Quality Bucket kept:   ar Kiwi 23 XXL ('Poor',) (27, 20)
Quality Bucket kept:   ar Kiwi 23 XXL ('Useless',) (18, 20)
Quality Bucket kept:   ar K

After this filtering process, we compute for each model-language pair (by assessing the GT translations):
- the QE ratio, denoted QE_{GT}(s_f,GT_f) / QE_{GT}(s_m,GT_m) (continuous-analysis results), with statistical significance from a one-sample test;
- the total error rate ER_{GT} (prediction-analysis results);
- the error rate ratio between the two gender groups, denoted Φ_{GT} = ER_{GT}(S^F) / ER_{GT}(S^M) (prediction-analysis results), with statistical significance from bootstrap resampling.




In [13]:
continous_results = []
prediction_results = []
for (lang, model), subdf in filtered_df.groupby(['lang', 'model']):
    print(model,lang,subdf.shape)

    # The four score columns of this (language, model) group.
    scores_ff = subdf["hyp_scores_ff"]
    scores_fm = subdf["hyp_scores_fm"]
    scores_mf = subdf["hyp_scores_mf"]
    scores_mm = subdf["hyp_scores_mm"]
    n_pairs = subdf.shape[0]

    # Continuous results -- QE ratio between the ff and mm scores, together
    # with the test result returned by eval_metrics.Ratio.
    HYP_qe_ratios, _, HYP_qe_ratios_1sampl_test = eval_metrics.Ratio({"F": scores_ff, "M": scores_mm}).score()
    d1 = {
        "model": model,
        "lang": lang,
        "gt_score_M_mean": scores_mm.mean(),
        "gt_score_F_mean": scores_ff.mean(),
        "gt_score_M_std": scores_mm.std(),
        "gt_score_F_std": scores_ff.std(),
        "gt_ratio_mean": HYP_qe_ratios.mean(),
        "gt_ratio_std": HYP_qe_ratios.std(),
        "gt_significance_ratio": HYP_qe_ratios_1sampl_test,
    }
    continous_results.append(d1)

    # Hard-prediction-based results -- error rate and error rate ratio.
    # Each sample contributes two entries: the first n_pairs entries carry the
    # label "female" (ff / fm scores), the last n_pairs the label "male"
    # (mf / mm scores).
    F_scores = np.array(pd.concat((scores_ff, scores_mf), axis=0))
    M_scores = np.array(pd.concat((scores_fm, scores_mm), axis=0))
    ground_truth = np.array(["female"] * n_pairs + ["male"] * n_pairs)

    prediction_inputs = {"F": F_scores, "M": M_scores, "y_true": ground_truth}
    acc = eval_metrics.Accuracy(prediction_inputs).score()
    bootstrap = eval_metrics.GroupMetricsBootstraping(prediction_inputs, alternative="greater")
    hyp_group_metrics = bootstrap.stat_significance_with_paired_bootstrap()
    group_results = hyp_group_metrics["results"]

    d2 = {
        "model": model,
        "lang": lang,
        "gt_acc_total": acc,
        "gt_error_rate_total": group_results["total_error_rate"],
        "gt_error_rate_male": group_results["male"]["fnr"],
        "gt_error_rate_fem": group_results["female"]["fnr"],
        "gt_error_rate_ratio": group_results["fnr_ratio"],
        "gt_stat_significance": hyp_group_metrics["stat_significance"],
    }
    prediction_results.append(d2)

# reset_index() turns the default row index into an explicit "index" column.
continuous_df = pd.DataFrame(continous_results).reset_index()

prediction_df = pd.DataFrame(prediction_results).reset_index()



Kiwi 22 ar (154, 20)
Kiwi 23 XL ar (154, 20)
Kiwi 23 XXL ar (154, 20)
MetricX23 LARGE ar (154, 20)
MetricX23 XL ar (154, 20)
xCOMET XL ar (154, 20)
xCOMET XXL ar (154, 20)
Kiwi 22 de (153, 20)
Kiwi 23 XL de (153, 20)
Kiwi 23 XXL de (153, 20)
MetricX23 LARGE de (153, 20)
MetricX23 XL de (153, 20)
xCOMET XL de (153, 20)
xCOMET XXL de (153, 20)
Kiwi 22 es (146, 20)
Kiwi 23 XL es (146, 20)
Kiwi 23 XXL es (146, 20)
MetricX23 LARGE es (146, 20)
MetricX23 XL es (146, 20)
xCOMET XL es (146, 20)
xCOMET XXL es (146, 20)
Kiwi 22 fr (148, 20)
Kiwi 23 XL fr (148, 20)
Kiwi 23 XXL fr (148, 20)
MetricX23 LARGE fr (148, 20)
MetricX23 XL fr (148, 20)
xCOMET XL fr (148, 20)
xCOMET XXL fr (148, 20)
Kiwi 22 hi (129, 20)
Kiwi 23 XL hi (129, 20)
Kiwi 23 XXL hi (129, 20)
MetricX23 LARGE hi (129, 20)
MetricX23 XL hi (129, 20)
xCOMET XL hi (129, 20)
xCOMET XXL hi (129, 20)
Kiwi 22 it (147, 20)
Kiwi 23 XL it (147, 20)
Kiwi 23 XXL it (147, 20)
MetricX23 LARGE it (147, 20)
MetricX23 XL it (147, 20)
xCOMET XL it (1

In [14]:
# Sanity check: report, per column of the continuous results, whether it
# contains any missing value (None/NaN).
continuous_df.isnull().any()

index                    False
model                    False
lang                     False
gt_score_M_mean          False
gt_score_F_mean          False
gt_score_M_std           False
gt_score_F_std           False
gt_ratio_mean            False
gt_ratio_std             False
gt_significance_ratio    False
dtype: bool

In [15]:
# Same sanity check for the prediction results: True marks a column that
# contains at least one missing value (None/NaN).
prediction_df.isnull().any()

index                   False
model                   False
lang                    False
gt_acc_total            False
gt_error_rate_total     False
gt_error_rate_male      False
gt_error_rate_fem       False
gt_error_rate_ratio     False
gt_stat_significance    False
dtype: bool

In [17]:
# Save the continuous-analysis results (QE score statistics and QE ratios).
continuous_path = './results-copied/stats/nonambiguous-counterfactual/gt-translations/continuous-analysis/results.csv'
# Create the target folder if it does not exist yet.
os.makedirs(os.path.dirname(continuous_path), exist_ok=True)
# index=False omits the DataFrame's row index; the explicit "index" column
# created by reset_index() is still written.
continuous_df.to_csv(continuous_path, index=False)

In [18]:
# Save the prediction-analysis results (accuracy, error rates, error rate ratio).
prediction_path = './results-copied/stats/nonambiguous-counterfactual/gt-translations/prediction-analysis/results.csv'
# Create the target folder if it does not exist yet.
os.makedirs(os.path.dirname(prediction_path), exist_ok=True)
# index=False omits the DataFrame's row index; the explicit "index" column
# created by reset_index() is still written.
prediction_df.to_csv(prediction_path, index=False)