In [50]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Global plotting style: seaborn "paper" context with a serif font, followed by
# grid appearance overrides applied after the theme call.
sns.set_theme(context="paper", style="whitegrid", font="serif", font_scale=2.4)
mpl.rcParams.update(
    {
        "axes.grid": True,
        "grid.color": "lightgray",
        "grid.linestyle": "--",
        "grid.linewidth": 0.5,
    }
)
plt.rc("text", usetex=False)

### Counterfactual dataset -- References

#### Tables for continuous metrics QE ratio

Results are computed with analysis_mt_geneval_counterfactual_all_models.ipynb and shown below. 

- QE(s_f,h_f) / QE(s_m,h_m)   aggregated over  languages
- QE(s_f,h_f) / QE(s_m,h_m)   one-sample statistical test, per language

In [24]:
# Read the continuous-analysis results CSV for the reference translations.
res_overall_analysis_path = './results-copied/stats/nonambiguous-counterfactual/references/continuous-analysis/results.csv'
results_df_cont = pd.read_csv(filepath_or_buffer=res_overall_analysis_path)

def create_significance_table_QE_ratio(df, value_col='ref_ratio_mean'):
    r"""Build a LaTeX ``table*`` of per-language QE ratios with significance markers.

    One row is produced per model, in order of first appearance in ``df``, and
    one column per language in a fixed order. Each cell holds the ratio formatted
    to three decimals followed by ``\gabad`` when ``stat_significance`` is truthy
    and ``\gagood`` otherwise. A (model, language) pair that is absent from ``df``
    is rendered as ``!``.

    Args:
        df: DataFrame with the columns ``model``, ``lang``, ``stat_significance``
            and ``value_col``.
        value_col: Name of the column holding the ratio to print. Defaults to
            ``'ref_ratio_mean'``.

    Returns:
        The complete LaTeX table as a single string.
    """
    # Extract unique models from the DataFrame; languages use a fixed order
    models = df['model'].unique()
    languages = ['de', 'es', 'fr', 'it', 'pt', 'ru', 'hi', 'ar']

    # Zero-based row positions after which extra vertical space is inserted.
    # Checking the position (instead of indexing models[2] / models[4]) keeps the
    # function working when fewer than five models are present.
    spacing_after = {2, 4}

    # Define the LaTeX table header
    table_header = r"""
    \begin{table*}
    \centering
    \small
    \begin{tabular}{l""" + "c" * len(languages) + r"""}
    \toprule
    \rowcolor{white}
    \textbf{Metrics} & \multicolumn{""" + str(len(languages)) + r"""}{c}{$QE(s_F,h_F) / QE(s_M,h_M)$ } \\
    & """ + " & ".join(languages) + r""" \\
    \midrule
    """

    # Generate table rows for each model
    table_rows = ""
    for position, model in enumerate(models):
        # Get data for the current model, addressable by language code
        model_data = df[df['model'] == model].set_index('lang')

        # One cell per language: ratio followed by the significance macro
        cells = []
        for lang in languages:
            if lang in model_data.index:
                marker = r"\gabad" if model_data.loc[lang, 'stat_significance'] else r"\gagood"
                cells.append(f"{model_data.loc[lang, value_col]:.3f} " + marker)
            else:
                cells.append("!")
        row = model + " & " + " & ".join(cells) + r" \\" + "\n"

        # Add spacing after certain models (optional)
        if position in spacing_after:
            row += r"\addlinespace[0.2cm]" + "\n"

        table_rows += row

    # Define the LaTeX table footer
    # NOTE(review): the caption mentions \cmark and parentheses, while the cells
    # emit \gabad / \gagood without parentheses -- confirm the wording.
    # NOTE(review): the same \label is emitted by the other table builders in this
    # notebook; LaTeX reports duplicate labels if several tables share a document.
    table_footer = r"""
    \bottomrule
    \end{tabular}
    \caption{\textbf{MT-GenEval Counterfactual Results} on human-written translations. 
    A green check mark (\cmark) indicates that correctly-inflected feminine references receive statistically significantly (p<0.05) lower scores than correctly-inflected masculine references. Numbers represent the ratio mean, and significance results are shown in parentheses.} 
    \label{tab:contextual_stat_tests}
    \end{table*}
    """

    # Combine header, rows, and footer
    latex_table = table_header + table_rows + table_footer

    return latex_table



# Select the columns the table needs and expose the significance flag under the
# generic name expected by the table builder, then print the LaTeX source.
ref_stat_sign_copy = (
    results_df_cont[['model', 'lang', 'ref_significance_ratio', 'ref_ratio_mean']]
    .rename(columns={'ref_significance_ratio': 'stat_significance'})
)
stat_table_latex = create_significance_table_QE_ratio(ref_stat_sign_copy)
print(stat_table_latex)


    \begin{table*}
    \centering
    \small
    \begin{tabular}{lcccccccc}
    \toprule
    \rowcolor{white}
    \textbf{Metrics} & \multicolumn{8}{c}{$QE(s_F,h_F) / QE(s_M,h_M)$ } \\
    & de & es & fr & it & pt & ru & hi & ar \\
    \midrule
    GPT 4 & 0.999 \gagood & 1.003 \gagood & 0.998 \gagood & 1.003 \gagood & 1.000 \gagood & 0.993 \gagood & 1.000 \gagood & 1.007 \gagood \\
Gemma 2 9B & 0.997 \gagood & 1.001 \gagood & 1.002 \gagood & 1.004 \gagood & 0.999 \gagood & 0.998 \gagood & 1.003 \gagood & 0.999 \gagood \\
Kiwi 22 & 0.995 \gabad & 0.995 \gabad & 0.995 \gabad & 0.996 \gabad & 1.000 \gagood & 0.997 \gabad & 0.998 \gabad & 1.000 \gagood \\
\addlinespace[0.2cm]
Kiwi 23 XL & 0.997 \gagood & 1.000 \gagood & 0.998 \gagood & 0.996 \gagood & 0.998 \gagood & 1.000 \gagood & 1.006 \gagood & 1.015 \gagood \\
Kiwi 23 XXL & 1.005 \gagood & 0.995 \gagood & 1.006 \gagood & 0.991 \gabad & 0.999 \gagood & 1.001 \gagood & 1.006 \gagood & 1.025 \gagood \\
\addlinespace[0.2cm]
Llama 3.1 70

Table 1 of paper QE ratio

In [25]:
# Per-model aggregate of the QE ratio (pivot_table's default aggregation is the mean).
results_df_cont_aggr = results_df_cont.pivot_table(
    values=['ref_ratio_mean'],
    index='model',
)
results_df_cont_aggr.round(3)

Unnamed: 0_level_0,ref_ratio_mean
model,Unnamed: 1_level_1
GPT 4,1.001
Gemma 2 9B,1.0
Kiwi 22,0.997
Kiwi 23 XL,1.001
Kiwi 23 XXL,1.003
Llama 3.1 70B,0.995
MetricX23 LARGE,0.998
MetricX23 XL,1.0
Mistral 7B,1.0
xCOMET XL,0.988


#### Tables for prediction-analysis
Results are computed with analysis_mt_geneval_counterfactual_all_models.ipynb and shown below. 

- Total Error Rate  aggregated over languages
- Ratio Φ = ER(S^F) / ER(S^M)  aggregated over languages
- Statistical significance test of Φ  per language (bootstrap resampling)

In [26]:
# Read the prediction-analysis CSV for the reference translations and keep only
# the error-rate columns used by the tables below.
res_overall_analysis_path = './results-copied/stats/nonambiguous-counterfactual/references/prediction-analysis/data.csv'
results_df = pd.read_csv(filepath_or_buffer=res_overall_analysis_path)
errors_df = results_df.loc[
    :,
    [
        "model",
        "lang",
        "ref_error_rate_total",
        "ref_error_rate_male",
        "ref_error_rate_fem",
        "ref_error_rate_ratio",
        "ref_stat_significance",
    ],
]

Table 1 of paper ER_R and Φ_R

In [27]:
# Per-model aggregate of total error rate and error-rate ratio
# (pivot_table's default aggregation is the mean).
errors_df_aggr = errors_df.pivot_table(
    values=["ref_error_rate_total", 'ref_error_rate_ratio'],
    index='model',
)
errors_df_aggr.round(2)

Unnamed: 0_level_0,ref_error_rate_ratio,ref_error_rate_total
model,Unnamed: 1_level_1,Unnamed: 2_level_1
GPT 4,1.15,0.16
Gemma 2 9B,1.36,0.28
Kiwi 22,1.7,0.11
Kiwi 23 XL,1.18,0.09
Kiwi 23 XXL,0.87,0.07
Llama 3.1 70B,1.16,0.31
MetricX23 LARGE,1.25,0.31
MetricX23 XL,1.19,0.12
Mistral 7B,1.13,0.74
xCOMET XL,1.81,0.1


In [28]:
def create_significance_table_Phi_ratiov2(df, value_col='ref_error_rate_ratio'):
    r"""Build a LaTeX ``table*`` of per-language error-rate ratios with significance markers.

    One row is produced per model, in order of first appearance in ``df``, and
    one column per language in a fixed order. Each cell holds the ratio formatted
    to three decimals followed by ``\gabad`` when ``stat_significance`` is truthy
    and ``\gagood`` otherwise. A (model, language) pair that is absent from ``df``
    is rendered as ``!``.

    Args:
        df: DataFrame with the columns ``model``, ``lang``, ``stat_significance``
            and ``value_col``.
        value_col: Name of the column holding the ratio to print. Defaults to
            ``'ref_error_rate_ratio'``.

    Returns:
        The complete LaTeX table as a single string.
    """
    # Extract unique models from the DataFrame; languages use a custom fixed order
    models = df['model'].unique()
    languages = ['de', 'es', 'fr', 'it', 'pt', 'ru', 'hi', 'ar']

    # Zero-based row positions after which extra vertical space is inserted.
    # Checking the position (instead of indexing models[2] / models[4]) keeps the
    # function working when fewer than five models are present.
    spacing_after = {2, 4}

    # Define the LaTeX table header
    table_header = r"""
    \begin{table*}
    \centering
    \small
    \begin{tabular}{l""" + "c" * len(languages) + r"""}
    \toprule
    \rowcolor{white}
    \textbf{Metrics} & \multicolumn{""" + str(len(languages)) + r"""}{c}{$\Phi(s_F,s_M)$} \\
    & """ + " & ".join(languages) + r""" \\
    \midrule
    """

    # Generate table rows for each model
    table_rows = ""
    for position, model in enumerate(models):
        # Get data for the current model, addressable by language code
        model_data = df[df['model'] == model].set_index('lang')

        # One cell per language: ratio followed by the significance macro
        cells = []
        for lang in languages:
            if lang in model_data.index:
                marker = r"\gabad" if model_data.loc[lang, 'stat_significance'] else r"\gagood"
                cells.append(f"{model_data.loc[lang, value_col]:.3f} " + marker)
            else:
                cells.append("!")
        row = model + " & " + " & ".join(cells) + r" \\" + "\n"

        # Add spacing after certain models (optional)
        if position in spacing_after:
            row += r"\addlinespace[0.2cm]" + "\n"

        table_rows += row

    # Define the LaTeX table footer
    # NOTE(review): the caption mentions \cmark, while the cells emit
    # \gabad / \gagood -- confirm the wording.
    # NOTE(review): the same \label is emitted by the other table builders in this
    # notebook; LaTeX reports duplicate labels if several tables share a document.
    table_footer = r"""
    \bottomrule
    \end{tabular}
    \caption{\textbf{MT-GenEval Counterfactual Results} on human-written translations. 
    A green check mark (\cmark) indicates that metrics make statistically significantly (p<0.05) more errors on sources with feminine referents compared to their masculine counterparts. } 
    \label{tab:contextual_stat_tests}
    \end{table*}
    """

    # Combine header, rows, and footer
    latex_table = table_header + table_rows + table_footer

    return latex_table



# Select the columns the table needs and expose the significance flag under the
# generic name expected by the table builder, then print the LaTeX source.
ref_stat_sign_copy = (
    errors_df[['model', 'lang', 'ref_stat_significance', 'ref_error_rate_ratio']]
    .rename(columns={'ref_stat_significance': 'stat_significance'})
)
stat_table_latex = create_significance_table_Phi_ratiov2(ref_stat_sign_copy)
print(stat_table_latex)


    \begin{table*}
    \centering
    \small
    \begin{tabular}{lcccccccc}
    \toprule
    \rowcolor{white}
    \textbf{Metrics} & \multicolumn{8}{c}{$\Phi(s_F,s_M)$} \\
    & de & es & fr & it & pt & ru & hi & ar \\
    \midrule
    GPT 4 & 0.857 \gagood & 1.200 \gabad & 0.939 \gagood & 1.167 \gabad & 1.037 \gabad & 1.368 \gabad & 1.214 \gabad & 1.400 \gabad \\
Gemma 2 9B & 0.775 \gagood & 1.317 \gabad & 1.767 \gabad & 1.443 \gabad & 1.459 \gabad & 1.421 \gabad & 1.248 \gabad & 1.426 \gabad \\
Kiwi 22 & 2.500 \gabad & 1.909 \gabad & 2.333 \gabad & 1.773 \gabad & 0.938 \gagood & 2.500 \gabad & 0.683 \gagood & 1.000 \gagood \\
\addlinespace[0.2cm]
Kiwi 23 XL & 1.143 \gabad & 1.136 \gabad & 0.500 \gagood & 1.000 \gagood & 1.333 \gabad & 2.333 \gabad & 1.297 \gabad & 0.706 \gagood \\
Kiwi 23 XXL & 0.286 \gagood & 0.923 \gagood & 0.750 \gagood & 1.036 \gabad & 0.692 \gagood & 2.000 \gabad & 1.000 \gagood & 0.273 \gagood \\
\addlinespace[0.2cm]
Llama 3.1 70B & 0.902 \gagood & 1.265 \gaba

Instead of taking the average of the ratios Φ over languages, I present here another method that is more robust to outliers. It first averages the error rates ER(S^F) and ER(S^M) over languages and then computes the ratio of these averages.

In [57]:
# NOTE(review): this whole cell is commented out, so nothing is executed here.
# It sketches the ratio-of-averages aggregation: pivot the female and male error
# rates per model, then divide the female column by the male column.
# `Root_path` is not defined in the visible part of the notebook -- define it
# (or reuse the relative path used by the other cells) before re-enabling.
# res_overall_analysis_path = os.path.join(Root_path,'stats/counterfactual/prediction-analysis/data.csv')
# results_df = pd.read_csv(res_overall_analysis_path)
# errors_df = results_df[["model","lang","ref_error_rate_total",
#                                       "ref_error_rate_male",
#                                       "ref_error_rate_fem",
#                                       ]]
# errors_df_aggr = errors_df.pivot_table(index='model',values=["ref_error_rate_total",
#                                       "ref_error_rate_male",
#                                       "ref_error_rate_fem"])
# errors_df_aggr["ref_error_ratio"] = errors_df_aggr["ref_error_rate_fem"]/errors_df_aggr["ref_error_rate_male"]
# errors_df_aggr

## Counterfactual Automatic Translations Section (GT translations)

In [32]:
# Read the continuous-analysis results CSV for the GT translations.
res_overall_analysis_path = './results-copied/stats/nonambiguous-counterfactual/gt-translations/continuous-analysis/results.csv'
results_df_cont = pd.read_csv(filepath_or_buffer=res_overall_analysis_path)


def create_significance_table_QE_ratio(df, value_col='qe_ratio_mean'):
    r"""Build a LaTeX ``table*`` of per-language QE ratios with significance markers.

    One row is produced per model, in order of first appearance in ``df``, and
    one column per language in a fixed order. Each cell holds the ratio formatted
    to three decimals followed by ``\gabad`` when ``stat_significance`` is truthy
    and ``\gagood`` otherwise. A (model, language) pair that is absent from ``df``
    is rendered as ``!``.

    Args:
        df: DataFrame with the columns ``model``, ``lang``, ``stat_significance``
            and ``value_col``.
        value_col: Name of the column holding the ratio to print. Defaults to
            ``'qe_ratio_mean'``.

    Returns:
        The complete LaTeX table as a single string.
    """
    # Extract unique models from the DataFrame; languages use a fixed order
    models = df['model'].unique()
    languages = ['de', 'es', 'fr', 'it', 'pt', 'ru', 'hi', 'ar']

    # Zero-based row positions after which extra vertical space is inserted.
    # Checking the position (instead of indexing models[2] / models[4]) keeps the
    # function working when fewer than five models are present.
    spacing_after = {2, 4}

    # Define the LaTeX table header
    table_header = r"""
    \begin{table*}
    \centering
    \small
    \begin{tabular}{l""" + "c" * len(languages) + r"""}
    \toprule
    \rowcolor{white}
    \textbf{Metrics} & \multicolumn{""" + str(len(languages)) + r"""}{c}{$QE(s_F,h_F) / QE(s_M,h_M)$ } \\
    & """ + " & ".join(languages) + r""" \\
    \midrule
    """

    # Generate table rows for each model
    table_rows = ""
    for position, model in enumerate(models):
        # Get data for the current model, addressable by language code
        model_data = df[df['model'] == model].set_index('lang')

        # One cell per language: ratio followed by the significance macro
        cells = []
        for lang in languages:
            if lang in model_data.index:
                marker = r"\gabad" if model_data.loc[lang, 'stat_significance'] else r"\gagood"
                cells.append(f"{model_data.loc[lang, value_col]:.3f} " + marker)
            else:
                cells.append("!")
        row = model + " & " + " & ".join(cells) + r" \\" + "\n"

        # Add spacing after certain models (optional)
        if position in spacing_after:
            row += r"\addlinespace[0.2cm]" + "\n"

        table_rows += row

    # Define the LaTeX table footer
    # The first caption sentence names automatic (GT) translations because this
    # table is built from the gt-translations results, not the human references.
    # NOTE(review): the second sentence still speaks of "references", \cmark and
    # parentheses, while the cells emit \gabad / \gagood -- confirm the wording.
    # NOTE(review): the same \label is emitted by the other table builders in this
    # notebook; LaTeX reports duplicate labels if several tables share a document.
    table_footer = r"""
    \bottomrule
    \end{tabular}
    \caption{\textbf{MT-GenEval Counterfactual Results} on automatic translations (GT). 
    A green check mark (\cmark) indicates that correctly-inflected feminine references receive statistically significantly (p<0.05) lower scores than correctly-inflected masculine references. Numbers represent the ratio mean, and significance results are shown in parentheses.} 
    \label{tab:contextual_stat_tests}
    \end{table*}
    """

    # Combine header, rows, and footer
    latex_table = table_header + table_rows + table_footer

    return latex_table


# Select the columns the table needs and rename them to the generic names
# expected by the table builder, then print the LaTeX source.
GT_stat_sign_copy = (
    results_df_cont[['model', 'lang', 'gt_significance_ratio', 'gt_ratio_mean']]
    .rename(
        columns={
            'gt_significance_ratio': 'stat_significance',
            'gt_ratio_mean': 'qe_ratio_mean',
        }
    )
)
stat_table_latex = create_significance_table_QE_ratio(GT_stat_sign_copy)
print(stat_table_latex)



    \begin{table*}
    \centering
    \small
    \begin{tabular}{lcccccccc}
    \toprule
    \rowcolor{white}
    \textbf{Metrics} & \multicolumn{8}{c}{$QE(s_F,h_F) / QE(s_M,h_M)$ } \\
    & de & es & fr & it & pt & ru & hi & ar \\
    \midrule
    Kiwi 22 & 0.998 \gagood & 0.997 \gabad & 0.996 \gabad & 0.996 \gabad & 0.998 \gagood & 0.996 \gagood & 0.997 \gabad & 0.997 \gagood \\
Kiwi 23 XL & 0.999 \gagood & 0.998 \gagood & 0.996 \gagood & 1.011 \gagood & 1.004 \gagood & 0.990 \gabad & 1.006 \gagood & 0.993 \gagood \\
Kiwi 23 XXL & 1.003 \gagood & 0.996 \gagood & 1.024 \gagood & 1.000 \gagood & 1.013 \gagood & 0.994 \gagood & 0.996 \gagood & 0.996 \gagood \\
\addlinespace[0.2cm]
MetricX23 LARGE & 1.000 \gagood & 1.000 \gagood & 0.998 \gabad & 0.998 \gabad & 0.999 \gagood & 0.999 \gagood & 1.000 \gagood & 0.992 \gabad \\
MetricX23 XL & 1.002 \gagood & 1.001 \gagood & 1.000 \gagood & 0.999 \gagood & 0.997 \gagood & 1.004 \gagood & 1.003 \gagood & 1.001 \gagood \\
\addlinespace[0.2cm]
x

Aggregated QE ratios for GT translations

In [33]:
# Per-model aggregate of the GT QE ratio (pivot_table's default aggregation is the mean).
results_df_cont_aggr = results_df_cont.pivot_table(
    values=['gt_ratio_mean'],
    index='model',
)
results_df_cont_aggr.round(3)

Unnamed: 0_level_0,gt_ratio_mean
model,Unnamed: 1_level_1
Kiwi 22,0.997
Kiwi 23 XL,1.0
Kiwi 23 XXL,1.003
MetricX23 LARGE,0.999
MetricX23 XL,1.001
xCOMET XL,0.99
xCOMET XXL,0.997


Table for prediction-based analysis

In [34]:
# Read the prediction-analysis CSV for the GT translations and keep only the
# error-rate columns used by the tables below.
res_overall_analysis_path = './results-copied/stats/nonambiguous-counterfactual/gt-translations/prediction-analysis/results.csv'
results_df = pd.read_csv(filepath_or_buffer=res_overall_analysis_path)
errors_df = results_df.loc[
    :,
    [
        "model",
        "lang",
        "gt_error_rate_total",
        "gt_error_rate_male",
        "gt_error_rate_fem",
        "gt_error_rate_ratio",
        "gt_stat_significance",
    ],
]

Aggregated results over languages

In [35]:
# Per-model aggregate of total error rate and error-rate ratio
# (pivot_table's default aggregation is the mean).
errors_df_aggr = errors_df.pivot_table(
    values=["gt_error_rate_total", 'gt_error_rate_ratio'],
    index='model',
)
errors_df_aggr.round(2)

Unnamed: 0_level_0,gt_error_rate_ratio,gt_error_rate_total
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Kiwi 22,1.34,0.13
Kiwi 23 XL,1.6,0.11
Kiwi 23 XXL,1.62,0.09
MetricX23 LARGE,1.17,0.34
MetricX23 XL,1.33,0.14
xCOMET XL,1.96,0.12
xCOMET XXL,1.23,0.1


In [36]:
def create_significance_table_Phi_ratiov2(df, value_col='error_rate_ratio'):
    r"""Build a LaTeX ``table*`` of per-language error-rate ratios with significance markers.

    One row is produced per model, in order of first appearance in ``df``, and
    one column per language in a fixed order. Each cell holds the ratio formatted
    to three decimals followed by ``\gabad`` when ``stat_significance`` is truthy
    and ``\gagood`` otherwise. A (model, language) pair that is absent from ``df``
    is rendered as ``!``.

    Args:
        df: DataFrame with the columns ``model``, ``lang``, ``stat_significance``
            and ``value_col``.
        value_col: Name of the column holding the ratio to print. Defaults to
            ``'error_rate_ratio'``.

    Returns:
        The complete LaTeX table as a single string.
    """
    # Extract unique models from the DataFrame; languages use a custom fixed order
    models = df['model'].unique()
    languages = ['de', 'es', 'fr', 'it', 'pt', 'ru', 'hi', 'ar']

    # Zero-based row positions after which extra vertical space is inserted.
    # Checking the position (instead of indexing models[2] / models[4]) keeps the
    # function working when fewer than five models are present.
    spacing_after = {2, 4}

    # Define the LaTeX table header
    table_header = r"""
    \begin{table*}
    \centering
    \small
    \begin{tabular}{l""" + "c" * len(languages) + r"""}
    \toprule
    \rowcolor{white}
    \textbf{Metrics} & \multicolumn{""" + str(len(languages)) + r"""}{c}{$\Phi(s_F,s_M)$} \\
    & """ + " & ".join(languages) + r""" \\
    \midrule
    """

    # Generate table rows for each model
    table_rows = ""
    for position, model in enumerate(models):
        # Get data for the current model, addressable by language code
        model_data = df[df['model'] == model].set_index('lang')

        # One cell per language: ratio followed by the significance macro
        cells = []
        for lang in languages:
            if lang in model_data.index:
                marker = r"\gabad" if model_data.loc[lang, 'stat_significance'] else r"\gagood"
                cells.append(f"{model_data.loc[lang, value_col]:.3f} " + marker)
            else:
                cells.append("!")
        row = model + " & " + " & ".join(cells) + r" \\" + "\n"

        # Add spacing after certain models (optional)
        if position in spacing_after:
            row += r"\addlinespace[0.2cm]" + "\n"

        table_rows += row

    # Define the LaTeX table footer
    # The caption names automatic (GT) translations because this table is built
    # from the gt-translations results, not the human references.
    # NOTE(review): the caption mentions \cmark, while the cells emit
    # \gabad / \gagood -- confirm the wording.
    # NOTE(review): the same \label is emitted by the other table builders in this
    # notebook; LaTeX reports duplicate labels if several tables share a document.
    table_footer = r"""
    \bottomrule
    \end{tabular}
    \caption{\textbf{MT-GenEval Counterfactual Results} on automatic translations (GT). 
    A green check mark (\cmark) indicates that metrics make statistically significantly (p<0.05) more errors on sources with feminine referents compared to their masculine counterparts. } 
    \label{tab:contextual_stat_tests}
    \end{table*}
    """

    # Combine header, rows, and footer
    latex_table = table_header + table_rows + table_footer

    return latex_table



# Select the columns the table needs and rename them to the generic names
# expected by the table builder, then print the LaTeX source.
gt_stat_sign_copy = (
    errors_df[['model', 'lang', 'gt_stat_significance', 'gt_error_rate_ratio']]
    .rename(
        columns={
            'gt_stat_significance': 'stat_significance',
            'gt_error_rate_ratio': 'error_rate_ratio',
        }
    )
)
stat_table_latex = create_significance_table_Phi_ratiov2(gt_stat_sign_copy)
print(stat_table_latex)


    \begin{table*}
    \centering
    \small
    \begin{tabular}{lcccccccc}
    \toprule
    \rowcolor{white}
    \textbf{Metrics} & \multicolumn{8}{c}{$\Phi(s_F,s_M)$} \\
    & de & es & fr & it & pt & ru & hi & ar \\
    \midrule
    Kiwi 22 & 0.571 \gagood & 1.269 \gabad & 2.000 \gabad & 1.292 \gabad & 1.100 \gabad & 2.333 \gabad & 0.526 \gagood & 1.667 \gabad \\
Kiwi 23 XL & 1.600 \gabad & 1.143 \gabad & 0.833 \gagood & 1.160 \gabad & 1.000 \gagood & 3.667 \gabad & 1.053 \gabad & 2.333 \gabad \\
Kiwi 23 XXL & 0.600 \gagood & 1.125 \gabad & 4.000 \gabad & 1.000 \gagood & 0.667 \gagood & 1.500 \gabad & 1.030 \gabad & 3.000 \gabad \\
\addlinespace[0.2cm]
MetricX23 LARGE & 1.086 \gabad & 1.216 \gabad & 1.115 \gabad & 1.208 \gabad & 0.973 \gagood & 1.290 \gabad & 0.966 \gagood & 1.474 \gabad \\
MetricX23 XL & 1.429 \gabad & 1.207 \gabad & 1.286 \gabad & 1.125 \gabad & 1.400 \gabad & 2.143 \gabad & 1.071 \gabad & 1.000 \gagood \\
\addlinespace[0.2cm]
xCOMET XL & 2.000 \gabad & 1.238 \ga