In [1]:
import pandas as pd
from tqdm import tqdm
import json
from scipy import stats
import itertools
import numpy as np
import sacrebleu

In [2]:
# https://huggingface.co/datasets/nikitam/ACES

'source': a string containing the text that needs to be translated     
'good-translation': possible translation of the source sentence     
'incorrect-translation': translation of the source sentence that contains an error or phenomenon of interest     
'reference': the gold standard translation     
'phenomena': the type of error or phenomena being studied in the example     
'langpair': the source language and the target language pair of the example     
Note that the good-translation may not be free of errors but it is a better translation than the incorrect-translation     

In [2]:
home_path = '/home/glushkovato/robustness'

path = home_path + '/robust_MT_evaluation/data/test/robustness/aces.jsonl'
aces = pd.read_json(path, lines=True)
aces.columns = ['source', 'good_translation', 'incorrect_translation', 'reference', 'phenomena', 'langpair']
aces.iloc[:3]

Unnamed: 0,source,good_translation,incorrect_translation,reference,phenomena,langpair
0,Proper nutritional practices alone cannot gene...,Las prácticas nutricionales adecuadas por sí s...,Las prácticas nutricionales adecuadas por sí s...,No es posible que las prácticas nutricionales ...,addition,en-es
1,"has geographic variations, where the age limi...","tiene variaciones geográficas, donde el límite...","La definición tiene variaciones geográficas, d...","varía geográficamente, donde el límite de eda...",addition,en-es
2,The U.N. also hopes to finalize a fund to help...,La ONU también espera finalizar un fondo para ...,La ONU también espera finalizar un fondo para ...,La ONU tiene la intención de completar un fond...,addition,en-es


In [3]:
with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_source.txt", "w") as f:
    for i in aces.source.tolist():
        print(i, file=f)

with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_good_translation.txt", "w") as f:
    for i in aces.good_translation.tolist():
        print(i, file=f)

with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_bad_translation.txt", "w") as f:
    for i in aces.incorrect_translation.tolist():
        print(i, file=f)

with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_reference.txt", "w") as f:
    for i in aces.reference.tolist():
        print(i, file=f)
        
with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_phenomena.txt", "w") as f:
    for i in aces.phenomena.tolist():
        print(i, file=f)
        
with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_lp.txt", "w") as f:
    for i in aces.langpair.tolist():
        print(i, file=f)

In [4]:
def read_json(path):
    f = open (path, "r")
    data = json.loads(f.read())
    k = list(data.keys())[0]

    src = []
    mt = []
    ref = []
    COMET_score = []

    for i in data[k]:
        src.append(i['src'])
        mt.append(i['mt'])
        ref.append(i['ref'])
        COMET_score.append(float(i['COMET']))

    f.close()
    
    df = pd.DataFrame(data=np.array([src, mt, ref, COMET_score]).T, 
                      columns=['src', 'mt', 'ref', 'comet'])
    
    return df

In [5]:
def compute_kendall_tau_like(comet_good, comet_bad):
    '''
    Compute correlation as Kendall Tau-like scores.
    '''
    concordant = (comet_good > comet_bad).sum()
    discordant = (comet_good <= comet_bad).sum()
    t = (concordant - discordant)/(concordant + discordant)
    return np.round(t, 3)

In [6]:
versions = ['24e1', '25e1', '29e1', '83e1']
# 24 comet
# 25 comet + aug
# 29 comet + sl-feats
# 83 comet + word-level 

# read bleu and chrf
with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_good_scores_bleu.txt", "r") as f:
    good_bleu = [float(i) for i in f]
    
with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_bad_scores_bleu.txt", "r") as f:
    bad_bleu = [float(i) for i in f]
    
with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_good_scores_chrf.txt", "r") as f:
    good_chrf = [float(i) for i in f]
    
with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_bad_scores_chrf.txt", "r") as f:
    bad_chrf = [float(i) for i in f]

In [7]:
aces['bleu_good'] = good_bleu
aces['bleu_bad'] = bad_bleu

aces['chrf_good'] = good_chrf
aces['chrf_bad'] = bad_chrf

In [8]:
path_good = home_path + '/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_good_output_v24e1.json'
path_bad = home_path + '/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_bad_output_v24e1.json'
good_tmp = read_json(path_good)
bad_tmp = read_json(path_bad)
aces['comet_good_v24e1'] = good_tmp.comet.tolist()
aces['comet_bad_v24e1'] = bad_tmp.comet.tolist()

path_good = home_path + '/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_good_output_v25e1.json'
path_bad = home_path + '/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_bad_output_v25e1.json'
good_tmp = read_json(path_good)
bad_tmp = read_json(path_bad)
aces['comet_good_v25e1'] = good_tmp.comet.tolist()
aces['comet_bad_v25e1'] = bad_tmp.comet.tolist()

path_good = home_path + '/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_good_output_v29e1.json'
path_bad = home_path + '/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_bad_output_v29e1.json'
good_tmp = read_json(path_good)
bad_tmp = read_json(path_bad)
aces['comet_good_v29e1'] = good_tmp.comet.tolist()
aces['comet_bad_v29e1'] = bad_tmp.comet.tolist()

path_good = home_path + '/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_good_output_v83e1.json'
path_bad = home_path + '/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_bad_output_v83e1.json'
good_tmp = read_json(path_good)
bad_tmp = read_json(path_bad)
aces['comet_good_v83e1'] = good_tmp.comet.tolist()
aces['comet_bad_v83e1'] = bad_tmp.comet.tolist()

In [9]:
def norm(x):
    mean = np.mean(x)
    std = np.std(x)
    x = (x - mean)/std
    return np.nan_to_num(x)

In [10]:
def compute_norm(x):
    mean = np.mean(x)
    std = np.std(x)
    return [mean, std]

def apply_norm(mean, std, x):
    xn = (np.array(x) - mean)/std
    return np.array(xn)


bleu_mean = 28.759837809513634
bleu_std = 18.47107097319373
chrf_mean = 58.992697061544284
chrf_std = 14.286372518233168
comet_mean = 0.46782439675103793
comet_std = 0.37521584265953595

In [11]:
aces.iloc[:3]

Unnamed: 0,source,good_translation,incorrect_translation,reference,phenomena,langpair,bleu_good,bleu_bad,chrf_good,chrf_bad,comet_good_v24e1,comet_bad_v24e1,comet_good_v25e1,comet_bad_v25e1,comet_good_v29e1,comet_bad_v29e1,comet_good_v83e1,comet_bad_v83e1
0,Proper nutritional practices alone cannot gene...,Las prácticas nutricionales adecuadas por sí s...,Las prácticas nutricionales adecuadas por sí s...,No es posible que las prácticas nutricionales ...,addition,en-es,27.991048,21.331701,67.615189,66.2577,0.8760790824890137,0.8629890084266663,0.7771773338317871,0.7535963654518127,0.9341334700584412,0.9143527150154114,0.6153641939163208,0.6119881272315979
1,"has geographic variations, where the age limi...","tiene variaciones geográficas, donde el límite...","La definición tiene variaciones geográficas, d...","varía geográficamente, donde el límite de eda...",addition,en-es,53.705729,49.124158,66.61339,65.662548,0.754996657371521,0.7610759735107422,0.7810795903205872,0.7246838212013245,0.8212878108024597,0.7988062500953674,0.6024278402328491,0.5628837943077087
2,The U.N. also hopes to finalize a fund to help...,La ONU también espera finalizar un fondo para ...,La ONU también espera finalizar un fondo para ...,La ONU tiene la intención de completar un fond...,addition,en-es,4.57719,4.625869,37.642203,37.675617,0.7052345275878906,0.7368098497390747,0.6307772994041443,0.6097211241722107,0.5013346076011658,0.5139285326004028,0.261641263961792,0.2939286231994629


In [12]:
# ensemble scores

# best weights based on kendall - computed over mqm 2021
a = 0.02512562814070352
b = 0.04522613065326633
c = 0.9296482412060302

aces['ensemble_good'] = np.mean([a*apply_norm(bleu_mean, bleu_std, aces.bleu_good.tolist()),
                                   b*apply_norm(chrf_mean, chrf_std, aces.chrf_good.tolist()), 
                                   c*apply_norm(comet_mean, comet_std, aces.comet_good_v24e1.astype('float').tolist())], axis=0)

aces['ensemble_bad'] = np.mean([a*apply_norm(bleu_mean, bleu_std, aces.bleu_bad.tolist()),
                                  b*apply_norm(chrf_mean, chrf_std, aces.chrf_bad.tolist()),
                                  c*apply_norm(comet_mean, comet_std, aces.comet_bad_v24e1.astype('float').tolist())], axis=0)

In [13]:
PHENOMENA_MAPPING = {'addition': 'addition',
                     'omission': 'omission',
                     'ambiguous-translation-wrong-discourse-connective-since-causal': 'mistranslation',
                     'ambiguous-translation-wrong-discourse-connective-since-temporal': 'mistranslation',
                     'ambiguous-translation-wrong-discourse-connective-while-contrast': 'mistranslation',
                     'ambiguous-translation-wrong-discourse-connective-while-temporal': 'mistranslation',
                     'ambiguous-translation-wrong-gender-female-anti': 'mistranslation',
                     'ambiguous-translation-wrong-gender-female-pro': 'mistranslation',
                     'ambiguous-translation-wrong-gender-male-anti': 'mistranslation',
                     'ambiguous-translation-wrong-gender-male-pro': 'mistranslation',
                     'ambiguous-translation-wrong-sense-frequent': 'mistranslation',
                     'ambiguous-translation-wrong-sense-infrequent': 'mistranslation',
                     'anaphoric_group_it-they:deletion': 'mistranslation',
                     'anaphoric_group_it-they:substitution': 'mistranslation',
                     'anaphoric_intra_non-subject_it:deletion': 'mistranslation',
                     'anaphoric_intra_non-subject_it:substitution': 'mistranslation',
                     'anaphoric_intra_subject_it:deletion': 'mistranslation',
                     'anaphoric_intra_subject_it:substitution': 'mistranslation',
                     'anaphoric_intra_they:deletion': 'mistranslation',
                     'anaphoric_intra_they:substitution': 'mistranslation',
                     'anaphoric_singular_they:deletion': 'mistranslation',
                     'anaphoric_singular_they:substitution': 'mistranslation',
                     'coreference-based-on-commonsense': 'mistranslation',
                     'hallucination-date-time': 'mistranslation',
                     'hallucination-named-entity-level-1': 'mistranslation',
                     'hallucination-named-entity-level-2': 'mistranslation',
                     'hallucination-named-entity-level-3': 'mistranslation',
                     'hallucination-number-level-1': 'mistranslation',
                     'hallucination-number-level-2': 'mistranslation',
                     'hallucination-number-level-3': 'mistranslation',
                     'hallucination-real-data-vs-ref-word': 'mistranslation',
                     'hallucination-real-data-vs-synonym': 'mistranslation',
                     'hallucination-unit-conversion-amount-matches-ref': 'mistranslation',
                     'hallucination-unit-conversion-unit-matches-ref': 'mistranslation',
                     'lexical-overlap': 'mistranslation',
                     'modal_verb:deletion': 'mistranslation',
                     'modal_verb:substitution': 'mistranslation',
                     'nonsense': 'mistranslation',
                     'ordering-mismatch': 'mistranslation',
                     'overly-literal-vs-correct-idiom': 'mistranslation',
                     'overly-literal-vs-explanation': 'mistranslation',
                     'overly-literal-vs-ref-word': 'mistranslation',
                     'overly-literal-vs-synonym': 'mistranslation',
                     'pleonastic_it:deletion': 'mistranslation',
                     'pleonastic_it:substitution': 'mistranslation',
                     'xnli-addition-contradiction': 'mistranslation',
                     'xnli-addition-neutral': 'mistranslation',
                     'xnli-omission-contradiction': 'mistranslation',
                     'xnli-omission-neutral': 'mistranslation',
                     'copy-source': 'untranslated',
                     'untranslated-vs-ref-word': 'untranslated',
                     'untranslated-vs-synonym': 'untranslated',
                     'do-not-translate': 'do not translate',
                     'hyponym-replacement': 'overtranslation',
                     'hypernym-replacement': 'undertranslation',
                     'antonym-replacement': 'real-world knowledge',
                     'commonsense-only-ref-ambiguous': 'real-world knowledge',
                     'commonsense-src-and-ref-ambiguous': 'real-world knowledge',
                     'real-world-knowledge-entailment': 'real-world knowledge',
                     'real-world-knowledge-hypernym-vs-distractor': 'real-world knowledge',
                     'real-world-knowledge-hypernym-vs-hyponym': 'real-world knowledge',
                     'real-world-knowledge-synonym-vs-antonym': 'real-world knowledge',
                     'similar-language-high': 'wrong language',
                     'similar-language-low': 'wrong language',
                     'punctuation:deletion_all': 'punctuation',
                     'punctuation:deletion_commas': 'punctuation',
                     'punctuation:deletion_quotes': 'punctuation',
                     'punctuation:statement-to-question': 'punctuation'
                    }

In [14]:
aces['phenomena_top'] = aces.phenomena.apply(lambda x: PHENOMENA_MAPPING[x])

In [15]:
aces[['phenomena', 'phenomena_top']]

Unnamed: 0,phenomena,phenomena_top
0,addition,addition
1,addition,addition
2,addition,addition
3,addition,addition
4,addition,addition
...,...,...
36471,xnli-omission-neutral,mistranslation
36472,xnli-omission-neutral,mistranslation
36473,xnli-omission-neutral,mistranslation
36474,xnli-omission-neutral,mistranslation


In [16]:
aces['phenomena_top'].unique()

array(['addition', 'mistranslation', 'omission', 'wrong language',
       'real-world knowledge', 'untranslated', 'undertranslation',
       'overtranslation', 'do not translate', 'punctuation'], dtype=object)

In [18]:
ppp = aces['phenomena_top'].unique()

bleu_ph_scores = []
chrf_ph_scores = []
comet_v24e1_ph_scores = []
ensemble_ph_scores = []
comet_v25e1_ph_scores = []
comet_v29e1_ph_scores = []
comet_v83e1_ph_scores = []

for ph in ppp:
    aces_ph = aces[aces.phenomena_top == ph]
    kt_bleu = compute_kendall_tau_like(aces_ph.bleu_good, aces_ph.bleu_bad)
    kt_chrf = compute_kendall_tau_like(aces_ph.chrf_good, aces_ph.chrf_bad)
    kt_comet_v24e1 = compute_kendall_tau_like(aces_ph.comet_good_v24e1, aces_ph.comet_bad_v24e1)
    kt_ensemble = compute_kendall_tau_like(aces_ph.ensemble_good, aces_ph.ensemble_bad)
    kt_comet_v25e1 = compute_kendall_tau_like(aces_ph.comet_good_v25e1, aces_ph.comet_bad_v25e1)
    kt_comet_v29e1 = compute_kendall_tau_like(aces_ph.comet_good_v29e1, aces_ph.comet_bad_v29e1)
    kt_comet_v83e1 = compute_kendall_tau_like(aces_ph.comet_good_v83e1, aces_ph.comet_bad_v83e1)
    
    bleu_ph_scores.append(kt_bleu)
    chrf_ph_scores.append(kt_chrf)
    comet_v24e1_ph_scores.append(kt_comet_v24e1)
    ensemble_ph_scores.append(kt_ensemble)
    comet_v25e1_ph_scores.append(kt_comet_v25e1)
    comet_v29e1_ph_scores.append(kt_comet_v29e1)
    comet_v83e1_ph_scores.append(kt_comet_v83e1)
    
    print(ph)
    print(kt_bleu, kt_chrf, kt_comet_v24e1, kt_ensemble, kt_comet_v25e1, kt_comet_v29e1, kt_comet_v83e1) 
    print()

addition
0.748 0.644 0.349 0.367 0.52 0.443 0.427

mistranslation
-0.296 0.027 0.186 0.216 0.255 0.148 0.189

omission
0.427 0.784 0.704 0.828 0.706 0.724 0.666

wrong language
0.659 0.693 0.071 0.052 0.159 0.185 0.087

real-world knowledge
-0.906 -0.307 0.195 0.176 0.202 0.109 0.162

untranslated
0.786 0.928 0.709 0.894 0.58 0.618 0.686

undertranslation
-0.856 -0.592 0.08 -0.044 0.2 -0.18 0.12

overtranslation
-0.838 -0.696 0.27 0.176 0.308 0.086 0.304

do not translate
0.58 0.96 0.88 0.9 0.78 0.9 0.84

punctuation
0.658 0.803 0.328 0.699 0.377 0.323 0.339



In [19]:
aces.to_csv(home_path + '/robust_MT_evaluation/data/test/robustness/aces_with_preds.csv', index=None)

In [20]:
bleu_ph_scores

[0.748, -0.296, 0.427, 0.659, -0.906, 0.786, -0.856, -0.838, 0.58, 0.658]

In [22]:
ppp_weights = [5, 5, 5, 1, 1, 1, 5, 5, 1, 0.1]

aces_bleu = np.round(np.sum(np.array(ppp_weights)*np.array(bleu_ph_scores)), 3)
aces_chrf = np.round(np.sum(np.array(ppp_weights)*np.array(chrf_ph_scores)), 3)
aces_comet_v24e1 = np.round(np.sum(np.array(ppp_weights)*np.array(comet_v24e1_ph_scores)), 3)
aces_ensemble = np.round(np.sum(np.array(ppp_weights)*np.array(ensemble_ph_scores)), 3)
aces_comet_v25e1 = np.round(np.sum(np.array(ppp_weights)*np.array(comet_v25e1_ph_scores)), 3)
aces_comet_v29e1 = np.round(np.sum(np.array(ppp_weights)*np.array(comet_v29e1_ph_scores)), 3)
aces_comet_v83e1 = np.round(np.sum(np.array(ppp_weights)*np.array(comet_v83e1_ph_scores)), 3)

aces_bleu, aces_chrf, aces_comet_v24e1, aces_ensemble, aces_comet_v25e1, aces_comet_v29e1, aces_comet_v83e1

(-2.89, 3.189, 9.833, 9.807, 11.704, 7.949, 10.339)

In [23]:
all_lps = aces.langpair.unique()

enxx_lps = [l for l in all_lps if 'en-' in l]
xxen_lps = [l for l in all_lps if '-en' in l]
xxyy_lps = [l for l in all_lps if 'en' not in l]

In [24]:
def flag(x):
    if x in enxx_lps:
        return 'enxx'
    elif x in xxen_lps:
        return 'xxen'
    else:
        return 'xxyy'

aces['lp_flag'] = aces.langpair.apply(lambda x: flag(x))

In [25]:
all_t_bleu = []
all_t_chrf = []
all_t_v24e1 = []
all_t_v25e1 = []
all_t_v29e1 = []
all_t_v83e1 = []
all_t_ensemble = []

flags = ['enxx', 'xxen', 'xxyy']
for lp in flags:
    print('lp: ', lp)
    df_lp = aces[aces.lp_flag == lp]
    t_ensemble = compute_kendall_tau_like(df_lp.ensemble_good, df_lp.ensemble_bad)
    t_bleu = compute_kendall_tau_like(df_lp.bleu_good, df_lp.bleu_bad)
    t_chrf = compute_kendall_tau_like(df_lp.chrf_good, df_lp.chrf_bad)
    t_v24e1 = compute_kendall_tau_like(df_lp.comet_good_v24e1, df_lp.comet_bad_v24e1)
    t_v25e1 = compute_kendall_tau_like(df_lp.comet_good_v25e1, df_lp.comet_bad_v25e1)
    t_v29e1 = compute_kendall_tau_like(df_lp.comet_good_v29e1, df_lp.comet_bad_v29e1)
    t_v83e1 = compute_kendall_tau_like(df_lp.comet_good_v83e1, df_lp.comet_bad_v83e1)
    
    print(np.round(np.mean(t_bleu), 3))
    print(np.round(np.mean(t_chrf), 3))
    print(np.round(np.mean(t_v24e1), 3))
    print(np.round(np.mean(t_ensemble), 3))
    print(np.round(np.mean(t_v25e1), 3))
    print(np.round(np.mean(t_v29e1), 3))
    print(np.round(np.mean(t_v83e1), 3))
    
    all_t_ensemble.append(t_ensemble)
    all_t_bleu.append(t_bleu)
    all_t_chrf.append(t_chrf)
    all_t_v24e1.append(t_v24e1)
    all_t_v25e1.append(t_v25e1)
    all_t_v29e1.append(t_v29e1)
    all_t_v83e1.append(t_v83e1)

print('avg bleu: ', np.round(np.mean(all_t_bleu), 3))
print('avg chrf: ', np.round(np.mean(all_t_chrf), 3))
print('avg v24e1: ', np.round(np.mean(all_t_v24e1), 3))
print('avg ensemble: ', np.round(np.mean(all_t_ensemble), 3))
print('avg v25e1: ', np.round(np.mean(all_t_v25e1), 3))
print('avg v29e1: ', np.round(np.mean(all_t_v29e1), 3))
print('avg v83e1: ', np.round(np.mean(all_t_v83e1), 3))

lp:  enxx
0.034
0.329
0.201
0.34
0.256
0.183
0.206
lp:  xxen
-0.37
-0.046
0.283
0.26
0.329
0.222
0.285
lp:  xxyy
-0.124
0.097
0.105
0.115
0.204
0.088
0.104
avg bleu:  -0.153
avg chrf:  0.127
avg v24e1:  0.196
avg ensemble:  0.238
avg v25e1:  0.263
avg v29e1:  0.164
avg v83e1:  0.198


## compute features: bleu and chrf

In [27]:
aces.iloc[:2]

Unnamed: 0,source,good_translation,incorrect_translation,reference,phenomena,langpair,bleu_good,bleu_bad,chrf_good,chrf_bad,...,comet_good_v25e1,comet_bad_v25e1,comet_good_v29e1,comet_bad_v29e1,comet_good_v83e1,comet_bad_v83e1,ensemble_good,ensemble_bad,phenomena_top,lp_flag
0,Proper nutritional practices alone cannot gene...,Las prácticas nutricionales adecuadas por sí s...,Las prácticas nutricionales adecuadas por sí s...,No es posible que las prácticas nutricionales ...,addition,en-es,27.991048,21.331701,67.615189,66.2577,...,0.7771773338317871,0.7535963654518127,0.9341334700584412,0.9143527150154114,0.6153641939163208,0.6119881272315979,0.345919,0.330656,addition,enxx
1,"has geographic variations, where the age limi...","tiene variaciones geográficas, donde el límite...","La definición tiene variaciones geográficas, d...","varía geográficamente, donde el límite de eda...",addition,en-es,53.705729,49.124158,66.61339,65.662548,...,0.7810795903205872,0.7246838212013245,0.8212878108024597,0.7988062500953674,0.6024278402328491,0.5628837943077087,0.256522,0.258462,addition,enxx


In [6]:
# compute bleu

aces_good_scores_bleu = []
refs_aces = aces.reference.tolist()
good_mts_aces = aces.good_translation.tolist()
bad_mts_aces = aces.incorrect_translation.tolist()

for i in tqdm(range(len(good_mts_aces))):
    aces_good_scores_bleu.append(sacrebleu.sentence_bleu(good_mts_aces[i], [refs_aces[i]]))
    
aces_good_scores_bleu = np.array([i.score for i in aces_good_scores_bleu])

with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_good_scores_bleu.txt", "w") as f:
    for i in aces_good_scores_bleu:
        print(i, file=f)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36476/36476 [00:07<00:00, 4760.18it/s]


In [7]:
aces_bad_scores_bleu = []
for i in tqdm(range(len(bad_mts_aces))):
    aces_bad_scores_bleu.append(sacrebleu.sentence_bleu(bad_mts_aces[i], [refs_aces[i]]))
    
aces_bad_scores_bleu = np.array([i.score for i in aces_bad_scores_bleu])

with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_bad_scores_bleu.txt", "w") as f:
    for i in aces_bad_scores_bleu:
        print(i, file=f)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36476/36476 [00:07<00:00, 5100.27it/s]


In [9]:
# compute chrf

aces_good_scores_chrf = []
refs_aces = aces.reference.tolist()
good_mts_aces = aces.good_translation.tolist()
bad_mts_aces = aces.incorrect_translation.tolist()

for i in tqdm(range(len(good_mts_aces))):
    aces_good_scores_chrf.append(sacrebleu.sentence_chrf(good_mts_aces[i], [refs_aces[i]]))
    
aces_good_scores_chrf = np.array([i.score for i in aces_good_scores_chrf])

with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_good_scores_chrf.txt", "w") as f:
    for i in aces_good_scores_chrf:
        print(i, file=f)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36476/36476 [00:11<00:00, 3289.71it/s]


In [10]:
aces_bad_scores_chrf = []
for i in tqdm(range(len(bad_mts_aces))):
    aces_bad_scores_chrf.append(sacrebleu.sentence_chrf(bad_mts_aces[i], [refs_aces[i]]))
    
aces_bad_scores_chrf = np.array([i.score for i in aces_bad_scores_chrf])

with open(home_path + "/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_bad_scores_chrf.txt", "w") as f:
    for i in aces_bad_scores_chrf:
        print(i, file=f)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36476/36476 [00:11<00:00, 3180.02it/s]


In [11]:
aces_good_feats = pd.DataFrame(data=np.array([aces_good_scores_bleu, aces_good_scores_chrf]).T, columns=['f1', 'f2'])
aces_good_feats.to_csv(home_path + '/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_good_features.csv', index=None)

aces_bad_feats = pd.DataFrame(data=np.array([aces_bad_scores_bleu, aces_bad_scores_chrf]).T, columns=['f1', 'f2'])
aces_bad_feats.to_csv(home_path + '/robust_MT_evaluation/data/test/robustness/aces_predictions/aces_bad_features.csv', index=None)

In [14]:
aces_good_feats.head()

Unnamed: 0,f1,f2
0,27.991048,67.615189
1,53.705729,66.61339
2,4.57719,37.642203
3,21.202635,46.3682
4,29.693398,71.963086
