In [1]:
import pandas as pd
from tqdm import tqdm
import json
from scipy import stats
import itertools
import numpy as np
import sacrebleu

'source': a string containing the text that needs to be translated     
'good-translation': possible translation of the source sentence     
'incorrect-translation': translation of the source sentence that contains an error or phenomenon of interest     
'reference': the gold standard translation     
'phenomena': the type of error or phenomenon being studied in the example     
'langpair': the source language and the target language pair of the example     
Note that the good-translation may not be free of errors, but it is a better translation than the incorrect-translation.     

In [2]:
# specify your home path
home_path = '/home/glushkovato/robustness'

# DEMETR challenge set inside the project checkout.
path = home_path + '/robust_MT_evaluation/data/test/robustness/demetr.csv'
demetr = pd.read_csv(path)

# Replace the CSV header with the short names used in the rest of the
# notebook. The assignment is positional, so the order below must match the
# column order of the file.
demetr.columns = [
    'src',
    'ref',
    'good_translation',
    'incorrect_translation',
    'severity',
    'phenomena',
    'langpair',
]

# Peek at the first three rows.
demetr.head(3)

Unnamed: 0,src,ref,good_translation,incorrect_translation,severity,phenomena,langpair
0,在食用受污染宠物食品后死亡的宠物的尿样中，均发现了氰尿酸和三聚氰胺。,Both cyanuric acid and melamine were found in ...,Cyanuric acid and melamine were both found in ...,Cyanuric acid and melamine were both in found ...,minor,two adjacent word swaps,chinese_simple
1,通过红外光谱 (FTIR) 进行比较后发现，这些结晶的成分与在受影响宠物的尿液中发现的结晶成...,The composition of these crystals matches thos...,A comparison by infrared spectroscopy (FTIR) r...,A comparison by infrared spectroscopy (FTIR) r...,minor,two adjacent word swaps,chinese_simple
2,我不知道你有没有意识到，从中美洲进口到这个国家的大部分货物都是免关税的。,"I don't know if you realize it or not, but mos...",I don't know if you realize that most of the g...,I don't know if you realize that most of the g...,minor,two adjacent word swaps,chinese_simple


In [3]:
# Keep only the rows labelled with 'critical' severity and list the distinct
# phenomena that occur among them.
is_critical = demetr['severity'] == 'critical'
critical = demetr[is_critical]
critical['phenomena'].unique()

array(['changing a word to its antonym (noun, adv, adj, verb)',
       'affirmative to negation and negation to affirmative', 'codemix',
       'remove noun which is not the sentence subject (head)',
       'removes content verb',
       'shuffled words keeping the sentence features (capitalization, punctuation)',
       'adding a word that does not break the grammaticality of the sentence but which affects the meaning in a significant way',
       'removes adj or adv', 'remove the head of the subject NP',
       'change of gender pronoun (e.g., "he" to "she")',
       'remove random named entity',
       'numbers changed in a reasonable range',
       'replace NE with a regard to the entity type (e.g., PER for PER)'],
      dtype=object)

In [4]:
# Replace embedded line breaks with spaces so that every perturbed translation
# fits on a single line when it is written out one-row-per-line later on.
demetr['incorrect_translation'] = demetr['incorrect_translation'].map(
    lambda text: text.replace('\n', ' ')
)

In [5]:
# (number of rows, number of columns) of the loaded DEMETR frame
demetr.shape

(30320, 7)

In [6]:
# Export each DEMETR column to its own text file, one row per line, all in the
# same row order as the frame.
# The encoding is set explicitly: the source sentences contain non-ASCII text
# (e.g. Chinese) and the platform default encoding may not be able to write it.
robustness_dir = home_path + "/robust_MT_evaluation/data/test/robustness/"

column_exports = [
    ("demetr_source.txt", demetr.src.tolist()),
    ("demetr_good_translation.txt", demetr.good_translation.tolist()),
    # A line break inside a perturbed translation would spill over several
    # lines of the output file, so it is replaced with a space.
    ("demetr_bad_translation.txt",
     [i.replace('\n', ' ') for i in demetr.incorrect_translation.tolist()]),
    ("demetr_reference.txt", demetr.ref.tolist()),
    ("demetr_phenomena.txt", demetr.phenomena.tolist()),
    ("demetr_lp.txt", demetr.langpair.tolist()),
    ("demetr_severity.txt", demetr.severity.tolist()),
]

for file_name, rows in column_exports:
    with open(robustness_dir + file_name, "w", encoding="utf-8") as f:
        for i in rows:
            print(i, file=f)

In [7]:
def read_json(path):
    """Load a JSON prediction file into a DataFrame.

    The file must hold a JSON object; the value under its first top-level key
    is read as a list of records, each providing the fields 'src', 'mt',
    'ref' and 'COMET'.

    Parameters
    ----------
    path : str
        Location of the JSON file.

    Returns
    -------
    pandas.DataFrame
        Columns 'src', 'mt', 'ref', 'comet', one row per record. 'comet' is a
        float column.

    Raises
    ------
    IndexError
        If the JSON object has no keys.
    KeyError
        If a record lacks one of the expected fields.
    ValueError
        If a 'COMET' value cannot be converted to float.
    """
    # The context manager closes the file even when parsing fails.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    k = list(data.keys())[0]
    records = data[k]

    # Build the frame column by column. Packing the text fields and the scores
    # into a single numpy array would turn the scores into strings, and
    # comparing those later is lexicographic instead of numeric.
    df = pd.DataFrame(
        {
            'src': [i['src'] for i in records],
            'mt': [i['mt'] for i in records],
            'ref': [i['ref'] for i in records],
            'comet': [float(i['COMET']) for i in records],
        },
        columns=['src', 'mt', 'ref', 'comet'],
    )

    return df

In [8]:
# def compute_kendall_tau_like(df):
#     '''
#     Compute correlation as Kendall Tau-like scores.
#     '''
#     concordant = (df.comet_good > df.comet_bad).sum()
#     discordant = (df.comet_good <= df.comet_bad).sum()
#     t = (concordant - discordant)/(concordant + discordant)
#     return np.round(t, 5)

In [9]:
def compute_acc(good_metric, bad_metric):
    """Percentage of pairs in which the good score strictly beats the bad one.

    Parameters
    ----------
    good_metric, bad_metric : array-like
        Equal-length sequences of scores (Series, arrays or lists). Entries
        are converted to float, so numeric strings are compared by value
        rather than lexicographically.

    Returns
    -------
    numpy.float64
        ``100 * mean(good > bad)`` rounded to two decimals; NaN for empty
        input.
    """
    good = np.asarray(good_metric, dtype=float)
    bad = np.asarray(bad_metric, dtype=float)
    if good.size == 0:
        # Nothing to compare: report NaN instead of dividing by zero.
        return np.float64(np.nan)
    acc = (good > bad).sum()/good.size
    return np.round(acc*100,2)

## compute accuracy

In [10]:
versions = ['24e1', '25e1', '29e1', '83e1']
# 24 comet
# 25 comet + aug
# 29 comet + sl-features bottleneck-64 
# 83 word-level sum


def _read_scores(file_name):
    """Read one float per line from a file in the demetr_predictions directory."""
    scores_path = (home_path
                   + "/robust_MT_evaluation/data/test/robustness/demetr_predictions/"
                   + file_name)
    with open(scores_path, "r") as f:
        return [float(i) for i in f]


# read bleu and chrf
# NOTE: these four files are written by the "compute features for DEMETR"
# cells near the end of this notebook; on a fresh checkout run those first.
good_bleu = _read_scores("demetr_good_scores_bleu.txt")
bad_bleu = _read_scores("demetr_bad_scores_bleu.txt")
good_chrf = _read_scores("demetr_good_scores_chrf.txt")
bad_chrf = _read_scores("demetr_bad_scores_chrf.txt")

In [11]:
# Attach the BLEU / chrF scores read above as new columns, for the good and
# the perturbed ("bad") translation of every row.
lexical_scores = {
    'bleu_good': good_bleu,
    'bleu_bad': bad_bleu,
    'chrf_good': good_chrf,
    'chrf_bad': bad_chrf,
}
for column_name, values in lexical_scores.items():
    demetr[column_name] = values

In [12]:
# Load the predictions of every model version listed in `versions` for the
# good and the perturbed ("bad") translations, and attach them as
# comet_good_v<version> / comet_bad_v<version> columns.
predictions_dir = home_path + '/robust_MT_evaluation/data/test/robustness/demetr_predictions/'

for version in versions:
    path_good = predictions_dir + 'demetr_good_output_v' + version + '.json'
    path_bad = predictions_dir + 'demetr_bad_output_v' + version + '.json'
    good_tmp = read_json(path_good)
    bad_tmp = read_json(path_bad)
    # Store the scores as floats: if they were kept as strings, the
    # good-vs-bad comparisons below would be lexicographic, not numeric.
    demetr['comet_good_v' + version] = good_tmp.comet.astype(float).tolist()
    demetr['comet_bad_v' + version] = bad_tmp.comet.astype(float).tolist()

In [13]:
def norm(x):
    """Standardise `x` with its own mean and (population) standard deviation.

    NaN / infinite results, e.g. from a zero standard deviation, are replaced
    by finite numbers via ``np.nan_to_num``.
    """
    centered = x - np.mean(x)
    standardised = centered / np.std(x)
    return np.nan_to_num(standardised)

In [14]:
def compute_norm(x):
    """Return ``[mean, std]`` of `x` as a two-element list (numpy defaults)."""
    return [np.mean(x), np.std(x)]

def apply_norm(mean, std, x):
    """Standardise `x` with externally supplied `mean` and `std`.

    `x` is converted to a numpy array first; the result is returned as a
    numpy array as well.
    """
    values = np.array(x)
    return np.array((values - mean) / std)


# Fixed normalisation statistics (mean / standard deviation) for BLEU, chrF and
# COMET. They are passed to apply_norm when the ensemble scores are built.
# NOTE(review): presumably produced with compute_norm on the MQM data the
# ensemble weights were tuned on — TODO confirm and record the provenance.
bleu_mean = 28.759837809513634
bleu_std = 18.47107097319373
chrf_mean = 58.992697061544284
chrf_std = 14.286372518233168
comet_mean = 0.46782439675103793
comet_std = 0.37521584265953595

# true_scores = all_mqm
# scores_bleu = apply_norm(bleu_mean, bleu_std, all_bleu)
# scores_chrf = apply_norm(chrf_mean, chrf_std, all_chrf)
# scores_comet = apply_norm(comet_mean, comet_std, all_comet)

In [15]:
# ensemble scores

# best weights based on kendall - computed over mqm 2021
a = 0.02512562814070352
b = 0.04522613065326633
c = 0.9296482412060302

# Each ensemble column is the element-wise mean of the three weighted,
# standardised scores (BLEU, chrF, COMET v24e1). Taking the mean rather than
# the sum only divides by a constant 3, which leaves good-vs-bad comparisons
# unaffected.
ensemble_inputs = [
    ('ensemble_good', 'bleu_good', 'chrf_good', 'comet_good_v24e1'),
    ('ensemble_bad', 'bleu_bad', 'chrf_bad', 'comet_bad_v24e1'),
]
for ensemble_col, bleu_col, chrf_col, comet_col in ensemble_inputs:
    weighted_terms = [
        a*apply_norm(bleu_mean, bleu_std, demetr[bleu_col].tolist()),
        b*apply_norm(chrf_mean, chrf_std, demetr[chrf_col].tolist()),
        c*apply_norm(comet_mean, comet_std, demetr[comet_col].astype('float').tolist()),
    ]
    demetr[ensemble_col] = np.mean(weighted_terms, axis=0)

# Ensemble accuracy per severity level ...
demetr_minor = demetr[demetr['severity'] == 'minor']
acc_ensemble_minor = compute_acc(demetr_minor['ensemble_good'], demetr_minor['ensemble_bad'])

demetr_major = demetr[demetr['severity'] == 'major']
acc_ensemble_major = compute_acc(demetr_major['ensemble_good'], demetr_major['ensemble_bad'])

demetr_critical = demetr[demetr['severity'] == 'critical']
acc_ensemble_critical = compute_acc(demetr_critical['ensemble_good'], demetr_critical['ensemble_bad'])

demetr_base = demetr[demetr['severity'] == 'base']
acc_ensemble_base = compute_acc(demetr_base['ensemble_good'], demetr_base['ensemble_bad'])

# ... and over the whole data set.
acc_ensemble_all = compute_acc(demetr['ensemble_good'], demetr['ensemble_bad'])

acc_ensemble_base, acc_ensemble_critical, acc_ensemble_major, acc_ensemble_minor, acc_ensemble_all

(100.0, 96.87, 92.91, 93.77, 95.14)

## compute accuracy

In [20]:
# bleu
# chrf
# 24 comet
# 25 comet + aug
# 29 sl bottleneck-64 
# 83 word-level sum

severities = ['minor', 'major', 'critical', 'base']

# Accuracy of every metric per severity level. The subset gets its own name
# (`demetr_sev`) so it does not overwrite `demetr_minor`, which holds the
# minor-severity rows from the ensemble cell.
for s in severities:
    demetr_sev = demetr[demetr.severity == s]
    acc_bleu = compute_acc(demetr_sev.bleu_good, demetr_sev.bleu_bad)
    acc_chrf = compute_acc(demetr_sev.chrf_good, demetr_sev.chrf_bad)
    acc_comet_good_v24e1 = compute_acc(demetr_sev.comet_good_v24e1, demetr_sev.comet_bad_v24e1)
    acc_comet_good_v25e1 = compute_acc(demetr_sev.comet_good_v25e1, demetr_sev.comet_bad_v25e1)
    acc_comet_good_v29e1 = compute_acc(demetr_sev.comet_good_v29e1, demetr_sev.comet_bad_v29e1)
    acc_comet_good_v83e1 = compute_acc(demetr_sev.comet_good_v83e1, demetr_sev.comet_bad_v83e1)

    # Printed columns: bleu, chrf, v24e1, v25e1, v29e1, v83e1
    print(s , ': ', acc_bleu, acc_chrf, acc_comet_good_v24e1, acc_comet_good_v25e1, 
          acc_comet_good_v29e1, acc_comet_good_v83e1)

minor :  72.6 80.83 92.18 92.06 94.64 96.36
major :  83.76 90.85 91.04 91.66 93.56 93.9
critical :  79.33 90.79 95.77 95.54 96.95 96.48
base :  100.0 100.0 99.3 98.6 99.3 99.2


In [21]:
# all acc
# Accuracy of every metric over the complete data set, in the order
# bleu, chrf, v24e1, v25e1, v29e1, v83e1.
score_column_pairs = [
    ('bleu_good', 'bleu_bad'),
    ('chrf_good', 'chrf_bad'),
    ('comet_good_v24e1', 'comet_bad_v24e1'),
    ('comet_good_v25e1', 'comet_bad_v25e1'),
    ('comet_good_v29e1', 'comet_bad_v29e1'),
    ('comet_good_v83e1', 'comet_bad_v83e1'),
]
(acc_bleu, acc_chrf, acc_comet_good_v24e1, acc_comet_good_v25e1,
 acc_comet_good_v29e1, acc_comet_good_v83e1) = [
    compute_acc(demetr[good_col], demetr[bad_col])
    for good_col, bad_col in score_column_pairs
]

acc_bleu, acc_chrf, acc_comet_good_v24e1, acc_comet_good_v25e1, acc_comet_good_v29e1, acc_comet_good_v83e1

(78.52, 87.16, 93.74, 93.65, 95.59, 96.2)

## compute kendall tau

In [23]:
def compute_kendall_tau_like(comet_good, comet_bad):
    '''
    Compute correlation as Kendall Tau-like scores.

    A pair is concordant when the good score is strictly greater than the bad
    score and discordant otherwise (ties count as discordant). The result is
    (concordant - discordant) / (concordant + discordant), rounded to three
    decimals.

    Both inputs are array-likes of equal length; entries are converted to
    float so that numeric strings are compared by value, not
    lexicographically. Returns NaN when there is no comparable pair (e.g.
    empty input).
    '''
    good = np.asarray(comet_good, dtype=float)
    bad = np.asarray(comet_bad, dtype=float)
    concordant = (good > bad).sum()
    discordant = (good <= bad).sum()
    total = concordant + discordant
    if total == 0:
        # No comparable pair: report NaN instead of dividing by zero.
        return np.float64(np.nan)
    t = (concordant - discordant)/total
    return np.round(t, 3)

In [27]:
# One accumulator per metric; each collects one tau-like score per language pair.
all_t_bleu, all_t_chrf = [], []
all_t_v24e1, all_t_v25e1, all_t_v29e1, all_t_v83e1 = [], [], [], []
all_t_ensemble = []

# (summary label, accumulator, good-score column, bad-score column),
# listed in the order in which the values are printed.
metric_table = [
    ('avg bleu: ', all_t_bleu, 'bleu_good', 'bleu_bad'),
    ('avg chrf: ', all_t_chrf, 'chrf_good', 'chrf_bad'),
    ('avg v24e1: ', all_t_v24e1, 'comet_good_v24e1', 'comet_bad_v24e1'),
    ('avg ensemble: ', all_t_ensemble, 'ensemble_good', 'ensemble_bad'),
    ('avg v25e1: ', all_t_v25e1, 'comet_good_v25e1', 'comet_bad_v25e1'),
    ('avg v29e1: ', all_t_v29e1, 'comet_good_v29e1', 'comet_bad_v29e1'),
    ('avg v83e1: ', all_t_v83e1, 'comet_good_v83e1', 'comet_bad_v83e1'),
]

lps = demetr.langpair.unique()
for lp in lps:
    print('lp: ', lp)
    df_lp = demetr[demetr.langpair == lp]

    for _, per_lp_scores, good_col, bad_col in metric_table:
        tau = compute_kendall_tau_like(df_lp[good_col], df_lp[bad_col])
        print(np.round(np.mean(tau), 3))
        per_lp_scores.append(tau)

# Average over language pairs, one line per metric.
for avg_label, per_lp_scores, _, _ in metric_table:
    print(avg_label, np.round(np.mean(per_lp_scores), 3))

lp:  chinese_simple
0.505
0.684
0.818
0.855
0.817
0.866
0.872
lp:  german
0.655
0.802
0.909
0.926
0.917
0.942
0.957
lp:  hindi
0.616
0.768
0.9
0.92
0.925
0.929
0.945
lp:  japanese
0.521
0.722
0.85
0.883
0.83
0.907
0.891
lp:  polish
0.533
0.703
0.818
0.88
0.775
0.863
0.877
lp:  russian
0.552
0.724
0.898
0.91
0.894
0.95
0.949
lp:  czech
0.541
0.755
0.875
0.917
0.863
0.87
0.92
lp:  french
0.664
0.794
0.892
0.915
0.926
0.945
0.951
lp:  spanish
0.516
0.704
0.877
0.899
0.877
0.912
0.935
lp:  italian
0.601
0.774
0.912
0.924
0.906
0.936
0.945
avg bleu:  0.57
avg chrf:  0.743
avg v24e1:  0.875
avg ensemble:  0.903
avg v25e1:  0.873
avg v29e1:  0.912
avg v83e1:  0.924


In [28]:
# Persist the frame together with all score columns added above.
preds_csv_path = home_path + '/robust_MT_evaluation/data/test/robustness/demetr_with_preds.csv'
demetr.to_csv(preds_csv_path, index=None)

## compute features for DEMETR: bleu and chrf

In [30]:
# Show the first row, including the score columns added above.
demetr.head(1)

Unnamed: 0,src,ref,good_translation,incorrect_translation,severity,phenomena,langpair,bleu_good,bleu_bad,chrf_good,...,comet_good_v24e1,comet_bad_v24e1,comet_good_v25e1,comet_bad_v25e1,comet_good_v29e1,comet_bad_v29e1,comet_good_v83e1,comet_bad_v83e1,ensemble_good,ensemble_bad
0,在食用受污染宠物食品后死亡的宠物的尿样中，均发现了氰尿酸和三聚氰胺。,Both cyanuric acid and melamine were found in ...,Cyanuric acid and melamine were both found in ...,Cyanuric acid and melamine were both in found ...,minor,two adjacent word swaps,chinese_simple,39.035944,30.143353,73.376263,...,0.8044230341911316,0.7144668102264404,0.964411199092865,0.8717182278633118,0.9330844283103944,0.6408037543296814,0.5853949189186096,0.404782623052597,0.297827,0.215072


In [161]:
# compute bleu

refs_demetr = demetr.ref.tolist()
good_mts_demetr = demetr.good_translation.tolist()
bad_mts_demetr = demetr.incorrect_translation.tolist()

# Sentence-level BLEU of every good translation against its reference; only
# the numeric .score of each sacrebleu result is kept.
demetr_good_scores_bleu = np.array([
    sacrebleu.sentence_bleu(good_mts_demetr[idx], [refs_demetr[idx]]).score
    for idx in tqdm(range(len(good_mts_demetr)))
])

# One score per line, in row order.
bleu_good_path = home_path + "/robust_MT_evaluation/data/test/robustness/demetr_predictions/demetr_good_scores_bleu.txt"
with open(bleu_good_path, "w") as f:
    for score in demetr_good_scores_bleu:
        print(score, file=f)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 30320/30320 [00:07<00:00, 4041.17it/s]


In [162]:
# Sentence-level BLEU of every perturbed translation against its reference;
# only the numeric .score of each sacrebleu result is kept.
demetr_bad_scores_bleu = np.array([
    sacrebleu.sentence_bleu(bad_mts_demetr[idx], [refs_demetr[idx]]).score
    for idx in tqdm(range(len(bad_mts_demetr)))
])

# One score per line, in row order.
bleu_bad_path = home_path + "/robust_MT_evaluation/data/test/robustness/demetr_predictions/demetr_bad_scores_bleu.txt"
with open(bleu_bad_path, "w") as f:
    for score in demetr_bad_scores_bleu:
        print(score, file=f)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 30320/30320 [00:06<00:00, 4655.32it/s]


In [163]:
# compute chrf

demetr_good_scores_chrf = []
refs_demetr = demetr.ref.tolist()
good_mts_demetr = demetr.good_translation.tolist()
bad_mts_demetr = demetr.incorrect_translation.tolist()

for i in tqdm(range(len(good_mts_demetr))):
    demetr_good_scores_chrf.append(sacrebleu.sentence_chrf(good_mts_demetr[i], [refs_demetr[i]]))
    
demetr_good_scores_chrf = np.array([i.score for i in demetr_good_scores_chrf])

with open(home_path + "/robust_MT_evaluation/data/test/robustness/demetr_predictions/demetr_good_scores_chrf.txt", "w") as f:
    for i in demetr_good_scores_chrf:
        print(i, file=f)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 30320/30320 [00:10<00:00, 2848.38it/s]


In [164]:
demetr_bad_scores_chrf = []
for i in tqdm(range(len(bad_mts_demetr))):
    demetr_bad_scores_chrf.append(sacrebleu.sentence_chrf(bad_mts_demetr[i], [refs_demetr[i]]))
    
demetr_bad_scores_chrf = np.array([i.score for i in demetr_bad_scores_chrf])

with open(home_path + "/robust_MT_evaluation/data/test/robustness/demetr_predictions/demetr_bad_scores_chrf.txt", "w") as f:
    for i in demetr_bad_scores_chrf:
        print(i, file=f)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 30320/30320 [00:10<00:00, 2959.39it/s]


In [167]:
# Two-column feature tables (f1 = BLEU, f2 = chrF) for the good and the
# perturbed translations, written without header and without index.
features_dir = home_path + '/robust_MT_evaluation/data/test/robustness/demetr_predictions/'

demetr_good_feats = pd.DataFrame({'f1': demetr_good_scores_bleu, 'f2': demetr_good_scores_chrf})
demetr_good_feats.to_csv(features_dir + 'demetr_good_features.csv', index=None, header=None)

demetr_bad_feats = pd.DataFrame({'f1': demetr_bad_scores_bleu, 'f2': demetr_bad_scores_chrf})
demetr_bad_feats.to_csv(features_dir + 'demetr_bad_features.csv', index=None, header=None)

In [168]:
# First rows of the good-translation feature table (f1, f2)
demetr_good_feats.head()

Unnamed: 0,f1,f2
0,39.035944,73.376263
1,56.427617,84.359787
2,40.09658,71.701801
3,47.58104,71.959875
4,5.868917,55.847598
