## Evaluation on MQM 2022

In [1]:
import pandas as pd
from tqdm import tqdm
import json
from scipy import stats
import itertools
import numpy as np
import sacrebleu

In [2]:
def read_json(path):
    """Load a prediction JSON file into a DataFrame.

    The file must contain a JSON object. Only the list stored under its first
    top-level key is read; every element of that list must provide the keys
    ``'src'``, ``'mt'``, ``'ref'`` and ``'COMET'``.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``src``, ``mt``, ``ref`` and
        ``comet``. ``comet`` holds the ``'COMET'`` values converted with
        ``float`` and is stored as a float column.

    Raises:
        IndexError: If the JSON object has no keys.
        KeyError: If an element lacks one of the expected keys.
    """
    # JSON text is UTF-8 by specification; the context manager guarantees the
    # handle is closed even when parsing fails.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Only the first top-level key is consulted.
    k = list(data.keys())[0]
    records = data[k]

    # Build the frame column by column so the scores stay numeric instead of
    # being coerced to strings by a mixed-type NumPy array.
    return pd.DataFrame(
        {
            'src': [rec['src'] for rec in records],
            'mt': [rec['mt'] for rec in records],
            'ref': [rec['ref'] for rec in records],
            'comet': np.array([float(rec['COMET']) for rec in records], dtype=float),
        },
        columns=['src', 'mt', 'ref', 'comet'],
    )

## en-de

In [49]:
# Root directory that contains the robust_MT_evaluation checkout;
# adjust this to your own machine.
home_path = '/home/glushkovato/robustness'

# Load the en-de MQM 2022 test data, one CSV per domain.
conversation, ecommerce, news, social = (
    pd.read_csv(home_path + '/robust_MT_evaluation/data/test/mqm2022/en-de/' + csv_name)
    for csv_name in ('conversation.csv', 'ecommerce.csv', 'news.csv', 'social.csv')
)

In [50]:
domain_names = ['conversation', 'ecommerce', 'news', 'social']
domain_dfs = [conversation, ecommerce, news, social]

# For every domain, dump the src / mt / ref / score columns to separate
# text files with one value per line.
for name, domain in zip(domain_names, domain_dfs):
    print(name)

    for column in ('src', 'mt', 'ref', 'score'):
        out_path = home_path + "/robust_MT_evaluation/data/test/mqm2022/en-de/en-de_" + name + "_" + column + ".txt"
        with open(out_path, "w") as f:
            for value in domain[column].tolist():
                print(value, file=f)

conversation
ecommerce
news
social


In [5]:
def norm(x):
    """Standardise ``x`` with its own mean and standard deviation.

    The result is passed through ``np.nan_to_num``, so NaN entries (for
    example from a zero standard deviation) are replaced by 0 and infinities
    by large finite numbers.
    """
    centered = x - np.mean(x)
    scaled = centered / np.std(x)
    return np.nan_to_num(scaled)

In [6]:
def compute_norm(x):
    """Return ``[mean, std]`` of ``x`` as computed by ``np.mean`` and ``np.std``."""
    return [np.mean(x), np.std(x)]

def apply_norm(mean, std, x):
    """Standardise ``x`` with an externally supplied ``mean`` and ``std``.

    ``x`` is converted to a NumPy array first; the result is returned as a
    NumPy array.
    """
    values = np.array(x)
    return np.array((values - mean) / std)


# Fixed mean / standard deviation for the BLEU, chrF and COMET scores.
# They are passed to apply_norm() when the weighted ensemble is built in the
# ensemble cells of this notebook.
# NOTE(review): the data these statistics were computed on is not shown here -
# presumably obtained with compute_norm() on a separate set; confirm the source.
bleu_mean = 28.759837809513634
bleu_std = 18.47107097319373
chrf_mean = 58.992697061544284
chrf_std = 14.286372518233168
comet_mean = 0.46782439675103793
comet_std = 0.37521584265953595

In [7]:
def compute_correlations(df, metric_col='comet', gold_col='mqm'):
    """Correlate a metric column with a gold-score column.

    Args:
        df: DataFrame holding both columns.
        metric_col: Name of the column with the metric scores
            (defaults to ``'comet'``).
        gold_col: Name of the column with the reference scores
            (defaults to ``'mqm'``).

    Returns:
        A tuple ``(pearson, spearman, kendall)`` with the three correlation
        coefficients rounded to three decimals; p-values are discarded.
    """
    metric = df[metric_col]
    gold = df[gold_col]
    # Each scipy call yields (statistic, p-value); only the statistic is kept.
    pearson = np.round(stats.pearsonr(metric, gold), 3)
    spearman = np.round(stats.spearmanr(metric, gold), 3)
    kendall = np.round(stats.kendalltau(metric, gold), 3)
    return pearson[0], spearman[0], kendall[0]

In [8]:
def compute_correlations2(df, metric_col='metric', gold_col='mqm'):
    """Correlate a metric column with a gold-score column.

    Args:
        df: DataFrame holding both columns.
        metric_col: Name of the column with the metric scores
            (defaults to ``'metric'``).
        gold_col: Name of the column with the reference scores
            (defaults to ``'mqm'``).

    Returns:
        A tuple ``(pearson, spearman, kendall)`` with the three correlation
        coefficients rounded to three decimals; p-values are discarded.
    """
    metric = df[metric_col]
    gold = df[gold_col]
    # Each scipy call yields (statistic, p-value); only the statistic is kept.
    pearson = np.round(stats.pearsonr(metric, gold), 3)
    spearman = np.round(stats.spearmanr(metric, gold), 3)
    kendall = np.round(stats.kendalltau(metric, gold), 3)
    return pearson[0], spearman[0], kendall[0]

In [9]:
# Identifiers of the evaluated model versions. Each one is spliced into the
# prediction file name ('<domain>_output_v<version>e1.json') when the
# per-model correlations are computed.
versions = ['24', '25', '29', '83']
# 24: COMET
# 25: COMET + aug
# 29: COMET + SL-feats
# 83: COMET + WL-tags

In [51]:
# Correlation of the chrF scores with the MQM scores, per en-de domain.
pearsons = []
spearmans = []
kendalls = []

for name, domain in zip(domain_names, domain_dfs):
    print(name)

    # One chrF score per line, in the same order as the rows of the domain CSV.
    path = home_path + "/robust_MT_evaluation/data/test/mqm2022/en-de/en-de_" + name + "_chrf_scores.txt"
    with open(path, "r") as f:
        chrf_scores = [float(line) for line in f]

    # Work on an explicit copy: adding columns to a bare column selection
    # raises pandas' SettingWithCopyWarning.
    domain_output = domain[['src', 'mt', 'ref', 'score']].copy()
    domain_output['metric'] = chrf_scores
    domain_output['mqm'] = domain_output['score'].astype(float)

    p, s, k = compute_correlations2(domain_output)
    pearsons.append(p)
    spearmans.append(s)
    kendalls.append(k)
    print(p, s, k)

conversation
0.285 0.337 0.257
ecommerce
0.222 0.278 0.212
news
0.26 0.273 0.202
social
0.22 0.222 0.168


In [53]:
# Correlation of every model version's predictions with MQM, per en-de domain.
pearsons, spearmans, kendalls = [], [], []

for v in versions:
    print('model: ', v)
    for name, domain in zip(domain_names, domain_dfs):
        print(name)

        path = home_path + '/robust_MT_evaluation/data/test/mqm2022/en-de/predictions/' + name + '_output_v' + v + 'e1.json'
        domain_output = read_json(path)
        # Gold scores come from the domain CSV, predictions from the JSON file.
        domain_output['mqm'] = domain['score'].astype(float).tolist()
        domain_output['comet'] = domain_output['comet'].astype(float)

        p, s, k = compute_correlations(domain_output)
        pearsons.append(p)
        spearmans.append(s)
        kendalls.append(k)
        print(p, s, k)
    print()

model:  24
conversation
0.371 0.401 0.308
ecommerce
0.376 0.421 0.326
news
0.522 0.478 0.361
social
0.367 0.389 0.297

model:  25
conversation
0.378 0.385 0.296
ecommerce
0.38 0.403 0.311
news
0.492 0.438 0.33
social
0.375 0.361 0.276

model:  29
conversation
0.379 0.404 0.31
ecommerce
0.383 0.416 0.322
news
0.506 0.471 0.355
social
0.382 0.386 0.294

model:  83
conversation
0.4 0.409 0.314
ecommerce
0.341 0.417 0.322
news
0.526 0.486 0.369
social
0.351 0.384 0.293



In [36]:
# Weighted ensemble of normalised BLEU, chrF and COMET scores, en-de.
pearsons = []
spearmans = []
kendalls = []
v = '24' # baseline COMET

# Ensemble weights for BLEU (a), chrF (b) and COMET (c).
# best weights based on kendall - computed over mqm 2021
a = 0.02512562814070352
b = 0.04522613065326633
c = 0.9296482412060302

for name, domain in zip(domain_names, domain_dfs):
    print(name)

    # File names follow the '<lp>_<domain>_<metric>_scores.txt' pattern used by
    # the other cells; the '_' separator after the domain name is required.
    path_bleu = home_path + "/robust_MT_evaluation/data/test/mqm2022/en-de/en-de_" + name + "_bleu_scores.txt"
    with open(path_bleu, "r") as f:
        bleu_scores = [float(line) for line in f]

    path_chrf = home_path + "/robust_MT_evaluation/data/test/mqm2022/en-de/en-de_" + name + "_chrf_scores.txt"
    with open(path_chrf, "r") as f:
        chrf_scores = [float(line) for line in f]

    # Consistent quoting: the literal previously opened with '"' and closed
    # with "'", which is a syntax error.
    path = home_path + '/robust_MT_evaluation/data/test/mqm2022/en-de/predictions/' + name + '_output_v' + v + 'e1.json'
    domain_output = read_json(path)
    domain_output['mqm'] = domain.score.astype(float).tolist()
    domain_output.comet = domain_output.comet.astype(float)
    domain_output['bleu'] = bleu_scores
    domain_output['chrf'] = chrf_scores

    # np.mean over the three weighted terms is the weighted sum divided by 3;
    # the constant factor does not change any of the correlations below.
    domain_output['ensemble_norm_w'] = np.mean([a*apply_norm(bleu_mean, bleu_std, domain_output['bleu'].tolist()),
                                                b*apply_norm(chrf_mean, chrf_std, domain_output['chrf'].tolist()),
                                                c*apply_norm(comet_mean, comet_std, domain_output['comet'].tolist())], axis=0)

    # Each result is (statistic, p-value) rounded to 3 decimals; print the statistic.
    p = np.round(stats.pearsonr(domain_output.ensemble_norm_w, domain_output.mqm), 3)
    s = np.round(stats.spearmanr(domain_output.ensemble_norm_w, domain_output.mqm), 3)
    k = np.round(stats.kendalltau(domain_output.ensemble_norm_w, domain_output.mqm), 3)
    print(p[0], s[0], k[0])

conversation
0.376 0.403 0.309
ecommerce
0.373 0.411 0.318
news
0.521 0.472 0.356
social
0.367 0.383 0.292


## en-ru

In [56]:
# Load the en-ru MQM 2022 test data, one CSV per domain.
conversation, ecommerce, news, social = (
    pd.read_csv(home_path + '/robust_MT_evaluation/data/test/mqm2022/en-ru/' + csv_name)
    for csv_name in ('conversation.csv', 'ecommerce.csv', 'news.csv', 'social.csv')
)

In [57]:
domain_names = ['conversation', 'ecommerce', 'news', 'social']
domain_dfs = [conversation, ecommerce, news, social]

# For every domain, dump the src / mt / ref / score columns to separate
# text files with one value per line.
# The files are written below home_path, into the same en-ru directory the
# later cells read the '<lp>_<domain>_*_scores.txt' files from, instead of a
# hard-coded absolute path in a different directory tree.
for name, domain in zip(domain_names, domain_dfs):
    print(name)

    for column in ('src', 'mt', 'ref', 'score'):
        out_path = home_path + "/robust_MT_evaluation/data/test/mqm2022/en-ru/en-ru_" + name + "_" + column + ".txt"
        with open(out_path, "w") as f:
            for value in domain[column].tolist():
                print(value, file=f)

conversation
ecommerce
news
social


In [59]:
# Correlation of the BLEU scores with the MQM scores, per en-ru domain.
pearsons = []
spearmans = []
kendalls = []

for name, domain in zip(domain_names, domain_dfs):
    print(name)

    # One BLEU score per line, in the same order as the rows of the domain CSV.
    path = home_path + "/robust_MT_evaluation/data/test/mqm2022/en-ru/en-ru_" + name + "_bleu_scores.txt"
    with open(path, "r") as f:
        bleu_scores = [float(line) for line in f]

    # Work on an explicit copy: adding columns to a bare column selection
    # raises pandas' SettingWithCopyWarning.
    domain_output = domain[['src', 'mt', 'ref', 'score']].copy()
    domain_output['metric'] = bleu_scores
    domain_output['mqm'] = domain_output['score'].astype(float)

    p, s, k = compute_correlations2(domain_output)
    pearsons.append(p)
    spearmans.append(s)
    kendalls.append(k)
    print(p, s, k)

conversation
0.155 0.183 0.14
ecommerce
0.249 0.276 0.202
news
0.169 0.171 0.125
social
0.213 0.212 0.152


In [60]:
# Correlation of the chrF scores with the MQM scores, per en-ru domain.
pearsons = []
spearmans = []
kendalls = []

for name, domain in zip(domain_names, domain_dfs):
    print(name)

    # One chrF score per line, in the same order as the rows of the domain CSV.
    path = home_path + "/robust_MT_evaluation/data/test/mqm2022/en-ru/en-ru_" + name + "_chrf_scores.txt"
    with open(path, "r") as f:
        chrf_scores = [float(line) for line in f]

    # Work on an explicit copy: adding columns to a bare column selection
    # raises pandas' SettingWithCopyWarning.
    domain_output = domain[['src', 'mt', 'ref', 'score']].copy()
    domain_output['metric'] = chrf_scores
    domain_output['mqm'] = domain_output['score'].astype(float)

    p, s, k = compute_correlations2(domain_output)
    pearsons.append(p)
    spearmans.append(s)
    kendalls.append(k)
    print(p, s, k)

conversation
0.185 0.23 0.175
ecommerce
0.287 0.303 0.221
news
0.23 0.224 0.164
social
0.143 0.186 0.132


In [61]:
# Correlation of every model version's predictions with MQM, per en-ru domain.
pearsons, spearmans, kendalls = [], [], []

for v in versions:
    print('model: ', v)
    for name, domain in zip(domain_names, domain_dfs):
        print(name)

        path = home_path + '/robust_MT_evaluation/data/test/mqm2022/en-ru/predictions/' + name + '_output_v' + v + 'e1.json'
        domain_output = read_json(path)
        # Gold scores come from the domain CSV, predictions from the JSON file.
        domain_output['mqm'] = domain['score'].astype(float).tolist()
        domain_output['comet'] = domain_output['comet'].astype(float)

        p, s, k = compute_correlations(domain_output)
        pearsons.append(p)
        spearmans.append(s)
        kendalls.append(k)
        print(p, s, k)
    print()

model:  24
conversation
0.372 0.399 0.305
ecommerce
0.488 0.502 0.372
news
0.469 0.499 0.373
social
0.324 0.425 0.305

model:  25
conversation
0.418 0.427 0.328
ecommerce
0.51 0.514 0.382
news
0.464 0.49 0.366
social
0.371 0.455 0.33

model:  29
conversation
0.35 0.389 0.298
ecommerce
0.507 0.499 0.369
news
0.477 0.514 0.384
social
0.343 0.46 0.332

model:  83
conversation
0.4 0.428 0.328
ecommerce
0.481 0.528 0.391
news
0.448 0.495 0.37
social
0.385 0.483 0.349



In [63]:
# Weighted ensemble of normalised BLEU, chrF and COMET scores, en-ru.
pearsons, spearmans, kendalls = [], [], []
v = '24' # baseline model trained on 1720 original data

# Ensemble weights for BLEU (a), chrF (b) and COMET (c).
# best weights based on kendall - computed over mqm 2021
a = 0.02512562814070352
b = 0.04522613065326633
c = 0.9296482412060302

# Alternative weight sets, kept for reference (inactive):
# best weights based on pearson - computed over mqm 2021 (!)
#     a = 0.07035175879396985
#     b = 0.0
#     c = 0.9296482412060302
# best weights based on spearman - computed over mqm 2021
#     a = 0.01507537688442211
#     b = 0.05527638190954774
#     c = 0.9296482412060302

for name, domain in zip(domain_names, domain_dfs):
    print(name)

    path_bleu = home_path + "/robust_MT_evaluation/data/test/mqm2022/en-ru/en-ru_" + name + "_bleu_scores.txt"
    with open(path_bleu, "r") as f:
        bleu_scores = [float(line) for line in f]

    path_chrf = home_path + "/robust_MT_evaluation/data/test/mqm2022/en-ru/en-ru_" + name + "_chrf_scores.txt"
    with open(path_chrf, "r") as f:
        chrf_scores = [float(line) for line in f]

    path = home_path + '/robust_MT_evaluation/data/test/mqm2022/en-ru/predictions/' + name + '_output_v' + v + 'e1.json'
    domain_output = read_json(path)
    domain_output['mqm'] = domain['score'].astype(float).tolist()
    domain_output['comet'] = domain_output['comet'].astype(float)
    domain_output['bleu'] = bleu_scores
    domain_output['chrf'] = chrf_scores

    # Element-wise mean of the three weighted, normalised score vectors.
    weighted_terms = [
        a * apply_norm(bleu_mean, bleu_std, domain_output['bleu'].tolist()),
        b * apply_norm(chrf_mean, chrf_std, domain_output['chrf'].tolist()),
        c * apply_norm(comet_mean, comet_std, domain_output['comet'].tolist()),
    ]
    domain_output['ensemble_norm_w'] = np.mean(weighted_terms, axis=0)

    # Each result is (statistic, p-value) rounded to 3 decimals; print the statistic.
    ensemble = domain_output['ensemble_norm_w']
    gold = domain_output['mqm']
    p, s, k = (
        np.round(corr(ensemble, gold), 3)
        for corr in (stats.pearsonr, stats.spearmanr, stats.kendalltau)
    )
    print(p[0], s[0], k[0])

conversation
0.368 0.398 0.304
ecommerce
0.487 0.499 0.369
news
0.466 0.491 0.366
social
0.323 0.418 0.3


## zh-en

In [64]:
# Load the zh-en MQM 2022 test data, one CSV per domain.
conversation, ecommerce, news, social = (
    pd.read_csv(home_path + '/robust_MT_evaluation/data/test/mqm2022/zh-en/' + csv_name)
    for csv_name in ('conversation.csv', 'ecommerce.csv', 'news.csv', 'social.csv')
)
# Rows with missing values are kept on purpose (no social.dropna(inplace=True)).

In [65]:
# Replace missing machine translations in the social domain with a single space.
social['mt'] = social['mt'].fillna(' ')

In [67]:
domain_names = ['conversation', 'ecommerce', 'news', 'social']
domain_dfs = [conversation, ecommerce, news, social]

# For every domain, dump the src / mt / ref / score columns to separate
# text files with one value per line.
for name, domain in zip(domain_names, domain_dfs):
    print(name)

    for column in ('src', 'mt', 'ref', 'score'):
        out_path = home_path + "/robust_MT_evaluation/data/test/mqm2022/zh-en/zh-en_" + name + "_" + column + ".txt"
        with open(out_path, "w") as f:
            for value in domain[column].tolist():
                print(value, file=f)

conversation
ecommerce
news
social


In [68]:
# Correlation of the BLEU scores with the MQM scores, per zh-en domain.
pearsons = []
spearmans = []
kendalls = []

for name, domain in zip(domain_names, domain_dfs):
    print(name)

    # One BLEU score per line, in the same order as the rows of the domain CSV.
    path = home_path + "/robust_MT_evaluation/data/test/mqm2022/zh-en/zh-en_" + name + "_bleu_scores.txt"
    with open(path, "r") as f:
        bleu_scores = [float(line) for line in f]

    # Work on an explicit copy: adding columns to a bare column selection
    # raises pandas' SettingWithCopyWarning.
    domain_output = domain[['src', 'mt', 'ref', 'score']].copy()
    domain_output['metric'] = bleu_scores
    domain_output['mqm'] = domain_output['score'].astype(float)

    p, s, k = compute_correlations2(domain_output)
    pearsons.append(p)
    spearmans.append(s)
    kendalls.append(k)
    print(p, s, k)

conversation
0.16 0.166 0.125
ecommerce
0.22 0.241 0.174
news
0.097 0.063 0.046
social
0.161 0.219 0.162


In [69]:
# Correlation of the chrF scores with the MQM scores, per zh-en domain.
pearsons = []
spearmans = []
kendalls = []

for name, domain in zip(domain_names, domain_dfs):
    print(name)

    # One chrF score per line, in the same order as the rows of the domain CSV.
    path = home_path + "/robust_MT_evaluation/data/test/mqm2022/zh-en/zh-en_" + name + "_chrf_scores.txt"
    with open(path, "r") as f:
        chrf_scores = [float(line) for line in f]

    # Work on an explicit copy: adding columns to a bare column selection
    # raises pandas' SettingWithCopyWarning.
    domain_output = domain[['src', 'mt', 'ref', 'score']].copy()
    domain_output['metric'] = chrf_scores
    domain_output['mqm'] = domain_output['score'].astype(float)

    p, s, k = compute_correlations2(domain_output)
    pearsons.append(p)
    spearmans.append(s)
    kendalls.append(k)
    print(p, s, k)

conversation
0.206 0.211 0.16
ecommerce
0.23 0.259 0.187
news
0.078 0.057 0.042
social
0.177 0.256 0.19


In [70]:
# Correlation of every model version's predictions with MQM, per zh-en domain.
pearsons, spearmans, kendalls = [], [], []

for v in versions:
    print('model: ', v)
    for name, domain in zip(domain_names, domain_dfs):
        print(name)

        path = home_path + '/robust_MT_evaluation/data/test/mqm2022/zh-en/predictions/' + name + '_output_v' + v + 'e1.json'
        domain_output = read_json(path)
        # Gold scores come from the domain CSV, predictions from the JSON file.
        domain_output['mqm'] = domain['score'].astype(float).tolist()
        domain_output['comet'] = domain_output['comet'].astype(float)

        p, s, k = compute_correlations(domain_output)
        pearsons.append(p)
        spearmans.append(s)
        kendalls.append(k)
        print(p, s, k)
    print()

model:  24
conversation
0.34 0.375 0.288
ecommerce
0.391 0.449 0.327
news
0.34 0.364 0.27
social
0.351 0.424 0.319

model:  25
conversation
0.37 0.385 0.295
ecommerce
0.438 0.467 0.342
news
0.383 0.393 0.291
social
0.358 0.418 0.313

model:  29
conversation
0.343 0.37 0.283
ecommerce
0.4 0.459 0.335
news
0.364 0.373 0.276
social
0.343 0.419 0.315

model:  83
conversation
0.358 0.389 0.298
ecommerce
0.44 0.487 0.357
news
0.359 0.394 0.292
social
0.373 0.439 0.33



In [72]:
# Weighted ensemble of normalised BLEU, chrF and COMET scores, zh-en.
pearsons, spearmans, kendalls = [], [], []
v = '24' # baseline model trained on 1720 original data

# Ensemble weights for BLEU (a), chrF (b) and COMET (c).
# best weights based on kendall - computed over mqm 2021
a = 0.02512562814070352
b = 0.04522613065326633
c = 0.9296482412060302

# Alternative weight sets, kept for reference (inactive):
# best weights based on pearson - computed over mqm 2021 (!)
#     a = 0.07035175879396985
#     b = 0.0
#     c = 0.9296482412060302
# best weights based on spearman - computed over mqm 2021
#     a = 0.01507537688442211
#     b = 0.05527638190954774
#     c = 0.9296482412060302

for name, domain in zip(domain_names, domain_dfs):
    print(name)

    path_bleu = home_path + "/robust_MT_evaluation/data/test/mqm2022/zh-en/zh-en_" + name + "_bleu_scores.txt"
    with open(path_bleu, "r") as f:
        bleu_scores = [float(line) for line in f]

    path_chrf = home_path + "/robust_MT_evaluation/data/test/mqm2022/zh-en/zh-en_" + name + "_chrf_scores.txt"
    with open(path_chrf, "r") as f:
        chrf_scores = [float(line) for line in f]

    path = home_path + '/robust_MT_evaluation/data/test/mqm2022/zh-en/predictions/' + name + '_output_v' + v + 'e1.json'
    domain_output = read_json(path)
    domain_output['mqm'] = domain['score'].astype(float).tolist()
    domain_output['comet'] = domain_output['comet'].astype(float)
    domain_output['bleu'] = bleu_scores
    domain_output['chrf'] = chrf_scores

    # Element-wise mean of the three weighted, normalised score vectors.
    weighted_terms = [
        a * apply_norm(bleu_mean, bleu_std, domain_output['bleu'].tolist()),
        b * apply_norm(chrf_mean, chrf_std, domain_output['chrf'].tolist()),
        c * apply_norm(comet_mean, comet_std, domain_output['comet'].tolist()),
    ]
    domain_output['ensemble_norm_w'] = np.mean(weighted_terms, axis=0)

    # Each result is (statistic, p-value) rounded to 3 decimals; print the statistic.
    ensemble = domain_output['ensemble_norm_w']
    gold = domain_output['mqm']
    p, s, k = (
        np.round(corr(ensemble, gold), 3)
        for corr in (stats.pearsonr, stats.spearmanr, stats.kendalltau)
    )
    print(p[0], s[0], k[0])

conversation
0.338 0.369 0.283
ecommerce
0.39 0.446 0.325
news
0.332 0.351 0.26
social
0.346 0.422 0.317
