This is a demo colab for MTME. It assumes you have mt_metrics_eval installed on your runtime, and have downloaded the data onto that machine. Run the cells below in order.

# Preliminaries

In [None]:
# @title Imports

import numpy as np
import scipy.stats

from mt_metrics_eval import meta_info
from mt_metrics_eval import data
from mt_metrics_eval import stats
from mt_metrics_eval import tasks

In [None]:
# @title Print all available evalsets

# One line per test set: its name followed by the language pairs listed for it
# in meta_info.DATA.
for testset in meta_info.DATA:
  lang_pairs = ' '.join(meta_info.DATA[testset])
  print(f'{testset}: {lang_pairs}')


In [None]:
# @title Load data for WMT21 language pairs scored with MQM

# Maps '<testset>/<lp>' to the EvalSet loaded for that combination.
all_evs = {}
for testset in meta_info.DATA:
  if testset.startswith('wmt21'):
    for lp in meta_info.DATA[testset]:
      gold_names = meta_info.DATA[testset][lp].std_gold.values()
      # Only keep language pairs whose standard gold scores include 'mqm'.
      if 'mqm' not in gold_names:
        continue
      all_evs[f'{testset}/{lp}'] = data.EvalSet(testset, lp, True)

# List the keys of everything that was loaded, one per line.
print('\n'.join(all_evs))

In [None]:
# @title Print summaries for all loaded evalsets

# Column header; the widths match the per-evalset rows printed below.
print(f'{"name":<20}  segs sys metrics gold  refs std')

# One row per evalset: number of source segments, systems and metric
# basenames, the standard system-level human score name, the number of
# references, and the standard reference.
for name, evs in all_evs.items():
  row = (
      f'{name:<20} {len(evs.src):5d} {len(evs.sys_names):3d} '
      f'{len(evs.metric_basenames):7d} {evs.StdHumanScoreName("sys"):5} '
      f'{len(evs.ref_names):4d} {evs.std_ref}')
  print(row)

# Comparing metrics

In [None]:
# @title Set up for comparing metrics

# MT metrics can be evaluated in many different ways. The obvious choice to
# make is the correlation statistic that measures how similar a vector of
# metric scores is to a vector of gold scores (human ratings). The less obvious
# one is how those vectors are obtained. The statistic is picked in later
# cells; the parameters below pin down exactly which vectors get compared.

# Every evalset loaded so far takes part in the comparison.
evs_list = all_evs.values()

# For each evalset, pick the version of each metric that uses that evalset's
# standard reference.
main_refs = [{e.std_ref} for e in evs_list]

# Some alternative references are known to be close to the standard reference.
# They should not be among the scored systems when 'human' systems are
# included. The only currently known instance is refB in wmt21.news/en-de,
# which is similar to the standard refC.
close_refs = [
    {'refB'} if name == 'wmt21.news/en-de' else set() for name in all_evs]

# Score 'human' systems (ie, reference translations) along with the MT
# systems. This can make the task more challenging, since some metrics are
# biased against less literal references.
include_human = True

# Leave out systems considered to be outliers. These are systems that are much
# better or worse than all other systems, so they are easy for all metrics to
# rank correctly.
include_outliers = False

# Use MQM ratings as gold scores rather than the scores provided by the main
# WMT task. Metrics tasks have used MQM for main results since 2021.
gold_name = 'mqm'

# Restrict the comparison to metrics designated as primary submissions. This
# removes metric variants that are similar to each other, and reduces the size
# of the comparison matrix.
primary_metrics = True

# No domain restriction. In WMT21, domains are treated as separate test-sets,
# so this is a no-op (WMT22 is a different story).
domain = None

# Number of resampling runs used by the permutation test that decides whether
# one metric is better than another. 5 keeps the demo fast, but at least 1000
# is required for stable results.
k = 5

# Block size for 'early stopping' checks during resampling. With k = 1000 this
# can speed up the computation, usually with only minimal changes to the
# results.
psd = stats.PermutationSigDiffParams(block_size=100)

# p-value for deciding whether metrics are considered to be significantly
# different. Lower values make the test more stringent.
pval = 0.05

In [None]:
# @title Evaluate metrics using global accuracy

# Global accuracy, introduced by Kocmi et al (https://arxiv.org/abs/2107.10821)
# is a robust way to evaluate the performance of a metric across many different
# settings. It counts the pairwise system rankings on which the metric agrees
# with the gold ranking, micro-averaged across all settings.

# Each output line shows the rank of the metric's significance cluster, then
# its accuracy, then whether it is statistically tied with (=) or better than
# (>) each lower-ranking metric.

# All arguments come from the set-up cell above.
ranks, matrix = data.CompareMetricsWithGlobalAccuracy(
    evs_list,
    main_refs,
    close_refs,
    include_human,
    include_outliers,
    gold_name,
    primary_metrics,
    domain,
    k,
    psd,
    pval,
)

data.PrintMetricComparison(ranks, matrix, pval)

In [None]:
# @title Evaluate metrics using system-level Pearson correlation

# Pearson correlation measures the degree of linear correspondence between
# metric and gold scores. A single correlation computed across different
# evalsets isn't a great idea, so the interface forces you to choose one
# evalset; we use 'wmt21.news/en-de'. Extracting the relevant score vectors is
# a separate step so that other correlations can be computed from the same
# vectors.

# The ranking is quite different from the accuracy ranking, partly because only
# a subset of the data is used, and partly because Pearson and accuracy measure
# different things. It also includes two metrics that were automatically
# filtered out of the accuracy ranking because they weren't available for all
# evalsets.

evs = all_evs['wmt21.news/en-de']
std_refs, near_refs = {evs.std_ref}, {'refB'}

# Step 1: extract the system-level score vectors.
corrs = data.GetCorrelations(
    evs, 'sys', std_refs, near_refs, include_human, include_outliers,
    gold_name, primary_metrics, domain)

# Step 2: rank the metrics by Pearson correlation and test for significance.
corr_fcn = scipy.stats.pearsonr
ranks, matrix = data.CompareMetrics(corrs, corr_fcn, 'none', k, psd, pval)

data.PrintMetricComparison(ranks, matrix, pval, evs)

In [None]:
# @title Evaluate metrics using segment-level Kendall correlation

# Kendall correlation is similar to pairwise accuracy, except that it is
# normalized differently. The calls match the system-level Pearson cell, except
# that the 'level' parameter is 'seg' and the statistic is Kendall rather than
# Pearson. The 'average_by' parameter also matters here, as it specifies how
# system x segment score matrices get converted into vectors for comparison.
# We use 'none', which just flattens the matrices.

# The resulting ranking is similar to the ranking from accuracy. One noticeable
# difference is that the significance clusters are smaller because they are
# based on more data (much larger vectors). BLEU is absent because it isn't
# available at the segment level.

evs = all_evs['wmt21.news/en-de']
std_refs, near_refs = {evs.std_ref}, {'refB'}

# Step 1: extract the segment-level score vectors.
corrs = data.GetCorrelations(
    evs, 'seg', std_refs, near_refs, include_human, include_outliers,
    gold_name, primary_metrics, domain)

# Step 2: rank the metrics by Kendall correlation over the flattened matrices.
corr_fcn = scipy.stats.kendalltau
ranks, matrix = data.CompareMetrics(corrs, corr_fcn, 'none', k, psd, pval)

data.PrintMetricComparison(ranks, matrix, pval, evs)

In [None]:
# @title Evaluate metrics using seg-level accuracy with optimized tie threshold.

# This is an implementation of the acc*_eq pairwise ranking accuracy proposed in
# https://arxiv.org/abs/2305.14324. It is similar to global accuracy, but it
# additionally gives metrics credit for predicting ties in gold scores, which
# arise frequently in MQM segment-level data. To avoid bias due to differences
# in scoring precision for different metrics, an optimal threshold for assigning
# ties is automatically computed for each metric and test set.

# For demo purposes significance testing is disabled by passing 0 as the
# resampling count. (Significance testing works but is currently very slow.)
# The optimization procedure uses sampling, so results can change across
# different runs.

evs = all_evs['wmt21.news/en-de']
std_refs, near_refs = {evs.std_ref}, {'refB'}

corrs = data.GetCorrelations(
    evs, 'seg', std_refs, near_refs, include_human, include_outliers,
    gold_name, primary_metrics, domain)

num_resamples = 0  # Replaces the notebook-level k for this cell only.
ranks, matrix = data.CompareMetrics(
    corrs, stats.KendallWithTiesOpt, 'item', num_resamples, psd, pval,
    variant='acc23', sample_rate=0.1)

data.PrintMetricComparison(ranks, matrix, pval, evs)

In [None]:
# @title Evaluate a new metric

# New metrics can be included in the comparison of existing metrics using the
# 'extern_metrics' argument to GetCorrelations(). To demonstrate this, we'll
# create and evaluate a new metric consisting of the average of the top 3
# metrics in the system-level Pearson ranking.

# The result is a slight, non-significant, improvement over C-SPECpn, the metric
# with highest Pearson correlation. (The '*' before the new metric indicates
# that it isn't recognized as a primary submission.)

evs = all_evs['wmt21.news/en-de']

# Derive every reference-dependent name from the evalset's standard reference
# (refC for wmt21.news/en-de), so the metric names, the skipped system and the
# main reference passed to GetCorrelations() below cannot drift apart.
ref = evs.std_ref

# Create the new metric: for each system, the mean of the three metrics'
# system-level scores.
top3_metrics = [f'C-SPECpn-{ref}', 'COMET-QE-MQM_2021-src', f'bleurt-20-{ref}']
sys_scores = {}
for sys_name in evs.sys_names:
  # The standard reference itself is left out of the new metric's scores --
  # presumably because the reference-based metrics have no score for the
  # reference they use (TODO confirm).
  if sys_name == ref: continue
  scores = np.array([evs.Scores('sys', m)[sys_name] for m in top3_metrics])
  sys_scores[sys_name] = scores.mean(axis=0)

# Run the comparison with the new metric included via the 'extern_metrics'
# argument. The metric name carries the reference as a suffix, like the
# existing reference-based metric names above.
extras = {f'top3_avg-{ref}': sys_scores}
corrs = data.GetCorrelations(
    evs, 'sys', {ref}, {'refB'}, include_human, include_outliers,
    gold_name, primary_metrics, domain, extern_metrics=extras)
ranks, matrix = data.CompareMetrics(
    corrs, scipy.stats.pearsonr, 'none', k, psd, pval)

data.PrintMetricComparison(ranks, matrix, pval, evs)

In [None]:
# @title Evaluate a new metric using global accuracy

# This requires a bit more work, since we have to produce results for multiple
# evalsets. As before, the result is a slight gain over the best single metric
# (note that the averaged metrics aren't quite the top 3 for the global accuracy
# task).

# Create the new metric, one instance per input evalset. '<REF>' is a
# placeholder that is replaced by each evalset's standard reference.
top3_metrics = ['C-SPECpn-<REF>', 'COMET-QE-MQM_2021-src', 'bleurt-20-<REF>']
extras_list = []
for evs in evs_list:
  top3 = [m.replace('<REF>', evs.std_ref) for m in top3_metrics]
  sys_scores = {}
  for sys_name in evs.sys_names:
    # The standard reference is not scored by the new metric.
    if sys_name == evs.std_ref: continue
    scores = np.array([evs.Scores('sys', m)[sys_name] for m in top3])
    sys_scores[sys_name] = scores.mean(axis=0)
  extras_list.append({f'top3_avg-{evs.std_ref}': sys_scores})

# Run the comparison with the new metric included via the 'extern_metrics_list'
# argument. extras_list is built in the same order as evs_list.
ranks, matrix = data.CompareMetricsWithGlobalAccuracy(
    evs_list, main_refs, close_refs, include_human, include_outliers,
    gold_name, primary_metrics, domain, k, psd, pval,
    extern_metrics_list=extras_list)

# Pass the same pval that was used for the comparison, as every other
# PrintMetricComparison() call in this notebook does; otherwise a changed pval
# would be used for the test but not for the printout.
data.PrintMetricComparison(ranks, matrix, pval)

# Ranking metrics using the task interface

This is a higher-level interface designed to make it more convenient to compare
a set of metrics using a variety of criteria called 'tasks'. The following
code uses this interface to roughly duplicate the comparisons in the previous
section.

In [None]:
# @title Define a set of tasks

# Each TaskSet is created from a dict of attribute/value-list combinations,
# along with fixed assignments to other attributes. The TaskSets are then
# concatenated into a single TaskSet.

# NOTE: this rebinds the notebook-level `k` that the metric-comparison set-up
# cell sets to 5; re-running the earlier comparison cells afterwards will pick
# up this value.
k = 1  # Use only a single random draw for demo.

# lang0 has a single 'lang' value that joins the three language pairs with
# commas; langs lists them as three separate values.
lang0 = {'test_set': ['wmt21.news'], 'lang': ['en-de,en-ru,zh-en']}
langs = {'test_set': ['wmt21.news'], 'lang': ['en-de', 'en-ru', 'zh-en']}

# Accuracy task over the combined language pairs. close_refs has three entries
# -- presumably one per language pair, in the order given in lang0.
taskset = tasks.TaskSet(
    lang0, corr_fcn='accuracy', close_refs=[{'refB'}, set(), set()], k=k)

# Pearson tasks for each language pair, at system level and then at segment
# level.
for level in ('sys', 'seg'):
  taskset += tasks.TaskSet(langs, level=level, corr_fcn='pearson', k=k)

# Segment-level KendallWithTiesOpt tasks for each language pair.
taskset += tasks.TaskSet(
    langs, level='seg', avg_by='item', corr_fcn='KendallWithTiesOpt',
    perm_test='pairs', corr_fcn_args={'sample_rate': 0.01}, k=k)

# A TaskSet is just a list of Tasks, so we can make arbitrary changes to
# attribute values. In this case, set the correct close_refs for en-de tasks.
for task in taskset:
  if task.lang == 'en-de':
    task.close_refs = {'refB'}

# Print task 'names' (attribute/value strings in canonical order).
for task in taskset:
  print(task.name)

In [None]:
# @title Run the tasks

# This first loads the necessary data, then runs each task in sequence to
# produce a TaskSetResults object. Subsequent runs re-use the loaded data.
# The returned object is kept in `results`; the following cells iterate over
# it, assign weights to it, and split it by attribute.

results = taskset.Run()  # Takes about 5 minutes.

In [None]:
# @title Print raw task results

# For every task result, print its name on one line followed by its string
# rendering.
for result in results:
  print(result.name, result.Str(), sep='\n')

In [None]:
# @title Average ranks for metrics

# To combine the performance of metrics across tasks, we average their task
# ranks. The tasks are weighted to ensure that the total mass for important
# attributes is evenly distributed among the different values those attributes
# take on.
weights = results.AssignWeights(tasks.Attributes())
global_ranks = results.AverageRanks(weights)

# It is also interesting to compare the metric performance on different subsets
# of tasks, for instance split by language. Each subset gets its own weights;
# they are computed inline so the global `weights` above is not overwritten.
ranks_by_lp = {
    lang: subset.AverageRanks(subset.AssignWeights(tasks.Attributes()))
    for lang, subset in results.SplitByAttr('lang').items()}

# Print out the comparison, with global ranks first, followed by a breakdown
# by language pair. We only show metrics that are in the intersection of all
# tasks.
# The column headers get their own name rather than reusing `langs`, which
# holds the attribute dict used to define the tasks. The combined
# 'en-de,en-ru,zh-en' value is shortened to ' all ' so it fits the 5-character
# column width.
lang_headers = [
    ' all ' if lp == 'en-de,en-ru,zh-en' else lp for lp in ranks_by_lp]
print(' ' * 24, 'global', ' '.join(lang_headers))
for metric, rank in global_ranks.items():
  per_lang_ranks = [lp_ranks[metric] for lp_ranks in ranks_by_lp.values()]
  ranks_for_metric = [rank, *per_lang_ranks]
  print(f'{metric:<25}', ' '.join(f'{r:5.2f}' for r in ranks_for_metric))
