Colab to reproduce results from the WMT23 metrics shared task

## Dependencies

In [None]:

# @title Install MTME

!git clone https://github.com/google-research/mt-metrics-eval.git && cd mt-metrics-eval && pip install .

In [None]:
# @title Imports

from mt_metrics_eval import meta_info
from mt_metrics_eval import data
from mt_metrics_eval import tasks

In [None]:
# @title Download data

# Fetch the MTME data to the local machine. The cells below presumably read
# from this local copy, so run this once per runtime before them.
data.Download()  # Copies about 2G onto local machine.

## Reproduce official results

In [None]:
# @title Generate main results

# Generate main results for primary metrics.

# Setting k=0 suppresses significance testing. Results in the paper were
# generated with k=1000, which is too slow to run sequentially in a colab.
# The call returns the tasks together with a matching set of task weights; the
# weights are passed to the averaging calls in the display cells.
main_tasks, main_task_weights = tasks.WMT23(k=0)

# Task names show attributes that define each task. Printed numbering is
# 1-based.
for i, task in enumerate(main_tasks):
  print(f'task{i + 1}: {task.name}')

# Takes about 3 minutes.
main_results = main_tasks.Run()

In [None]:
# @title Display main results

# This reproduces Tables 8 and 9 from the shared task paper, modulo significance
# results.

# AverageCorrMatrix produces significance clusters and pairwise p-values for the
# overall average correlation, but requires that the tasks be run with k > 0.
# AverageCorrs computes the same averages as AverageCorrMatrix but without
# significance.
avg_corrs = main_results.AverageCorrs(main_task_weights)
# Alternative to the line above when the tasks were run with k > 0:
# avg_corrs, matrix = main_results.AverageCorrMatrix(main_task_weights)

# Use fmt='tsv' to generate tsv format for spreadsheets. This function has
# many other options to customize output.
table = main_results.Table(
    metrics=list(avg_corrs),
    initial_column=avg_corrs,
    initial_column_header='avg-corr',
    attr_list=['lang', 'level', 'corr_fcn'],
    nicknames={'KendallWithTiesOpt': 'acc-t'},
    fmt='text',
    baselines_metainfo=meta_info.WMT23)
print(table)


In [None]:
# @title Generate full results

# Identical to main results except we include contrastive metric submissions.

# The weights returned here are discarded; the display cell for the full
# results reuses main_task_weights from the main-results cell.
main_tasks_full, _ = tasks.WMT23(k=0, primary=False)

# Takes about 5 minutes.
main_results_full = main_tasks_full.Run()

In [None]:
# @title Display full results.

# Reproduces the results of Tables 16 and 17 in the paper.

avg_corrs = main_results_full.AverageCorrs(main_task_weights)

# Table() options are gathered in one place so they are easy to scan and tweak.
# In the rendered metric names, a leading * marks a contrastive submission and
# a leading _ marks a baseline.
full_table_options = dict(
    metrics=list(avg_corrs),
    initial_column=avg_corrs,
    initial_column_header='avg-corr',
    attr_list=['lang', 'level', 'corr_fcn'],
    nicknames={'KendallWithTiesOpt': 'acc-t'},
    fmt='text',
    which_metrics='union',
    baselines_metainfo=meta_info.WMT23,
)
table = main_results_full.Table(**full_table_options)
print(table)


In [None]:
# @title Generate DA results

# Results for all metrics using DA-SQM instead of MQM as gold scores.

# DA scores are available for a wider set of languages than the ones used for
# the main evaluation. Only en-de and zh-en are common to both.
# One language pair per line with a trailing comma: a missing comma between two
# adjacent string literals silently concatenates them into one bogus entry
# (previously 'ja-en' 'zh-en' became 'ja-enzh-en').
da_lps = [
    'cs-uk',
    'de-en',
    'en-cs',
    'en-de',
    'en-ja',
    'en-zh',
    'ja-en',
    'zh-en',
]
da_tasks, da_wts = tasks.WMT23(k=0, primary=False, lps=da_lps, gold='da-sqm')

# Task names show the attributes that define each task.
for task in da_tasks:
  print(task.name)

# Takes about 15 minutes.
da_results = da_tasks.Run()

In [None]:
# @title Display DA results

# Reproduces the results of Tables 19 to 27 in the paper.

avg_corrs = da_results.AverageCorrs(da_wts)

# Sorted, comma-joined list of every DA language pair. It is registered as a
# nickname key below so that this long label is rendered as 'all'
# (presumably it is the 'lang' value of the tasks spanning all pairs).
all_da_lps = ','.join(sorted(da_lps))

da_table_options = dict(
    metrics=list(avg_corrs),
    initial_column=avg_corrs,
    initial_column_header='avg-corr',
    attr_list=['lang', 'level', 'corr_fcn'],
    nicknames={'KendallWithTiesOpt': 'acc-t', all_da_lps: 'all'},
    fmt='text',
    which_metrics='union',
    baselines_metainfo=meta_info.WMT23,
)
table = da_results.Table(**da_table_options)
print(table)


In [None]:
# @title Accuracy results, MQM vs DA

# Reproduces the results of Table 14 in the paper. The two columns should not
# be compared directly: they are computed on different sets of languages, and
# they also use different gold scores.

# Keep only the results whose correlation function is 'accuracy', first for
# the MQM-based main results and then for the DA-based results.
mqm_by_corr_fcn = main_results.SplitByAttr('corr_fcn')
acc_mqm = mqm_by_corr_fcn['accuracy']
da_by_corr_fcn = da_results.SplitByAttr('corr_fcn')
acc_da = da_by_corr_fcn['accuracy']

# Combine the two result sets so they appear side by side in one table.
acc_mqm_vs_da = acc_mqm + acc_da

accuracy_table_options = dict(
    attr_list=['lang'],
    nicknames={all_da_lps: 'all-DA-lps'},
    rerank=[True, True],
    which_metrics='intersection',
    baselines_metainfo=meta_info.WMT23,
)
table = acc_mqm_vs_da.Table(**accuracy_table_options)
print(table)

## Evaluate a new metric

This section shows a worked example of evaluating a new metric online. Another
possibility is to generate scores offline, write score files to disk, and use
EvalSet.AddMetricsFromDir() to read them in.

In [None]:
# @title Define the metric

import numpy as np

# Replace this function with your own metric.

def NewMetric(
    level: str,
    lp: str,
    domains: dict[str, list[list[int]]],
    docs: dict[str, list[int]],
    src: list[str],
    ref: list[str],
    hyps: dict[str, list[str]]
) -> dict[str, list[float]]:
  """Generate metric scores.

  The sample metric scores each hypothesis by how closely its character length
  matches the reference: -|len(ref) - len(hyp)| / (len(ref) + 1), so 0 is a
  perfect length match and more negative is worse.

  Args:
    level: Level for which to produce scores, 'sys' or 'seg'. Any value other
      than 'seg' is treated as 'sys'.
    lp: Language pair, eg 'en-de'.
    domains: Map from domain name to [[beg, end+1], ...] segment position lists.
    docs: Map from doc name to [beg, end+1] segment positions.
    src: List of source segments.
    ref: List of reference segments.
    hyps: Map from MT system name to output segments for that system. Each
      system must have the same number of segments as ref.

  Returns:
    Map from system name to scores, a list of segment-level scores if level is
    'seg', or a list containing a single score (the mean over segments) if
    level is 'sys'.
  """
  # Sample metric just computes a length match between each hypothesis and the
  # reference. It ignores lp, domains, docs, and source.
  del lp, domains, docs, src

  ref_lens = np.array([len(r) for r in ref])
  scores = {}
  for sysname, hyp in hyps.items():
    hyp_lens = np.array([len(h) for h in hyp])
    # The +1 keeps the ratio finite for empty reference segments.
    deltas = np.abs(ref_lens - hyp_lens) / (ref_lens + 1)
    if level == 'seg':
      # Return a plain list, as the signature promises, rather than an ndarray.
      scores[sysname] = (-deltas).tolist()
    else:
      scores[sysname] = [-deltas.mean()]

  return scores

In [None]:
# @title Load EvalSets

# Load one EvalSet per language pair, keyed by (test-set name, language pair).
# NOTE(review): the positional True is presumably the flag that also loads the
# stored metric scores — confirm against data.EvalSet.
wmt23_lps = ['en-de', 'he-en', 'zh-en']
evs_dict = {}
for lp in wmt23_lps:
  evs_dict[('wmt23', lp)] = data.EvalSet('wmt23', lp, True)

In [None]:
# @title Add metric scores to EvalSets

# Score every reference of every language pair with the new metric and register
# the scores on the matching EvalSet. replace=True lets this cell be re-run
# while iterating over different versions of the metric.

metric_name = 'lendiff'

for lp in wmt23_lps:
  evs = evs_dict[('wmt23', lp)]
  for refname, ref in evs.all_refs.items():
    # Everything NewMetric needs apart from the level is shared by both calls.
    metric_inputs = (
        evs.lp, evs.domains, evs.docs, evs.src, ref, evs.sys_outputs)
    # Score both levels first, then register them; the system level comes
    # before the segment level in both steps.
    scores_by_level = {
        level: NewMetric(level, *metric_inputs) for level in ('sys', 'seg')}
    for level, level_scores in scores_by_level.items():
      evs.AddMetric(metric_name, {refname}, level, level_scores, replace=True)

# Promote the new metric to the primary lists so that tasks run with
# primary=True pick it up, which avoids evaluating every contrastive
# submission as well.

for evs in evs_dict.values():
  primary_with_new_metric = evs.primary_metrics | {metric_name}
  evs.SetPrimaryMetrics(primary_with_new_metric)

In [None]:
# @title Generate results with new metric

# For a first pass we turn off significance testing.

# Restrict the tasks to the language pairs whose EvalSets are in evs_dict.
wmt23_tasks, wts = tasks.WMT23(wmt23_lps, k=0)

# Takes about 3 minutes.
# The modified EvalSets are passed explicitly, presumably so that the run sees
# the newly added metric instead of reloading the stored data.
new_results = wmt23_tasks.Run(eval_set_dict=evs_dict)

In [None]:
# @title Print results

# The table lists every primary metric plus the newly added 'lendiff' metric.

avg_corrs = new_results.AverageCorrs(wts)

new_table_options = dict(
    metrics=list(avg_corrs),
    initial_column=avg_corrs,
    initial_column_header='avg-corr',
    attr_list=['lang', 'level', 'corr_fcn'],
    nicknames={'KendallWithTiesOpt': 'acc-t'},
    fmt='text',
    baselines_metainfo=meta_info.WMT23,
)
table = new_results.Table(**new_table_options)

print(table)


In [None]:
# @title Compare with significance

# For speed reasons, limit comparison to the two metrics that bracket lendiff
# in the average-correlation ranking.
# NOTE: this changes the primary-metric set on the shared EvalSets in evs_dict
# (presumably replacing it), and the assignments below rebind wmt23_tasks, wts
# and new_results from the earlier cells.
for evs in evs_dict.values():
  evs.SetPrimaryMetrics({'Random-sysname', 'lendiff', 'eBLEU'})

# Run the significance test. Set k=1000 for a more realistic comparison. This
# takes about 2 minutes with k=50.
wmt23_tasks, wts = tasks.WMT23(wmt23_lps, k=50)
new_results = wmt23_tasks.Run(eval_set_dict=evs_dict)


In [None]:
# @title Print significance results

# Use the weights that were returned together with the tasks that produced
# new_results, rather than weights left over from the main-results section;
# this keeps the "new metric" section runnable on its own.
avg_corrs, matrix = new_results.AverageCorrMatrix(wts)

table = new_results.Table(
    metrics=list(avg_corrs),
    initial_column=avg_corrs,
    initial_column_header='avg-corr',
    attr_list=['lang', 'level', 'corr_fcn'],
    nicknames={'KendallWithTiesOpt': 'acc-t'},
    fmt='text',
    baselines_metainfo=meta_info.WMT23)

# The table indicates that lendiff and eBLEU are in the same significance
# cluster ahead of Random-sysname.
print(table)
print()

# Print the p-value matrix for the three pairwise comparisons used to assign
# significance clusters.
print(tasks.MatrixString(avg_corrs, matrix, probs=True))
