# Demo colab for mt_metrics_eval

In [None]:
# Imports

import numpy as np
import scipy.stats
from mt_metrics_eval import meta_info
from mt_metrics_eval import data
from mt_metrics_eval import stats

In [None]:
# @title Print all available evalsets, load selected wmt21 sets

# Only these test sets are loaded; every test set is still listed.
wanted_testsets = ('wmt21.news', 'wmt21.tedtalks')

all_evs = {}  # '<testset>/<lp>' -> EvalSet
for testset in meta_info.DATA:
  lps = meta_info.DATA[testset]
  print(f'{testset}:', ' '.join(lps))
  if testset not in wanted_testsets:
    continue
  for lp in lps:
    # NOTE(review): the third positional argument presumably asks EvalSet to
    # also load stored metric scores -- confirm against data.EvalSet.
    all_evs[f'{testset}/{lp}'] = data.EvalSet(testset, lp, True)

In [None]:
# @title Print info for all loaded evalsets

# Header row, followed by one summary row per loaded evalset.
print(f'{"name":<20}  segs sys metrics gold  refs std')
for evs_name, evalset in all_evs.items():
  # Sizes of the source segments, system names and metric basenames.
  counts = (
      len(evalset.src),
      len(evalset.sys_names),
      len(evalset.metric_basenames),
  )
  # Name returned by StdHumanScoreName for the 'sys' level.
  gold_name = evalset.StdHumanScoreName('sys')
  row = f'{evs_name:<20} {counts[0]:5d} {counts[1]:3d} {counts[2]:7d} '
  row += f'{gold_name:5} {len(evalset.ref_names):4d} {evalset.std_ref}'
  print(row)

In [None]:
# @title Correlations and significance matrix for wmt21.news en-de
# System-level Pearson MQM correlations and significance matrix, with human
# translations included in scoring, using primary metric submissions only.
# Takes about 20s due to bootstrapping for significance tests.

# Get map from metric-name -> 'Correlation' objects containing sufficient stats.
evs = all_evs['wmt21.news/en-de']
level = 'sys'
corrs = data.GetCorrelations(
    evs=evs,
    level=level,
    main_refs={evs.std_ref},
    close_refs={'refB'},
    include_human=True,
    include_outliers=False,
    gold_name=evs.StdHumanScoreName(level),
    primary_metrics=True)

# Compute Pearson correlations and pairwise significance matrix.
corr_map, sig_matrix = data.CompareMetrics(corrs, scipy.stats.pearsonr)

print('System-level +HT Pearson correlations and ranks for wmt21.news en-de:')
for m, (corr_val, rank) in corr_map.items():
  print(f'{m:<21} {corr_val: 0.3f} {rank}')
print()
print('Significant differences in Pearson correlation:')
# Row labels for the matrix, taken from corr_map in its iteration order.
# (Previously `ranked_metrics` was used without being defined -> NameError.)
# NOTE(review): assumes sig_matrix rows/columns follow the same metric order
# as corr_map -- confirm against data.CompareMetrics.
ranked_metrics = list(corr_map)
n = len(ranked_metrics)
for i, metric in enumerate(ranked_metrics):
  # Upper triangle only: '>' where the p-value for (row i, column j) is below
  # 0.05, '=' otherwise; '.' fills the diagonal and lower triangle.
  better = ['>' if sig_matrix[i, j] < 0.05 else '=' for j in range(i + 1, n)]
  better = ['.'] * (n - len(better)) + better
  print(f'{metric:<22} {" ".join(better)}')


In [None]:
# @title Rank-wise comparison of metrics across tasks

# Selecting only domains & language pairs for which we have MQM scores.
# Note: Using a low k value so tests finish faster. Set k higher for more stable
# conclusions.

tasks = []  # list of task names
aggregate_ranks = {}  # metric -> list of ranks across tasks

for domain in 'wmt21.news', 'wmt21.tedtalks':
  for lp in 'en-de', 'en-ru', 'zh-en':
    evs_name = f'{domain}/{lp}'
    evs = all_evs[evs_name]
    close_refs = {'refB'} if evs_name == 'wmt21.news/en-de' else set()
    for level in 'sys', 'seg':
      for avg in 'none', 'sys':
        if level == 'sys' and avg == 'sys': continue
        for human in True, False:
          # Human translations are never included for the tedtalks domain.
          if human and domain == 'wmt21.tedtalks': continue
          # None of these arguments depends on the correlation function, so
          # the stats are built once and shared by both correlation functions.
          # NOTE(review): relies on CompareMetrics not modifying `corrs`.
          corrs = data.GetCorrelations(
              evs=evs,
              level=level,
              main_refs={evs.std_ref},
              close_refs=close_refs,
              include_human=human,
              include_outliers=False,
              gold_name='mqm',
              primary_metrics=True)
          for corr_fcn in scipy.stats.pearsonr, scipy.stats.kendalltau:
            task_name = f'{evs_name}.{level}.avg_by_{avg}'
            task_name += f'.{"HT" if human else "noHT"}'
            task_name += f'.{corr_fcn.__name__}'
            tasks.append(task_name)

            corrs_and_ranks, _ = data.CompareMetrics(
                corrs, corr_fcn, average_by=avg, k=5, pval=0.05)
            for m, (_corr, rank) in corrs_and_ranks.items():
              name, metric_refs = evs.ParseMetricName(m)
              if not metric_refs:
                name = m  # Distinguish reference-free versions of metrics.
              ranks = aggregate_ranks.setdefault(name, [])
              # Pad with None for earlier tasks in which this metric did not
              # appear, so that list positions line up with indices in `tasks`.
              ranks += [None] * (len(tasks) - 1 - len(ranks))
              ranks.append(rank)
            print('.', end='')  # One progress dot per completed task.
print()

# Right-pad every metric's rank list with None so that each list has exactly
# one entry per task.
for ranks in aggregate_ranks.values():
  ranks.extend([None] * (len(tasks) - len(ranks)))

def avg_no_nones(scores):
  """Returns the mean of the entries in `scores` that are not None.

  Raises ZeroDivisionError if `scores` has no non-None entries.
  """
  total = 0
  count = 0
  for score in scores:
    if score is None:
      continue
    total += score
    count += 1
  return total / count

# Reorder metrics by ascending average rank over the tasks they appear in.
metrics_by_avg_rank = sorted(
    aggregate_ranks, key=lambda metric: avg_no_nones(aggregate_ranks[metric]))
aggregate_ranks = {
    metric: aggregate_ranks[metric] for metric in metrics_by_avg_rank}

# Legend: the index of each task, i.e. its position in the rank lists below.
print('task key:')
for task_index, task in enumerate(tasks):
  print(f'{task_index:<2} {task}')
print()

# One row per metric: name, average rank, number of tasks with a rank, then
# the rank for each task (blank where the metric has no rank for that task).
for metric, ranks in aggregate_ranks.items():
  present = [r for r in ranks if r is not None]
  mean_rank = avg_no_nones(ranks)
  cells = []
  for r in ranks:
    cells.append(f'{r:2d}' if r is not None else '  ')
  print(f'{metric:<22} {mean_rank:6.3f} ({len(present)} tasks) '
        f'{" ".join(cells)}')
