In [21]:
import numpy as np
from itertools import permutations
import pandas as pd
from tqdm import tqdm

# Aggregating the results of different metrics

Some translation algorithms are multilingual by design. We focus on a specific case in which we want to evaluate the quality of translations from one language to another, where these languages are not known in advance. Knowing that the quality of metrics for NLG algorithms (in the sense of their correlation with human evaluation) depends on the languages being translated, we would like to find a ranking of metrics that is robust to the different languages considered.
We reverse the classical point of view of using fixed metrics to evaluate algorithms on different tasks. We can then use the Kemeny consensus to evaluate the performance of metrics on fixed tasks, with fixed datasets.

## Kemeny consensus

We use fixed, annotated datasets and apply different metrics to them. We can then rank the metrics on each of these datasets (same data for all metrics) according to the correlation of these metrics with human evaluation. Then we need to implement a Kemeny consensus ranking to order the metrics according to their performance across the different datasets.

In [2]:
def is_negative(n):
    """Return 1 when ``n`` is strictly negative, and 0 otherwise."""
    return 1 if n < 0 else 0


# Construction of a Kendall distance function
def kendall_distance(l1, l2):
    """Normalized Kendall tau distance between two rank vectors.

    A pair of positions ``(i, j)`` is discordant when ``l1`` and ``l2`` order
    the two items in opposite ways, i.e. when
    ``(l1[i] - l1[j]) * (l2[i] - l2[j])`` is negative.  The distance is the
    number of discordant pairs divided by the total number of pairs, so it
    lies in ``[0, 1]``.

    Parameters
    ----------
    l1, l2 : sequence of numbers
        Rank vectors of the same length (lists, tuples or 1-D arrays).

    Returns
    -------
    float
        ``0.0`` for identical orderings, ``1.0`` for fully reversed ones.
        Vectors with fewer than two entries have no pair to compare and
        yield ``0.0``.

    Raises
    ------
    ValueError
        If the two vectors do not have the same length.
    """
    size = len(l1)
    if size != len(l2):
        raise ValueError("Permutations does not have the same size")
    if size < 2:
        # No pair exists; returning 0.0 avoids a division by zero below.
        return 0.0

    discordant = 0
    # Each unordered pair is visited once (i < j); the normalization below is
    # the matching number of unordered pairs, n * (n - 1) / 2.
    for i in range(size - 1):
        for j in range(i + 1, size):
            if (l1[i] - l1[j]) * (l2[i] - l2[j]) < 0:
                discordant += 1

    n_pairs = size * (size - 1) // 2
    return discordant / n_pairs

In [3]:
# Sanity checks against [0, 1, 2]: identical, fully reversed, one adjacent swap.
for other in ([0, 1, 2], [2, 1, 0], [1, 0, 2]):
    print(kendall_distance([0, 1, 2], other))

0.0
1.0
0.3333333333333333


In [26]:
# Given a matrix of rankings of the metrics on several datasets (rankings based
# on their correlations with human evaluation), score a candidate ranking.
def kemeny_consensus_step1(matrix, perm):
    """Sum the Kendall distances between ``perm`` and every row of ``matrix``.

    ``matrix[i, j]`` is the rank of metric ``j`` on dataset ``i`` and ``perm``
    is a candidate consensus ranking of the same length as a row.
    """
    n_rankings = matrix.shape[0]
    return sum(kendall_distance(perm, matrix[row]) for row in range(n_rankings))


def kemeny_consensus(matrix):
    """Brute-force Kemeny consensus of the rankings stored in ``matrix``.

    Every permutation of the first row is scored with
    ``kemeny_consensus_step1`` and the candidates reaching the smallest total
    distance are kept.

    Parameters
    ----------
    matrix : numpy.ndarray
        ``matrix[i, j]`` is the rank of metric ``j`` on dataset ``i``.

    Returns
    -------
    tuple (list of numpy.ndarray, float)
        All rankings tied for the minimum, and the corresponding distance.

    Notes
    -----
    The search enumerates ``n!`` candidates for rows of length ``n``, so it is
    only practical for a small number of metrics.
    """
    best_permutations = []
    # The first candidate yielded by `permutations` is matrix[0] itself, so the
    # list of best candidates is populated on the very first iteration.
    dist = kemeny_consensus_step1(matrix, matrix[0])
    candidates = list(permutations(matrix[0]))
    for permutation in tqdm(candidates, "Kemeny consensus"):
        candidate_dist = kemeny_consensus_step1(matrix, permutation)
        # The totals are sums of fractions, so two equally good candidates can
        # differ by rounding noise; compare with a tolerance rather than `==`.
        if np.isclose(candidate_dist, dist, rtol=1e-9, atol=1e-12):
            best_permutations.append(np.array(permutation))
        elif candidate_dist < dist:
            best_permutations = [np.array(permutation)]
            dist = candidate_dist
    return best_permutations, dist

In [27]:
# Toy example: each row is one ranking of four items (one row per dataset).
matrix = np.array(
    [
        [3, 2, 1, 0],
        [1, 2, 3, 0],
        [1, 3, 2, 0],
        [0, 1, 3, 2],
    ]
)
perm1 = [0, 1, 2, 3]
perm2 = [3, 2, 1, 0]
# Total Kendall distance of each candidate ranking to the rows of `matrix`.
for candidate in (perm1, perm2):
    print(kemeny_consensus_step1(matrix, candidate))

2.333333333333333
1.6666666666666665


In [28]:
# Exhaustive search over every permutation of the first row of the toy matrix;
# returns the tied best rankings together with their total distance.
kemeny_consensus(matrix)

Kemeny consensus: 100%|██████████| 24/24 [00:00<00:00, 40281.43it/s]


([array([1, 3, 2, 0]), array([1, 2, 3, 0])], 1.0)

## Improvement of Kemeny consensus

So far, all the rankings (one per row of the matrix) have exactly the same weight. This might be a problem if we consider that some of them are more relevant than others, or if some of them are usually very close to each other. So we can improve the Kemeny consensus by giving a higher weight to the rankings that are more relevant.

In [29]:
def weighted_kemeny_consensus_step1(matrix, perm, weights):
    """Weighted sum of the Kendall distances between ``perm`` and each row.

    ``matrix[i, j]`` is the rank of item ``j`` in ranking (row) ``i`` and
    ``weights[i]`` is the weight given to that row.
    """
    n_rankings = matrix.shape[0]
    return sum(
        kendall_distance(perm, matrix[row]) * weights[row]
        for row in range(n_rankings)
    )


def weighted_kemeny_consensus(matrix, weights):
    """Brute-force weighted Kemeny consensus of the rankings in ``matrix``.

    Every permutation of the first row is scored with
    ``weighted_kemeny_consensus_step1`` and the candidates reaching the
    smallest weighted total distance are kept.

    Parameters
    ----------
    matrix : numpy.ndarray
        ``matrix[i, j]`` is the rank of item ``j`` in ranking (row) ``i``.
    weights : sequence of float
        ``weights[i]`` is the weight given to row ``i`` of ``matrix``.

    Returns
    -------
    tuple (list of numpy.ndarray, float)
        All rankings tied for the minimum, and the corresponding distance.

    Notes
    -----
    The search enumerates ``n!`` candidates for rows of length ``n``, so it is
    only practical for a small number of items.
    """
    best_permutations = []
    # The first candidate yielded by `permutations` is matrix[0] itself, so the
    # list of best candidates is populated on the very first iteration.
    dist = weighted_kemeny_consensus_step1(matrix, matrix[0], weights)
    candidates = list(permutations(matrix[0]))
    for permutation in tqdm(candidates, "Weighted kemeny consensus"):
        candidate_dist = weighted_kemeny_consensus_step1(matrix, permutation, weights)
        # Weighted sums of fractions carry rounding noise, so equally good
        # candidates are detected with a tolerance rather than exact `==`.
        if np.isclose(candidate_dist, dist, rtol=1e-9, atol=1e-12):
            best_permutations.append(np.array(permutation))
        elif candidate_dist < dist:
            best_permutations = [np.array(permutation)]
            dist = candidate_dist
    return best_permutations, dist

In [30]:
# Give the first row of the toy matrix most of the weight (0.7) and split the
# remainder equally between the three other rows.
weights = [0.7, 0.1, 0.1, 0.1]
weighted_kemeny_consensus(matrix, weights)

Weighted kemeny consensus: 100%|██████████| 24/24 [00:00<00:00, 39138.14it/s]


([array([3, 2, 1, 0])], 0.16666666666666669)

In [31]:
# From the correlations we can deduce a ranking of the metrics
def using_indexed_assignment(x):
    """Rank the entries of ``x`` in descending order of value.

    The largest entry gets rank 0, the next largest rank 1, and so on.  The
    result is an integer NumPy array with one rank per entry of ``x``.
    """
    size = len(x)
    # Sorting the negated values in ascending order visits x from largest to
    # smallest.
    descending_order = (-x).argsort()
    ranks = np.empty(size, dtype=int)
    # The entry found at sorted position k receives rank k.
    ranks[descending_order] = np.arange(size)
    return ranks


def get_ranking(df):
    """Replace every column of ``df`` by the descending ranks of its values.

    Each column is ranked independently with ``using_indexed_assignment``
    (rank 0 for the largest value).  The input frame is left untouched; a new
    DataFrame with the same index and columns is returned.
    """
    # Work on a copy: assigning into `df` itself would silently overwrite the
    # caller's data with ranks.
    ranking = df.copy()
    for col in df.columns:
        ranking[col] = using_indexed_assignment(df[col])
    return ranking

# Testing our methods on real data

In [32]:
# Rows removed from every correlation table before ranking.
DROPPED_ROWS = ["depth", "rouge2", "human_scores"]


def load_abs_correlations(method, lang_pair):
    """Load one 2016 correlation table and return its absolute values.

    Reads ``data/corr_<method>_2016_<lang_pair>.csv``, uses the
    ``"Unnamed: 0"`` column as index, takes absolute values and drops the rows
    listed in ``DROPPED_ROWS``.

    Parameters
    ----------
    method : str
        Correlation type used in the file name (``"kendall"``, ``"pearson"``
        or ``"spearman"``).
    lang_pair : str
        Language pair used in the file name, e.g. ``"cs-en"``.
    """
    return (
        pd.read_csv(f"data/corr_{method}_2016_{lang_pair}.csv")
        .set_index("Unnamed: 0")
        .abs()
        .drop(DROPPED_ROWS, axis=0)
    )


df_kendall_cs_en = load_abs_correlations("kendall", "cs-en")
df_pearson_cs_en = load_abs_correlations("pearson", "cs-en")
df_spearman_cs_en = load_abs_correlations("spearman", "cs-en")

df_kendall_de_en = load_abs_correlations("kendall", "de-en")
df_pearson_de_en = load_abs_correlations("pearson", "de-en")
df_spearman_de_en = load_abs_correlations("spearman", "de-en")

df_kendall_fi_en = load_abs_correlations("kendall", "fi-en")
df_pearson_fi_en = load_abs_correlations("pearson", "fi-en")
df_spearman_fi_en = load_abs_correlations("spearman", "fi-en")

df_kendall_ro_en = load_abs_correlations("kendall", "ro-en")
df_pearson_ro_en = load_abs_correlations("pearson", "ro-en")
df_spearman_ro_en = load_abs_correlations("spearman", "ro-en")

df_kendall_ru_en = load_abs_correlations("kendall", "ru-en")
df_pearson_ru_en = load_abs_correlations("pearson", "ru-en")
df_spearman_ru_en = load_abs_correlations("spearman", "ru-en")

In [33]:
def ranking_row(corr_df, label):
    """Rank the metrics by their correlation with the human scores.

    Takes the ``"human_scores"`` column of ``corr_df``, renames it to
    ``label``, ranks its values with ``get_ranking`` and transposes the result
    so that it becomes a single row named ``label`` with one column per
    remaining index entry of ``corr_df``.
    """
    human_column = corr_df.loc[:, ["human_scores"]].rename(
        columns={"human_scores": label}
    )
    return get_ranking(human_column).transpose()


ranking_kendall_cs = ranking_row(df_kendall_cs_en, "kendall_cs")
ranking_pearson_cs = ranking_row(df_pearson_cs_en, "pearson_cs")
ranking_spearman_cs = ranking_row(df_spearman_cs_en, "spearman_cs")

ranking_kendall_de = ranking_row(df_kendall_de_en, "kendall_de")
ranking_pearson_de = ranking_row(df_pearson_de_en, "pearson_de")
ranking_spearman_de = ranking_row(df_spearman_de_en, "spearman_de")

ranking_kendall_fi = ranking_row(df_kendall_fi_en, "kendall_fi")
ranking_pearson_fi = ranking_row(df_pearson_fi_en, "pearson_fi")
ranking_spearman_fi = ranking_row(df_spearman_fi_en, "spearman_fi")

ranking_kendall_ro = ranking_row(df_kendall_ro_en, "kendall_ro")
ranking_pearson_ro = ranking_row(df_pearson_ro_en, "pearson_ro")
ranking_spearman_ro = ranking_row(df_spearman_ro_en, "spearman_ro")

ranking_kendall_ru = ranking_row(df_kendall_ru_en, "kendall_ru")
ranking_pearson_ru = ranking_row(df_pearson_ru_en, "pearson_ru")
ranking_spearman_ru = ranking_row(df_spearman_ru_en, "spearman_ru")

In [34]:
# Ranking row built from the Kendall correlations of the ro-en table.
ranking_kendall_ro

Unnamed: 0,bary,bertscore,bleu,chrf,meteor,rouge1,rougeL,sacrebleu,ter
Kendall_ro,1,0,8,2,3,5,4,6,7


In [35]:
# Ranking row built from the Pearson correlations of the ro-en table.
ranking_pearson_ro

Unnamed: 0,bary,bertscore,bleu,chrf,meteor,rouge1,rougeL,sacrebleu,ter
Pearson_ro,1,0,8,2,3,5,4,6,7


In [36]:
# Ranking row built from the Spearman correlations of the ro-en table.
ranking_spearman_ro

Unnamed: 0,bary,bertscore,bleu,chrf,meteor,rouge1,rougeL,sacrebleu,ter
Spearman_ro,1,0,8,2,3,5,4,6,7


In [37]:
# One ranking row per language pair (cs, de, fi, ro, ru), grouped by the type
# of correlation the ranking was derived from.
kendall_rows = [
    ranking_kendall_cs,
    ranking_kendall_de,
    ranking_kendall_fi,
    ranking_kendall_ro,
    ranking_kendall_ru,
]
pearson_rows = [
    ranking_pearson_cs,
    ranking_pearson_de,
    ranking_pearson_fi,
    ranking_pearson_ro,
    ranking_pearson_ru,
]
spearman_rows = [
    ranking_spearman_cs,
    ranking_spearman_de,
    ranking_spearman_fi,
    ranking_spearman_ro,
    ranking_spearman_ru,
]

# Stack the rows vertically: one row per language pair, one column per metric.
kendall_scores = pd.concat(kendall_rows, axis=0)
pearson_scores = pd.concat(pearson_rows, axis=0)
spearman_scores = pd.concat(spearman_rows, axis=0)

# Results

In [38]:
# Kemeny consensus of the rankings derived from the Spearman correlations.
# Slow: the search is exhaustive over all permutations of the metrics.
kemeny_consensus(spearman_scores.to_numpy())

Kemeny consensus: 100%|██████████| 362880/362880 [00:44<00:00, 8142.92it/s]


([array([1, 0, 8, 4, 3, 5, 2, 7, 6])], 0.2222222222222222)

In [39]:
# Kemeny consensus of the rankings derived from the Pearson correlations.
# Slow: the search is exhaustive over all permutations of the metrics.
kemeny_consensus(pearson_scores.to_numpy())

Kemeny consensus: 100%|██████████| 362880/362880 [00:44<00:00, 8172.65it/s]


([array([1, 0, 8, 3, 4, 5, 2, 6, 7])], 0.2222222222222222)

In [40]:
# Kemeny consensus of the rankings derived from the Kendall correlations.
# Slow: the search is exhaustive over all permutations of the metrics.
kemeny_consensus(kendall_scores.to_numpy())

Kemeny consensus: 100%|██████████| 362880/362880 [00:44<00:00, 8137.98it/s]


([array([1, 0, 8, 4, 3, 5, 2, 7, 6])], 0.3055555555555556)