In [1]:
import numpy as np
from scipy .stats import kendalltau
from scipy .stats import rankdata
from itertools import permutations
import pandas as pd

# Aggregating the results of different metrics

Some translation algorithms are multilingual by design. We focus on a specific case in which we want to evaluate the quality of translations from one language to another, where these languages are not known in advance. Knowing that the quality of metrics for NLG algorithms (in the sense of their correlation with human evaluation) depends on the languages being translated, we would like to find a ranking of metrics that is robust to the different languages considered. 
We reverse the classical point of view of using fixed metrics to evaluate algorithms on different tasks. We can then use the Kemeny consensus to evaluate the performance of metrics on fixed tasks, with fixed datasets.

## Kemeny consensus

We use fixed, annotated datasets and apply different metrics to them. We can then rank the metrics on each of these datasets (same data for all metrics) according to the correlation of each metric with human evaluation. Finally, we need to implement a Kemeny ranking consensus to order the metrics according to their performance across the different datasets.

In [2]:
def is_negative(n) :
    """Return 1 when ``n`` is strictly negative and 0 otherwise."""
    # An integer indicator (rather than a bool) so that callers can add the
    # results up directly.
    return 1 if n < 0 else 0

#Construction of a Kendall distance function
def Kendall_distance(l1, l2):
    """Normalized Kendall tau distance between two rankings.

    A pair of positions (i, j) is discordant when ``l1`` and ``l2`` order the
    two items in opposite ways, i.e. when
    ``(l1[i] - l1[j]) * (l2[i] - l2[j])`` is strictly negative. Pairs that are
    tied in either ranking give a product of zero and are not counted.

    Parameters
    ----------
    l1, l2 : indexable sequences of numbers with the same length.

    Returns
    -------
    float
        Fraction of discordant pairs, in [0, 1]: 0.0 for identical orderings
        and 1.0 for exactly reversed ones. Rankings with fewer than two items
        contain no pair to disagree on, so the distance is 0.0.

    Raises
    ------
    ValueError
        If the two rankings do not have the same length.
    """
    n = len(l1)
    if n != len(l2):
        raise ValueError("Les permutations n'ont pas la même longueur")
    if n < 2:
        # No pair exists; avoid dividing by zero in the normalization below.
        return 0.0
    discordant = 0
    # Visit each unordered pair once (i < j); the product is symmetric in
    # (i, j), so scanning both orders would only double the count.
    for i in range(n - 1):
        for j in range(i + 1, n):
            if (l1[i] - l1[j]) * (l2[i] - l2[j]) < 0:
                discordant += 1
    # Divide by the number of unordered pairs so the distance lies in [0, 1].
    return discordant / (n * (n - 1) // 2)

In [3]:
# Sanity checks against the identity ranking: an identical ranking, the fully
# reversed ranking, and a ranking with a single swapped pair.
for other_ranking in ([0, 1, 2], [2, 1, 0], [1, 0, 2]):
    print(Kendall_distance([0, 1, 2], other_ranking))

0.0
1.0
0.3333333333333333


In [4]:
#When having a matrix M of different rankings of metrics on datasets based on their correlations with human evaluation
def Kemeny_consensus_step1(M, perm):
    """Total Kendall distance between ``perm`` and every row of ``M``.

    ``M`` is an array in which ``M[i, j]`` is the ranking of metric ``j`` on
    dataset ``i``; ``perm`` is the candidate ranking compared to each row.
    The per-row distances are added up in row order.
    """
    return sum(
        Kendall_distance(perm, M[row_idx]) for row_idx in range(M.shape[0])
    )

def Kemeny_consensus(M) : 
    """Brute-force Kemeny consensus of the rankings stored in the rows of ``M``.

    Every permutation of the first row ``M[0]`` is scored with
    ``Kemeny_consensus_step1`` (sum of Kendall distances to all rows) and the
    permutations reaching the smallest total are returned.

    Parameters
    ----------
    M : 2-D array with at least one row; ``M[i, j]`` is the ranking of metric
        ``j`` on dataset ``i``.

    Returns
    -------
    (list of numpy arrays, float)
        All permutations achieving the minimal total distance, and that
        minimal total distance.

    Notes
    -----
    The search enumerates n! candidates for rows of length n, so it is only
    practical for short rankings.
    """
    # Totals are sums of floating-point distances, so two permutations whose
    # exact totals are equal can differ by a rounding error. Comparing with a
    # small tolerance keeps such ties instead of silently dropping them.
    tolerance = 1e-12
    best_permutations = []
    dist = Kemeny_consensus_step1(M, M[0])
    # Iterate lazily: there is no need to hold all n! tuples in memory.
    for permutation in permutations(M[0]):
        dist_travail = Kemeny_consensus_step1(M, permutation)
        if dist_travail < dist - tolerance:
            # Strictly better candidate: restart the list of optima.
            best_permutations = [np.array(permutation)]
            dist = dist_travail
        elif abs(dist_travail - dist) <= tolerance:
            # Tie with the current optimum (up to rounding).
            best_permutations.append(np.array(permutation))
            dist = min(dist, dist_travail)
    return best_permutations, dist

In [5]:
# Toy example: four rankings (rows) of four items (columns).
M = np.array([
    [3, 2, 1, 0],
    [1, 2, 3, 0],
    [1, 3, 2, 0],
    [0, 1, 3, 2],
])
perm1 = [0, 1, 2, 3]
perm2 = [3, 2, 1, 0]
# Total distance of each candidate ranking to the rows of M, one per line.
print(
    Kemeny_consensus_step1(M, perm1),
    Kemeny_consensus_step1(M, perm2),
    sep="\n",
)


2.333333333333333
1.6666666666666665


In [6]:
# Consensus ranking(s) of the toy matrix M together with the minimal total distance.
Kemeny_consensus(M)

([array([1, 3, 2, 0]), array([1, 2, 3, 0])], 1.0)

## Improvement of Kemeny consensus

In the Kemeny consensus above, every ranking being aggregated (every row of `M`) has exactly the same weight. This might be a problem if we consider that some of these rankings are more relevant than others, or if some of them are usually very close to each other. We can therefore improve the Kemeny consensus by giving a higher weight to the rankings that are more relevant.

In [7]:
def Weighted_Gameny_consensus_step1(M, perm, Weights):
    """Weighted total Kendall distance between ``perm`` and every row of ``M``.

    ``M`` is an array in which ``M[i, j]`` is the ranking of algorithm ``j``
    at test ``i``, and ``Weights[i]`` is the weight associated to test ``i``.
    Each row's distance to ``perm`` is multiplied by its weight and the
    products are added up in row order.
    """
    return sum(
        Kendall_distance(perm, M[row_idx]) * Weights[row_idx]
        for row_idx in range(M.shape[0])
    )

def Weighted_Kemeny_consensus(M, Weights) : 
    """Brute-force weighted Kemeny consensus of the rankings in the rows of ``M``.

    Every permutation of the first row ``M[0]`` is scored with
    ``Weighted_Gameny_consensus_step1`` (weighted sum of Kendall distances to
    all rows) and the permutations reaching the smallest total are returned.

    Parameters
    ----------
    M : 2-D array with at least one row; ``M[i, j]`` is the ranking of
        algorithm ``j`` at test ``i``.
    Weights : sequence where ``Weights[i]`` is the weight of row ``i`` of ``M``.

    Returns
    -------
    (list of numpy arrays, float)
        All permutations achieving the minimal weighted total distance, and
        that minimal total.

    Notes
    -----
    The search enumerates n! candidates for rows of length n, so it is only
    practical for short rankings.
    """
    # Totals are sums of products of floats, so two permutations whose exact
    # totals are equal can differ by a rounding error. Comparing with a small
    # tolerance keeps such ties instead of silently dropping them.
    tolerance = 1e-12
    best_permutations = []
    dist = Weighted_Gameny_consensus_step1(M, M[0], Weights)
    # Iterate lazily: there is no need to hold all n! tuples in memory.
    for permutation in permutations(M[0]):
        dist_travail = Weighted_Gameny_consensus_step1(M, permutation, Weights)
        if dist_travail < dist - tolerance:
            # Strictly better candidate: restart the list of optima.
            best_permutations = [np.array(permutation)]
            dist = dist_travail
        elif abs(dist_travail - dist) <= tolerance:
            # Tie with the current optimum (up to rounding).
            best_permutations.append(np.array(permutation))
            dist = min(dist, dist_travail)
    return best_permutations, dist

In [8]:
# One weight per row of M: the first ranking gets 0.7, the other three 0.1 each.
Weights=[0.7, 0.1, 0.1, 0.1]
Weighted_Kemeny_consensus(M, Weights)

([array([3, 2, 1, 0])], 0.16666666666666669)

In [9]:
# From the correlations we can deduce a ranking: rank 0 goes to the largest value.
def using_indexed_assignment(x):
    """Return the descending rank of each entry of ``x``.

    The largest value receives rank 0, the next largest rank 1, and so on.
    The result is an integer numpy array of the same length as ``x``.
    """
    size = len(x)
    # Sorting the negated values ascending lists positions from largest to
    # smallest original value.
    descending_order = (-x).argsort()
    ranks = np.empty(size, dtype=int)
    # The position found k-th in that order is assigned rank k.
    ranks[descending_order] = np.arange(size)
    return ranks

def get_ranking(df):
    """Replace every column of ``df`` by the descending ranks of its values.

    Each column is ranked independently with ``using_indexed_assignment``
    (rank 0 for the largest value). The index and column labels are kept.

    Parameters
    ----------
    df : pandas.DataFrame of numeric scores.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same labels holding the ranks. The input frame is
        left untouched.
    """
    # Work on a copy: assigning into ``df`` itself would overwrite the
    # caller's scores with ranks.
    ranking = df.copy()
    for col in df.columns:
        ranking[col] = using_indexed_assignment(df[col])
    return ranking

# Testing our methods on real data

In [10]:
def _load_abs_correlations(method, language_pair):
    """Load one 2016 correlation table as absolute values.

    Reads ``data/corr_<method>_2016_<language_pair>.csv``, uses the
    'Unnamed: 0' column as index, takes absolute values and drops the rows
    'depth', 'rouge2' and 'human_scores'.
    """
    # Forward slashes work on every platform; the previous 'data\corr_...'
    # literals relied on '\c' being an unrecognised escape sequence and on a
    # Windows-style path separator.
    return (
        pd.read_csv(f'data/corr_{method}_2016_{language_pair}.csv')
        .set_index('Unnamed: 0')
        .abs()
        .drop(['depth', 'rouge2', 'human_scores'], axis=0)
    )


df_Kendall_cs_en = _load_abs_correlations('kendall', 'cs-en')
df_Pearson_cs_en = _load_abs_correlations('pearson', 'cs-en')
df_Spearman_cs_en = _load_abs_correlations('spearman', 'cs-en')
df_Kendall_de_en = _load_abs_correlations('kendall', 'de-en')
df_Pearson_de_en = _load_abs_correlations('pearson', 'de-en')
df_Spearman_de_en = _load_abs_correlations('spearman', 'de-en')
df_Kendall_fi_en = _load_abs_correlations('kendall', 'fi-en')
df_Pearson_fi_en = _load_abs_correlations('pearson', 'fi-en')
df_Spearman_fi_en = _load_abs_correlations('spearman', 'fi-en')
df_Kendall_ro_en = _load_abs_correlations('kendall', 'ro-en')
df_Pearson_ro_en = _load_abs_correlations('pearson', 'ro-en')
df_Spearman_ro_en = _load_abs_correlations('spearman', 'ro-en')
df_Kendall_ru_en = _load_abs_correlations('kendall', 'ru-en')
df_Pearson_ru_en = _load_abs_correlations('pearson', 'ru-en')
df_Spearman_ru_en = _load_abs_correlations('spearman', 'ru-en')

In [11]:
def _rank_against_human(corr_df, label):
    """Rank the rows of ``corr_df`` by their 'human_scores' column.

    The column is renamed to ``label``, ranked with ``get_ranking`` and the
    result is transposed so that it becomes a single row named ``label``.
    """
    human_column = corr_df.loc[:, ['human_scores']]
    renamed = human_column.rename(columns= {'human_scores' : label})
    return get_ranking(renamed).transpose()


ranking_Kendall_cs = _rank_against_human(df_Kendall_cs_en, 'Kendall_cs')
ranking_Pearson_cs = _rank_against_human(df_Pearson_cs_en, 'Pearson_cs')
ranking_Spearman_cs = _rank_against_human(df_Spearman_cs_en, 'Spearman_cs')
ranking_Kendall_de = _rank_against_human(df_Kendall_de_en, 'Kendall_de')
ranking_Pearson_de = _rank_against_human(df_Pearson_de_en, 'Pearson_de')
ranking_Spearman_de = _rank_against_human(df_Spearman_de_en, 'Spearman_de')
ranking_Kendall_fi = _rank_against_human(df_Kendall_fi_en, 'Kendall_fi')
ranking_Pearson_fi = _rank_against_human(df_Pearson_fi_en, 'Pearson_fi')
ranking_Spearman_fi = _rank_against_human(df_Spearman_fi_en, 'Spearman_fi')
ranking_Kendall_ro = _rank_against_human(df_Kendall_ro_en, 'Kendall_ro')
ranking_Pearson_ro = _rank_against_human(df_Pearson_ro_en, 'Pearson_ro')
ranking_Spearman_ro = _rank_against_human(df_Spearman_ro_en, 'Spearman_ro')
ranking_Kendall_ru = _rank_against_human(df_Kendall_ru_en, 'Kendall_ru')
ranking_Pearson_ru = _rank_against_human(df_Pearson_ru_en, 'Pearson_ru')
ranking_Spearman_ru = _rank_against_human(df_Spearman_ru_en, 'Spearman_ru')

In [12]:
# Only the value of the last expression of a cell is rendered, so of these
# three frames only ranking_Spearman_ro appears in the output.
ranking_Kendall_ro
ranking_Pearson_ro
ranking_Spearman_ro

Unnamed: 0,bary,bertscore,bleu,chrf,meteor,rouge1,rougeL,sacrebleu,ter
Spearman_ro,1,0,8,2,3,5,4,6,7


In [13]:
#pd.concat([ranking_Kendall_cs, ranking_Pearson_cs, ranking_Spearman_cs,
#           ranking_Kendall_de, ranking_Pearson_de, ranking_Spearman_de,
#           ranking_Kendall_fi, ranking_Pearson_fi, ranking_Spearman_fi,
#           ranking_Kendall_ro, ranking_Pearson_ro, ranking_Spearman_ro,
#           ranking_Kendall_ru, ranking_Pearson_ru, ranking_Spearman_ru], axis = 0)

# Stack the single-row rankings of the five language pairs (cs, de, fi, ro, ru)
# into one table per correlation measure.
Kendall_scores = pd.concat(
    [
        ranking_Kendall_cs,
        ranking_Kendall_de,
        ranking_Kendall_fi,
        ranking_Kendall_ro,
        ranking_Kendall_ru,
    ],
    axis=0,
)

Pearson_scores = pd.concat(
    [
        ranking_Pearson_cs,
        ranking_Pearson_de,
        ranking_Pearson_fi,
        ranking_Pearson_ro,
        ranking_Pearson_ru,
    ],
    axis=0,
)

Spearman_scores = pd.concat(
    [
        ranking_Spearman_cs,
        ranking_Spearman_de,
        ranking_Spearman_fi,
        ranking_Spearman_ro,
        ranking_Spearman_ru,
    ],
    axis=0,
)

# Results

In [14]:
# Consensus over the Spearman-based rankings. The search is exhaustive over
# every permutation of a row, so it is slow for rows of this length.
Kemeny_consensus(Spearman_scores.to_numpy())

([array([1, 0, 8, 4, 3, 5, 2, 7, 6])], 0.2222222222222222)

In [15]:
# Consensus over the Pearson-based rankings. The search is exhaustive over
# every permutation of a row, so it is slow for rows of this length.
Kemeny_consensus(Pearson_scores.to_numpy())

([array([1, 0, 8, 3, 4, 5, 2, 6, 7])], 0.2222222222222222)

In [16]:
# Consensus over the Kendall-based rankings. The search is exhaustive over
# every permutation of a row, so it is slow for rows of this length.
Kemeny_consensus(Kendall_scores.to_numpy())

([array([1, 0, 8, 4, 3, 5, 2, 7, 6])], 0.3055555555555556)