In [1]:
import json
import pandas as pd

In [2]:
import os
# Configure CUDA device ordering and visibility through environment variables
# (see issue #152). Both keys are written in a single call, in the same order
# as the original two assignments.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

In [3]:
# Collect the per-language-pair evaluation scores of every model into `df`.
#
# Each result is read from
#   {root_dir}/{dataset}.{lp}/{framework}/{model}/evaluation.json
# and contributes one row: model, dataset, lp, framework, then one column per
# entry of `metrics` (None when the JSON file has no such key).
root_dir = "../TowerEval-Data-v0.1/evaluations/0_shot_tower_instruct/mt"

# Model name -> settings. "framework" is used only as a path component below.
# NOTE(review): the API systems (google, gpt-4, ...) are also listed under
# "vllm"; presumably that is just the directory their outputs were stored in
# -- confirm.
models_dict = {
    "TowerInstruct-7B-v0.2": {"framework": "vllm"},
    "TowerInstruct-7B-v0.2-SFT-xCOMET-Ensemble-inc7b": {"framework": "vllm"},
    "TowerInstruct-7B-v0.2-DPO-1e-7-xCOMET-Ensemble-inc7b": {"framework": "vllm"},
    "TowerInstruct-7B-v0.2-DPO-BASE-xCOMET-Ensemble-inc7b": {"framework": "vllm"},
    "TowerInstruct-7B-v0.2-DPO-BASE-SFT-xCOMET-Ensemble-inc7b": {"framework": "vllm"},
    "TowerInstruct-7B-v0.2-CPO-xcomet-ensemble-inc7b-fix": {"framework": "vllm"},
    "TowerInstruct-13B-v0.2": {"framework": "vllm"},
    "TowerInstruct-13B-v0.2-SFT-xCOMET-Ensemble": {"framework": "vllm"},
    "TowerInstruct-13B-v0.2-CPO-xcomet-ensemble-inc7b-fix": {"framework": "vllm"},
    "TowerInstruct-7B-v0.2-CPO-6lp-xCOMET-Ensemble-inc7b": {"framework": "vllm"},
    "TowerInstruct-7B-v0.2-CPO-xCOMET-KIWI-inc7b": {"framework": "vllm"},
    "TowerInstruct-7B-v0.2-CPO-ALMA-R-6k": {"framework": "vllm"},
    "TowerInstruct-7B-v0.2-CPO-ALMA-R": {"framework": "vllm"},

    "google": {"framework": "vllm"},
    "gpt-4": {"framework": "vllm"},
    "gpt-3.5-turbo": {"framework": "vllm"},
    "alma-r": {"framework": "vllm"},
}

# Dataset name -> language pairs to load for it.
datasets = {
    "wmt23": ["en-de", "en-zh", "en-ru", "de-en", "zh-en", "ru-en"],
    # "flores": ["en-de", "en-zh", "en-ru", "en-pt",  "en-it", "en-fr",  "en-ko","en-es", "en-nl", "de-en",  "zh-en",   "ru-en", "pt-en", "it-en", "fr-en", "ko-en", "es-en",  "nl-en"],
}

# Keys looked up in each evaluation.json; each becomes a column of `df`.
metrics = ["chrf", "comet", "xcomet_xxl", "metricx_xxl"]

# Column-oriented accumulator. The metric columns are created up front so the
# resulting frame always has them, even when the loops below visit no file.
data_dict = {"model": [], "dataset": [], "lp": [], "framework": []}
data_dict.update({metric: [] for metric in metrics})

for model, model_dict in models_dict.items():
    framework = model_dict["framework"]
    for dataset, lps in datasets.items():
        for lp in lps:
            print(f"Processing {model} {dataset} {lp}")
            eval_path = f"{root_dir}/{dataset}.{lp}/{framework}/{model}/evaluation.json"
            # A missing or unreadable file raises here instead of being
            # skipped, so every model keeps exactly one row per language pair.
            with open(eval_path, "r") as f:
                data = json.load(f)

            data_dict["model"].append(model)
            data_dict["dataset"].append(dataset)
            data_dict["lp"].append(lp)
            data_dict["framework"].append(framework)
            for metric in metrics:
                # Metrics absent from the file are recorded as None.
                data_dict[metric].append(data.get(metric))

df = pd.DataFrame(data_dict)

Processing TowerInstruct-7B-v0.2 wmt23 en-de
Processing TowerInstruct-7B-v0.2 wmt23 en-zh
Processing TowerInstruct-7B-v0.2 wmt23 en-ru
Processing TowerInstruct-7B-v0.2 wmt23 de-en
Processing TowerInstruct-7B-v0.2 wmt23 zh-en
Processing TowerInstruct-7B-v0.2 wmt23 ru-en
Processing TowerInstruct-7B-v0.2-SFT-xCOMET-Ensemble-inc7b wmt23 en-de
Processing TowerInstruct-7B-v0.2-SFT-xCOMET-Ensemble-inc7b wmt23 en-zh
Processing TowerInstruct-7B-v0.2-SFT-xCOMET-Ensemble-inc7b wmt23 en-ru
Processing TowerInstruct-7B-v0.2-SFT-xCOMET-Ensemble-inc7b wmt23 de-en
Processing TowerInstruct-7B-v0.2-SFT-xCOMET-Ensemble-inc7b wmt23 zh-en
Processing TowerInstruct-7B-v0.2-SFT-xCOMET-Ensemble-inc7b wmt23 ru-en
Processing TowerInstruct-7B-v0.2-DPO-1e-7-xCOMET-Ensemble-inc7b wmt23 en-de
Processing TowerInstruct-7B-v0.2-DPO-1e-7-xCOMET-Ensemble-inc7b wmt23 en-zh
Processing TowerInstruct-7B-v0.2-DPO-1e-7-xCOMET-Ensemble-inc7b wmt23 en-ru
Processing TowerInstruct-7B-v0.2-DPO-1e-7-xCOMET-Ensemble-inc7b wmt23 de-en


In [4]:
# Multiply the COMET and xCOMET-XXL columns by 100 (presumably converting
# scores stored in [0, 1] to percentage points -- confirm against the
# evaluation files).
#
# The rescaling mutates `df` in place, so running this cell twice on the same
# frame would scale the scores by 10,000. A marker kept in `df.attrs` turns a
# repeated run into a no-op; a freshly built `df` carries no marker and is
# rescaled as usual.
PERCENT_COLUMNS = ["comet", "xcomet_xxl"]

if not df.attrs.get("scores_as_percent", False):
    for column in PERCENT_COLUMNS:
        df[column] = df[column] * 100
    df.attrs["scores_as_percent"] = True


In [None]:
# %%
import numpy as np


# For every dataset, print one LaTeX-style table row per model holding the
# mean of each metric over the en->xx language pairs followed by the mean over
# the xx->en language pairs.
METRIC_COLUMNS = ["chrf", "comet", "xcomet_xxl", "metricx_xxl"]

for dataset in datasets:
    print(f" ################## {dataset} ################")
    print(f'{"name":<60} chrf_en_xx comet_en_xx xcomet_en_xx metricx_en_xx chrf_xx_en comet_xx_en xcomet_xx_en metricx_xx_en')

    # Only rows of the dataset being reported; without this filter the tables
    # of different datasets would mix each other's scores.
    dataset_rows = df[df.dataset == dataset]

    # Translation direction is read from the language-pair code itself rather
    # than from the position of the pair in the configured list.
    from_english = dataset_rows.lp.str.startswith("en-")
    into_english = dataset_rows.lp.str.endswith("-en")

    for model_name in dataset_rows.model.unique():
        is_model = dataset_rows.model == model_name

        # Means are taken over the unrounded scores; rounding happens only
        # when the row is formatted. Missing scores (None/NaN) are left out of
        # the mean instead of aborting the whole table.
        en_xx_means = dataset_rows.loc[is_model & from_english, METRIC_COLUMNS].astype(float).mean()
        xx_en_means = dataset_rows.loc[is_model & into_english, METRIC_COLUMNS].astype(float).mean()

        chrf_en_xx, comet_en_xx, xcomet_en_xx, metricx_en_xx = en_xx_means
        chrf_xx_en, comet_xx_en, xcomet_xx_en, metricx_xx_en = xx_en_means

        print(f"{model_name:<20} & ${chrf_en_xx:.2f}$ & ${comet_en_xx:.2f}$ & ${xcomet_en_xx:.2f}$ & ${metricx_en_xx:.2f}$ & &&  ${chrf_xx_en:.2f}$ & ${comet_xx_en:.2f}$ & ${xcomet_xx_en:.2f}$ & ${metricx_xx_en:.2f}$ & ")


 ################## wmt23 ################
name                                                         chrf_en_xx comet_en_xx xcomet_en_xx metricx_en_xx chrf_xx_en comet_xx_en xcomet_xx_en metricx_xx_en
TowerInstruct-7B-v0.2 & $52.25$ & $84.32$ & $85.32$ & $1.78$ & &&  $58.87$ & $82.77$ & $88.77$ & $2.20$ & 
TowerInstruct-7B-v0.2-SFT-xCOMET-Ensemble-inc7b & $53.29$ & $84.26$ & $85.11$ & $1.92$ & &&  $59.30$ & $82.79$ & $89.16$ & $2.29$ & 
TowerInstruct-7B-v0.2-DPO-1e-7-xCOMET-Ensemble-inc7b & $53.27$ & $84.85$ & $85.63$ & $1.73$ & &&  $59.86$ & $83.18$ & $89.56$ & $2.13$ & 
TowerInstruct-7B-v0.2-DPO-BASE-xCOMET-Ensemble-inc7b & $49.90$ & $84.64$ & $86.14$ & $1.44$ & &&  $58.34$ & $83.05$ & $89.73$ & $1.87$ & 
TowerInstruct-7B-v0.2-DPO-BASE-SFT-xCOMET-Ensemble-inc7b & $52.42$ & $84.99$ & $86.37$ & $1.58$ & &&  $59.43$ & $83.16$ & $89.60$ & $2.03$ & 
TowerInstruct-7B-v0.2-CPO-xcomet-ensemble-inc7b-fix & $52.95$ & $85.05$ & $86.43$ & $1.59$ & &&  $59.62$ & $83.14$ & $89.70$ & $2.04$ & 
T