In [1]:
import os
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from scipy.stats import kendalltau
import numpy as np

In [2]:
# Collect every Excel workbook (*.xlsx) in the current working directory.
# os.listdir() returns entries in arbitrary order; sorting makes the list
# reproducible, which matters because a later cell selects workbooks by
# position within this list.
files = sorted(
    entry for entry in os.listdir() if entry.endswith(".xlsx")
)

In [6]:
# Build one summary record per (language, sheet).
#
# For each language, workbooks whose file name starts with the language code
# are taken from `files`. The first workbook is always read; when a second one
# exists, the two score columns are compared (exact-match rate, Cohen's kappa,
# Kendall's tau) and the mean of the two column means is stored. With a single
# workbook only the mean score is stored.
#
# Each record in `results` has the keys "language", "model", "mean", plus
# "exact_match", "kappa" and "tau" when two workbooks were available.
SHEET_NAMES = ["llama", "mistral", "original"]
SCORE_COLUMN = "Human Eval (1-5)"

results = []
for language in ["es", "pt", "fr", "en", "de"]:
    print(language)
    language_files = [file for file in files if file.startswith(language)]

    if not language_files:
        # Indexing language_files[0] here would raise IndexError; report and
        # move on to the next language instead.
        print(f"  no .xlsx file found for {language!r}; skipping")
        continue

    dfs_0 = pd.read_excel(language_files[0], sheet_name=None)
    # dfs_1 is reset for every language so the single-workbook case can never
    # consult a second workbook left over from a previous iteration (or hit an
    # undefined name on a fresh kernel). Workbooks beyond the second are ignored.
    dfs_1 = (
        pd.read_excel(language_files[1], sheet_name=None)
        if len(language_files) > 1
        else None
    )

    for name in SHEET_NAMES:
        if name not in dfs_0:
            continue
        a = dfs_0[name][SCORE_COLUMN].to_list()

        if dfs_1 is None:
            # Only one workbook: no agreement statistics can be computed.
            results.append({
                "language": language,
                "model": name,
                "mean": np.mean(a),
            })
            continue

        if name not in dfs_1:
            continue
        b = dfs_1[name][SCORE_COLUMN].to_list()

        # Best effort: a failing statistic is recorded as None, but the reason
        # is printed rather than silently discarded.
        try:
            kappa = cohen_kappa_score(a, b)
        except Exception as error:
            print(f"  kappa unavailable for {language}/{name}: {error}")
            kappa = None
        try:
            tau = kendalltau(a, b, nan_policy='omit').statistic
        except Exception as error:
            print(f"  tau unavailable for {language}/{name}: {error}")
            tau = None

        results.append({
            "language": language,
            "model": name,
            "exact_match": (np.array(a) == np.array(b)).sum()/len(a),
            "kappa": kappa,
            "tau": tau,
            "mean": (np.mean(a) + np.mean(b))/2,
        })
        

es
pt
fr
en
de


In [7]:
# Turn the list of per-(language, model) records into a table; keys missing
# from a record become NaN in the corresponding column.
df = pd.DataFrame(results)

In [12]:
# Display the summary table with values rounded to three decimal places
# (returns a rounded copy; `df` itself is left unchanged).
df.round(decimals=3)

Unnamed: 0,language,model,exact_match,kappa,tau,mean
0,es,llama,0.65,0.491,0.779,2.285
1,es,mistral,0.905,0.822,0.929,1.762
2,es,original,0.87,0.4,0.413,4.87
3,pt,llama,0.625,0.346,0.521,1.885
4,pt,mistral,0.765,0.243,0.362,1.365
5,pt,original,0.985,0.972,0.973,3.282
6,fr,llama,0.37,0.206,0.491,2.755
7,fr,mistral,0.575,0.407,0.616,2.125
8,fr,original,0.98,-0.005,-0.009,4.985
9,en,llama,,,,4.83
