In [None]:
import os

from metrics import load_all_metrics
from benchmark_datasets import get_wmt_data
import pandas as pd
from datetime import datetime

In [None]:
# Create the result directory for this run.
# `now` is a second-resolution timestamp that identifies the run; every later
# cell builds its output paths from it. It is formatted explicitly with only
# "-" and "_" separators: the default str(datetime) form contains a space and
# colons, and ":" is not a legal character in Windows file names.
# The format keeps the same 19-character width and still sorts chronologically.
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
os.makedirs(f"results/run_{now}", exist_ok=True)

In [None]:
# Fetch the benchmark data and every metric to evaluate, then start the
# results table with the human scores as its first column.
dataset = get_wmt_data()
metrics = load_all_metrics()
results = pd.DataFrame().assign(human_scores=dataset.human_scores)

In [None]:
# Run every metric on the dataset and collect its scores as columns
checkpoint_path = f"results/run_{now}/checkpoint.csv"
for metric in metrics.values():
    # A metric returns a mapping of sub-metric name -> scores;
    # each entry becomes its own column in the results table.
    sub_scores = metric(dataset.references, dataset.candidates)
    for column_name, column_values in sub_scores.items():
        results[column_name] = column_values

    # Persist what has been computed so far after each metric, so a crash
    # does not lose the scores of the metrics that already finished.
    results.to_csv(checkpoint_path, index=False)

In [None]:
# Save the final result and delete the checkpoint.
# Columns are ordered with "human_scores" first, followed by every other
# column in sorted order.
columns_in_right_order = ["human_scores"] + [
    column for column in sorted(results.columns) if column != "human_scores"
]
results = results.reindex(columns_in_right_order, axis=1)
results.to_csv(f"results/run_{now}/final.csv", index=False)

# The checkpoint is redundant once final.csv is written. Only remove it if it
# is actually there: it is absent when this cell is re-run, or when no metric
# was evaluated, and an unconditional os.remove would raise FileNotFoundError.
checkpoint_path = f"results/run_{now}/checkpoint.csv"
if os.path.exists(checkpoint_path):
    os.remove(checkpoint_path)

In [None]:
# Write one correlation matrix per method, computed over all result columns,
# into a dedicated sub-directory of the run folder.
correlations_dir = f"results/run_{now}/correlations"
os.makedirs(correlations_dir, exist_ok=True)
for method in ("pearson", "spearman", "kendall"):
    correlation_matrix = results.corr(method=method)
    correlation_matrix.to_csv(f"{correlations_dir}/{method}.csv")