In [None]:
import json
from pathlib import Path

# Root directory of the experiment results; the relative path is resolved
# against the kernel's current working directory.
RESULTS_ROOT = Path("../results")
# Fail fast, and say which location was checked, if the directory is missing.
assert RESULTS_ROOT.exists(), f"results directory not found: {RESULTS_ROOT.resolve()}"

In [None]:
from collections import defaultdict

# Experiment directory (under RESULTS_ROOT) that holds each task's results.
experiment_by_task = {
    "decontextual": "icml_eval_bias_gen_dctx_gptj",
    "contextual": "icml_eval_bias_gen_gptj",
}
# Metrics file, relative to the experiment directory, for each method.
metrics_path_by_method = {
    "baseline": "baseline.json",
    r"\ourmethod": "linear/11/error_correction_metrics.json",
}

# results_by_method[method][task] -> metrics object parsed from JSON.
results_by_method = defaultdict(dict)
for task, experiment_name in experiment_by_task.items():
    for method, relative_path in metrics_path_by_method.items():
        results_file = RESULTS_ROOT.joinpath(experiment_name, relative_path)
        assert results_file.exists(), results_file
        with results_file.open("r") as handle:
            loaded = json.load(handle)
        # The baseline file nests its metrics under a top-level "metrics" key;
        # the other file is used as loaded.
        results_by_method[method][task] = (
            loaded["metrics"] if method == "baseline" else loaded
        )

In [None]:
# Display the loaded metrics (method -> task -> metrics) for a visual check.
results_by_method

In [None]:
def to_interval(std, n=5000):
    """Return the half-width of a 95% normal-approximation interval for a mean.

    Args:
        std: Sample standard deviation of the measured quantity.
        n: Number of samples the mean was computed over. Defaults to 5000,
            the constant that was previously hard-coded here (presumably the
            evaluation set size -- TODO confirm).

    Returns:
        ``1.96 * std / sqrt(n)``.
    """
    # The standard error of a mean shrinks with sqrt(n), not with n itself.
    return 1.96 * std / n ** 0.5

def latexify(x):
    """Wrap ``x`` in dollar signs, i.e. LaTeX inline math delimiters."""
    return "${}$".format(x)

def format_task(task):
    """Render one task's metrics as two LaTeX math-mode table cells.

    Args:
        task: Mapping with a numeric ``"top1_accuracy"`` entry and a
            ``"fluency"`` mapping that has a numeric ``"mean"`` entry.

    Returns:
        A 2-tuple of strings ``(accuracy_cell, fluency_cell)``. Accuracy is
        printed with two decimals and leading zeros stripped (0.5 becomes
        ``"$.50$"``); the fluency mean is multiplied by 100 and printed with
        one decimal.
    """
    # Drop the leading "0" so accuracies read ".87" rather than "0.87".
    accuracy_cell = f"{task['top1_accuracy']:.2f}".lstrip("0")
    # Only the mean is reported, so the fluency "std" entry is not read.
    fluency_cell = f"{task['fluency']['mean'] * 100:.1f}"
    return latexify(accuracy_cell), latexify(fluency_cell)

def format_scores(results):
    """Flatten the formatted cells of both tasks into a single tuple.

    Args:
        results: Mapping with ``"contextual"`` and ``"decontextual"`` entries,
            each of which is passed to ``format_task``.

    Returns:
        Tuple holding the contextual cells followed by the decontextual cells.
    """
    cells = []
    for task_name in ("contextual", "decontextual"):
        cells.extend(format_task(results[task_name]))
    return tuple(cells)

def format_all(method, results_by_method):
    """Build one LaTeX table row for ``method``.

    Args:
        method: Key into ``results_by_method``; its ``str.capitalize()`` form
            is used as the row label.
        results_by_method: Mapping from method name to that method's per-task
            results, as consumed by ``format_scores``.

    Returns:
        The label and score cells joined with ``" & "`` and terminated with
        a LaTeX row break (``" \\\\"``).
    """
    method_results = results_by_method[method]
    row = [method.capitalize()]
    row += format_scores(method_results)
    return " & ".join(row) + r" \\"

# Print one LaTeX table row per method, baseline first.
for method_name in ("baseline", r"\ourmethod"):
    print(format_all(method_name, results_by_method))