In [21]:
from collections import defaultdict
import pandas as pd
import json

# Load the benchmark log (JSON Lines: one JSON object per line) and dispatch
# each record on its "type" field:
#   names   - entity key -> display name
#   prompts - prompt_id  -> full prompt record
#   scores  - model -> verdict label -> number of times that verdict was given
#   runs    - run records that carry a "result", enriched with text/len/runner
# Minimum share of "ok" among a model's ok/trash verdicts for it to be kept.
MIN_OK_RATIO = 0.95

prompts, names, runs, scores = {}, {}, [], defaultdict(lambda: defaultdict(int))

# Pin the encoding so decoding does not depend on the platform's locale.
with open("main.json.log", "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # letting json.loads raise on them.
            continue
        item = json.loads(line)
        kind = item["type"]

        if kind == "entity":
            names[item["key"]] = item["name"]
        elif kind == "prompt":
            prompts[item["prompt_id"]] = item
        elif kind == "score":
            scores[item["model"]][item["score"]] += 1
        elif kind == "run" and "result" in item:
            result = item["result"]
            # The reply sits either under result["choices"][0] or directly on result.
            item["text"] = (result["choices"][0] if "choices" in result else result)["message"]["content"]
            # Reply length plus prompt length; the matching prompt record must
            # already have been read, otherwise this lookup raises KeyError.
            item["len"] = len(item["text"]) + len(prompts[item["prompt_id"]]["prompt"])
            # Collapse every runner id starting with "rtx-5090" into one label.
            item["runner"] = "rtx-5090" if item["runner"].startswith("rtx-5090") else item["runner"]
            runs.append(item)

# Keep only models whose "ok" share of the ok/trash verdicts reaches the threshold.
ok_models = set()
for model, verdicts in scores.items():
    ok_count = verdicts["ok"]
    judged = ok_count + verdicts["trash"]
    # A model that only has other verdict labels cannot be rated; skip it
    # rather than dividing by zero.
    if judged and ok_count / judged >= MIN_OK_RATIO:
        ok_models.add(model)
runs = [run for run in runs if run["model"] in ok_models]

df = pd.DataFrame(runs)

In [None]:
# Per-run metric: the run's "len" divided by its elapsed time.
df["metric"] = df["len"] / df["elapsed_time"]


def _format_cell(row):
    """Render one (runner, model) group as its rounded median plus warning stars.

    One "*" is appended when std / median exceeds 0.5, and another one when
    the group contains fewer than 40 runs.
    """
    cell = f"{row['median']:.0f}"
    if row["std"] / row["median"] > 0.5:
        cell += "*"
    if row["count"] < 40:
        cell += "*"
    return cell


# Median, spread and sample size of the metric for every (runner, model) pair.
df_std = (
    df.groupby(["runner", "model"])["metric"]
    .agg(["median", "std", "count"])
)
df_std["combined"] = df_std.apply(_format_cell, axis=1)

# Runners become rows, models become columns, cells hold the formatted string.
table = df_std.reset_index().pivot(index="runner", columns="model", values="combined")
table.columns.name = None
table.index.name = None

# Key order of `names`, used as the reference ordering when sorting.
name_order = list(names)

def sort_key(value):
    """Return the position of ``value`` in ``name_order``.

    Values that do not occur in ``name_order`` get ``float('inf')`` so that
    they sort after every known value.
    """
    if value not in name_order:
        return float('inf')
    return name_order.index(value)

# Order rows and columns by their position in name_order; sort_key places
# keys that are not listed there at the end.
table = table.loc[sorted(table.index, key=sort_key)]
table = table[sorted(table.columns, key=sort_key)]

# Replace raw keys with display names, keeping the key when no name is registered.
table.index = table.index.map(lambda key: names.get(key, key))
table.columns = table.columns.map(lambda key: names.get(key, key))

# Combinations without data are NaN after the pivot; show them as empty cells.
# fillna is used instead of DataFrame.applymap, which is deprecated since
# pandas 2.1 and emits a FutureWarning.
table = table.fillna("")

print(table.to_markdown())