In [1]:
import pandas as pd
import json

# Containers filled from the JSON-lines log below:
#   prompts: prompt_id -> the whole "prompt" record
#   names:   entity key -> display name
#   runs:    "run" records that carry a "result", extended with "text" and "len"
prompts, names, runs = {}, {}, []

# The encoding is stated explicitly so decoding does not depend on the
# platform's locale default. Assumes the log was written as UTF-8 (the
# standard encoding for JSON) -- TODO confirm against the writer.
with open("main.json.log", "r", encoding="utf-8") as file:
    for line in file:
        # A blank line (e.g. a trailing newline) would make json.loads raise.
        if not line.strip():
            continue

        item = json.loads(line)

        if item["type"] == "entity":
            names[item["key"]] = item["name"]
        elif item["type"] == "prompt":
            prompts[item["prompt_id"]] = item
        elif item["type"] == "run" and "result" in item:
            result = item["result"]

            # A result either wraps its message in a "choices" list (the first
            # choice is used) or holds the "message" directly.
            holder = result["choices"][0] if "choices" in result else result
            item["text"] = holder["message"]["content"]

            # Size of the exchange in characters: response text plus prompt
            # text. The matching prompt record must already have been read.
            item["len"] = len(item["text"]) + len(prompts[item["prompt_id"]]["prompt"])
            runs.append(item)

# One row per kept run record.
df = pd.DataFrame(runs)

In [14]:
# Characters (prompt + response) handled per unit of elapsed time.
df["metric"] = df["len"].div(df["elapsed_time"])

# Median and standard deviation of the metric for each (runner, model) pair.
df_std = df.groupby(["runner", "model"])["metric"].agg(["median", "std"])

# Standard deviation as a percentage of the median, computed column-wise.
spread_pct = 100 * df_std["std"] / df_std["median"]

# Cell text: rounded median followed by the relative spread in parentheses.
df_std["combined"] = [
    f"{median:.0f} (Δ{spread:.0f}%)"
    for median, spread in zip(df_std["median"], spread_pct)
]

# Reshape to one row per model and one column per runner.
flat_stats = df_std.reset_index()
table = flat_stats.pivot(index="model", columns="runner", values="combined")

# Clear the axis labels that the pivot left on the index and the columns.
table.index.name = table.columns.name = None

# Keys of `names` in dict order; a key's position is its display rank.
name_order = [*names]

def sort_key(value):
    """Return the position of `value` in `name_order`, or inf when absent.

    Unknown values all share the rank inf, so they sort after every known key.
    """
    try:
        return name_order.index(value)
    except ValueError:
        return float('inf')

# Order rows and columns by the rank sort_key assigns to each label; labels
# that sort_key does not know get rank inf and therefore end up last.
row_order = sorted(table.index, key=sort_key)
column_order = sorted(table.columns, key=sort_key)
table = table.loc[row_order, column_order]

# Swap raw keys for display names; keys missing from `names` are kept as-is.
table.index = table.index.map(lambda key: names.get(key, key))
table.columns = table.columns.map(lambda key: names.get(key, key))

# Show an empty string where a row/column pair has no value. fillna replaces
# the element-wise DataFrame.applymap call, which is deprecated since
# pandas 2.1, and gives the same result for missing cells.
table = table.fillna("")

table

Unnamed: 0,OpenAI API,Mac M3 Max 128GB,RTX 4090 24GB,RTX 5090 32GB,RTX 5090 16-core CPU,RTX 5090 96-core CPU,RTX A6000 48GB,RTX 6000Ada 48GB,Dual RTX 4090 24GB,Dual RTX 5090 32GB,A100 SXM4 40GB,Mac M1 8GB
ChatGPT 4o,356 (Δ25%),,,,,,,,,,,
o1-mini,430 (Δ37%),,,,,,,,,,,
Llama 2 uncensored 7B (3.8 GB),,351 (Δ76%),790 (Δ129%),871 (Δ148%),,,580 (Δ97%),660 (Δ160%),660 (Δ198%),855 (Δ131%),748 (Δ154%),
DeepSeek-R1 7B (4.7 GB),,240 (Δ18%),665 (Δ22%),967 (Δ25%),,,437 (Δ24%),536 (Δ23%),615 (Δ23%),890 (Δ26%),623 (Δ20%),
Qwen2.5 14B (9.0 GB),,138 (Δ24%),381 (Δ30%),549 (Δ29%),,,255 (Δ31%),322 (Δ28%),372 (Δ37%),510 (Δ37%),368 (Δ29%),
Phi-4 14B (9.1 GB),,136 (Δ25%),418 (Δ31%),607 (Δ34%),,,280 (Δ37%),343 (Δ33%),411 (Δ34%),579 (Δ37%),405 (Δ36%),
DeepSeek-R1 32B (19 GB),,64 (Δ20%),185 (Δ22%),284 (Δ21%),,,129 (Δ23%),151 (Δ23%),170 (Δ24%),234 (Δ22%),185 (Δ23%),
Qwen2.5 32B (19 GB),,65 (Δ22%),200 (Δ31%),314 (Δ28%),,,135 (Δ31%),166 (Δ30%),194 (Δ27%),253 (Δ32%),201 (Δ34%),
Command R 35B (18 GB),,86 (Δ33%),227 (Δ72%),355 (Δ67%),,,156 (Δ39%),189 (Δ85%),216 (Δ54%),313 (Δ61%),262 (Δ66%),
Dolphin Mixtral 8x7B (26 GB),,160 (Δ32%),,403 (Δ108%),,,249 (Δ70%),307 (Δ64%),340 (Δ55%),372 (Δ80%),330 (Δ44%),
