In [8]:
from collections import defaultdict
import pandas as pd
import json
import os

# Minimum share of "ok" verdicts (among "ok" + "trash") a model needs for its runs to be kept.
MIN_OK_SHARE = 0.95


def ok_share(verdicts):
    """Return the fraction of "ok" verdicts among the "ok" and "trash" verdicts.

    Args:
        verdicts: mapping from verdict label to the number of times it was seen.

    Returns:
        ok / (ok + trash), or 0.0 when neither label was seen, so that a model
        without any usable verdict is filtered out instead of raising
        ZeroDivisionError.
    """
    ok, trash = verdicts.get("ok", 0), verdicts.get("trash", 0)
    judged = ok + trash
    return ok / judged if judged else 0.0


# prompts: prompt_id -> prompt record; names: entity key -> display name;
# runs: enriched "run" records; scores: model -> verdict label -> count.
prompts, names, runs, scores = {}, {}, [], defaultdict(lambda: defaultdict(int))

# Log files from the working directory, in descending file-name order.
log_files = sorted((file_name for file_name in os.listdir() if file_name.endswith("json.log")), reverse=True)

for log_file in log_files:
    with open(log_file, "r") as file:
        # Each non-empty line is one JSON record, dispatched on its "type" field.
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads

            item = json.loads(line)

            if item["type"] == "entity":
                names[item["key"]] = item["name"]
            elif item["type"] == "prompt":
                prompts[item["prompt_id"]] = item
            elif item["type"] == "score":
                scores[item["model"]][item["score"]] += 1
            elif item["type"] == "run" and "result" in item and "prompt_id" in item:
                result = item["result"]
                # The message sits under choices[0] when "choices" is present, otherwise directly on the result.
                item["text"] = (result["choices"][0] if "choices" in result else result)["message"]["content"]
                # Work size of the run: response characters plus prompt characters.
                # NOTE(review): the prompt record must already have been read at this point — confirm log ordering.
                item["len"] = len(item["text"]) + \
                              len(prompts[item["prompt_id"]]["prompt"])
                # Collapse every runner whose id starts with "rtx-5090" into a single "rtx-5090" runner.
                item["runner"] = "rtx-5090" if item["runner"].startswith("rtx-5090") else item["runner"]
                runs.append(item)

# Keep only runs of models whose answers were judged "ok" often enough.
ok_models = {model for model, verdicts in scores.items() if ok_share(verdicts) >= MIN_OK_SHARE}
runs = [run for run in runs if run["model"] in ok_models]

df = pd.DataFrame(runs)

In [10]:
# Per-run throughput: prompt + response characters per unit of elapsed time.
# NOTE(review): the unit of "elapsed_time" is not visible here — presumably seconds; confirm.
df["metric"] = df["len"] / df["elapsed_time"]

MAX_RELATIVE_STD = 0.5  # a std/median ratio above this marks the cell with "*"
MIN_RUN_COUNT = 40      # fewer runs than this marks the cell with "*"


def format_cell(row):
    """Format one (runner, model) aggregate as the rounded median plus warning stars.

    One "*" is appended when std/median exceeds MAX_RELATIVE_STD and another one
    when the group has fewer than MIN_RUN_COUNT runs.
    """
    noisy = "*" if row["std"] / row["median"] > MAX_RELATIVE_STD else ""
    sparse = "*" if row["count"] < MIN_RUN_COUNT else ""
    return f"{row['median']:.0f}{noisy}{sparse}"


df_std = df.groupby(["runner", "model"])["metric"].agg(["median", "std", "count"])
df_std["combined"] = df_std.apply(format_cell, axis=1)
table = df_std.reset_index().pivot(index="runner", columns="model", values="combined")

table.index.name = None
table.columns.name = None

# Rows and columns follow the order in which entities appear in `names`;
# keys that are not in `names` sort last.
name_order = list(names.keys())
name_rank = {key: position for position, key in enumerate(name_order)}


def sort_key(value):
    """Return the position of `value` in `name_order`, or infinity when it is absent."""
    return name_rank.get(value, float('inf'))


table = table.loc[sorted(table.index, key=sort_key)]
table = table[sorted(table.columns, key=sort_key)]

# Replace entity keys with their display names where one is known.
table.index = table.index.map(lambda x: names.get(x, x))
table.columns = table.columns.map(lambda x: names.get(x, x))

# Blank out missing runner/model combinations (replaces the deprecated DataFrame.applymap).
table = table.fillna("")

table

Unnamed: 0,Phi-4 14B,DeepSeek-R1 32B,Qwen2.5 32B,DeepSeek-R1 70B,Llama 3.3 70B,ChatGPT 4o,o1-mini
OpenAI API,,,,,,356.0,430.0
"RTX 4090 24GB, 3400€",418.0,185.0,200.0,,,,
"RTX 5090 32GB, 5600€",607.0,284.0,314.0,23*,,,
Dual RTX 4090 24GB 6000€,411.0,170.0,194.0,91,95,,
"Mac M3 Max 128GB, 6250€",136.0,64.0,65.0,28,29,,
RTX A6000 48GB 7550€,280.0,129.0,135.0,66,70,,
Dual RTX 5090 32GB 8100€,579.0,234.0,253.0,129,136,,
RTX 6000Ada 48GB 10 000€,343.0,151.0,166.0,78,79,,
A100 SXM4 40GB 23 000€,405.0,185.0,201.0,,,,
RTX 4080 16GB,303.0,,,,,,


In [13]:
# Emit the summary table as Markdown source text.
markdown_table = table.to_markdown()
print(markdown_table)

|                          | Phi-4 14B   | DeepSeek-R1 32B   | Qwen2.5 32B   | DeepSeek-R1 70B   | Llama 3.3 70B   | ChatGPT 4o   | o1-mini   |
|:-------------------------|:------------|:------------------|:--------------|:------------------|:----------------|:-------------|:----------|
| OpenAI API               |             |                   |               |                   |                 | 356          | 430       |
| RTX 4090 24GB, 3400€     | 418         | 185               | 200           |                   |                 |              |           |
| RTX 5090 32GB, 5600€     | 607         | 284               | 314           | 23*               |                 |              |           |
| Dual RTX 4090 24GB 6000€ | 411         | 170               | 194           | 91                | 95              |              |           |
| Mac M3 Max 128GB, 6250€  | 136         | 64                | 65            | 28                | 29              |              |     