In [24]:
import pandas as pd
import json

# Task ids that make up the benchmark corpus. Log records whose task_id is not
# in this set are ignored, and a (runner, model) pair is only reported once it
# has a result for every id listed here.
prompts_corpus = {
    2, 255, 4, 135, 8, 137, 138, 136, 7, 269,
    397, 15, 17, 18, 276, 21, 152, 286, 159, 160,
    161, 164, 46, 181, 196, 203, 209, 84, 85, 217,
    352, 230, 105, 235, 239, 246, 249, 378, 251, 127,
}

# Runner key (as found in the log) -> display label. Insertion order is reused
# later as the column order of the final table.
runners = {
    "oai": "OpenAI API",
    "m3-max": "Mac M3 Max 128GB",
    "rtx-4090": "RTX 4090 24GB",
    "rtx-5090": "RTX 5090 32GB",
    "rtx-a6000": "RTX A6000 48GB",
    "rtx-6000ada": "RTX 6000Ada 48GB",
    "dual-rtx-4090": "Dual RTX 4090 24GB",
    "dual-rtx-5090": "Dual RTX 5090 32GB",
    "a100": "A100 SXM4 40GB",
}

# Locally served model id -> display label.
ollama_models = {
    "llama2-uncensored:7b": "Llama 2 uncensored 7B (3.8 GB)",
    "deepseek-r1:7b": "DeepSeek-R1 7B (4.7 GB)",
    "qwen2.5:14b": "Qwen2.5 14B (9.0 GB)",
    "phi4:14b": "Phi-4 14B (9.1 GB)",
    "deepseek-r1:32b": "DeepSeek-R1 32B (19 GB)",
    "qwen2.5:32b": "Qwen2.5 32B (19 GB)",
    "command-r:latest": "Command R 35B (18 GB)",
    "dolphin-mixtral:8x7b": "Dolphin Mixtral 8x7B (26 GB)",
    "deepseek-r1:70b": "DeepSeek-R1 70B (42 GB)",
    "llama3.3:70b": "Llama 3.3 70B (42 GB)",
}

# Hosted model id -> display label. Any record whose model is listed here is
# attributed to the "oai" runner regardless of its own runner field.
oai_models = {
    "gpt-4o": "ChatGPT 4o",
    "o1-mini": "o1-mini",
    "o1-preview": "o1",
}

# Combined lookup, hosted models first; insertion order is reused later as the
# row order of the final table.
models = dict(oai_models)
models.update(ollama_models)

def get_text(result):
    """Return the generated message content from a logged result record.

    Two payload layouts are tried in order:

    1. ``result["result"]["message"]["content"]``
    2. ``result["result"]["choices"][0]["message"]["content"]``

    The first lookup that completes without raising wins, even if the value it
    yields is ``None``. When neither layout matches, ``None`` is returned
    instead of raising.
    """
    extractors = (
        lambda rec: rec["result"]["message"]["content"],
        lambda rec: rec["result"]["choices"][0]["message"]["content"],
    )

    for extract in extractors:
        try:
            return extract(result)
        except Exception:
            # Layout did not match; fall through to the next candidate.
            continue

    return None

# Prompt texts, looked up by task id when computing total_len below.
# NOTE(review): the lookup uses the integer task_id, so this presumably loads
# as a list (or something else int-indexable) - confirm against the file.
with open("in-prompts.json", "r") as file:
    prompts = json.load(file)

# One flat record per usable log line; the log holds one JSON object per line.
results = []

with open("out-results.json.log", "r") as file:
    for line in file:
        # A blank line would make json.loads raise; skip it.
        if not line.strip():
            continue

        result = json.loads(line)
        model = result.get("model")
        # Hosted models are always attributed to the "oai" runner, whatever
        # runner value the record itself carries.
        runner = "oai" if (model in oai_models) else result.get("runner")
        prompt_id = result.get("task_id")

        # Keep only complete records for known runners/models whose task is
        # part of the benchmark corpus.
        usable = (
            "result" in result
            and "elapsed_time" in result
            and prompt_id in prompts_corpus
            and runner in runners
            and model in models
        )
        if not usable:
            continue

        # get_text returns None when the payload matches neither known layout.
        # Such a record has no measurable output, so skip it rather than let
        # len(None) abort the whole load.
        text = get_text(result)
        if text is None:
            continue

        results.append({
            "runner": runners[runner],
            "model": models[model],
            "prompt_id": prompt_id,
            # Combined size of prompt and response, in characters.
            "total_len": len(prompts[prompt_id]) + len(text),
            "elapsed_time": result["elapsed_time"]
        })

# Explicit columns keep the frame well-formed (and drop_duplicates working)
# even when no record survived the filters. For repeated
# (runner, model, prompt) runs, the first one in the log is kept.
df = pd.DataFrame(
    results,
    columns=["runner", "model", "prompt_id", "total_len", "elapsed_time"],
).drop_duplicates(subset=["runner", "model", "prompt_id"])

In [None]:
# Number of de-duplicated results available per (runner, model) pair.
group_sizes = df.groupby(["runner", "model"]).size()
df_count = group_sizes.reset_index(name="num_items")

# Models as rows, runners as columns; pairs with no results show as NaN.
df_count.pivot(index="model", columns="runner", values="num_items")


In [None]:
# Keep only (runner, model) groups that have a result for every prompt in the
# corpus, so every median below is taken over the same set of prompts.
# `ratio` is characters (prompt + response) per unit of elapsed_time.
# Built with .assign so the new column lands on a fresh frame instead of being
# written into the output of groupby.filter (avoids SettingWithCopyWarning).
df_filtered = (
    df.groupby(["runner", "model"])
    .filter(lambda g: len(g) == len(prompts_corpus))
    .assign(ratio=lambda d: d["total_len"] / d["elapsed_time"])
)

df_median = df_filtered.groupby(["runner", "model"])["ratio"].median().reset_index()
table = df_median.pivot(index="model", columns="runner", values="ratio")

# Drop the axis labels so the rendered table has no "model"/"runner" header.
table.index.name = None
table.columns.name = None

# Order rows and columns by their declaration order in `models` / `runners`,
# keeping only the labels that actually made it into the table.
runners_sorted = [r for r in runners.values() if r in table.columns]
models_sorted = [m for m in models.values() if m in table.index]

table = table.loc[models_sorted, runners_sorted]

# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when it exists and fall back on older pandas.
format_cells = table.map if hasattr(pd.DataFrame, "map") else table.applymap

# Truncate medians to whole numbers; show missing combinations as blank cells.
format_cells(lambda x: int(x) if pd.notna(x) else "")