In [1]:
from collections import defaultdict
import json
import os

# Ids of the tasks that take part in this analysis.
task_ids = {
    2, 255, 4, 135, 8, 137, 138, 136, 7, 269,
    397, 15, 17, 18, 276, 21, 152, 286, 159, 160,
    161, 164, 46, 181, 196, 203, 209, 84, 85, 217,
    352, 230, 105, 235, 239, 246, 249, 378, 251, 127,
}

# Model names, grouped into two lists; the order of ollama_models is significant
# wherever the list is iterated to build per-model rows.
ollama_models = [
    "llama2-uncensored:7b",
    "deepseek-r1:7b",
    "qwen2.5:14b",
    "phi4:14b",
    "deepseek-r1:32b",
    "qwen2.5:32b",
    "command-r:latest",
    "dolphin-mixtral:8x7b",
    "deepseek-r1:70b",
    "llama3.3:70b",
]
oai_models = [
    "gpt-4o",
    "o1-mini",
    "o1-preview",
]

# Union of both groups, used for membership checks.
models = {*ollama_models, *oai_models}

# Load the prompts file. Its structure is not visible here; it is only parsed and kept.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open("in-prompts.json", "r", encoding="utf-8") as prompts_file:
    prompts = json.load(prompts_file)

# (runner, model) -> {task_id: result record}
runs = defaultdict(dict)

# Models whose records are always attributed to the "oai" runner.
# NOTE(review): "o1-preview" is listed in oai_models but not here, so its records keep
# whatever "runner" value they carry (or "unknown") — confirm whether that is intended.
forced_oai_models = ("gpt-4o", "o1-mini")

# os.listdir() yields names in arbitrary order. When several files hold a record for the
# same (runner, model, task_id), the last one read wins, so sort to make that reproducible.
for filename in sorted(os.listdir(".")):
    # Only regular files named out-results* are result logs; skip everything else.
    if not filename.startswith("out-results") or not os.path.isfile(filename):
        continue

    # Each result file is read as JSON Lines: one JSON object per line.
    with open(filename, "r", encoding="utf-8") as results_file:
        for line in results_file:
            # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
            if not line.strip():
                continue

            item = json.loads(line)
            model = item.get("model")
            runner = "oai" if model in forced_oai_models else item.get("runner", "unknown")
            task_id = item.get("task_id")

            # Keep only records for selected tasks and known models that carry both a
            # result payload and a timing.
            if task_id in task_ids and model in models and "result" in item and "elapsed_time" in item:
                runs[(runner, model)][task_id] = item

In [None]:
import numpy as np

# Two-level mapping (runner -> model -> value); any model that was never assigned
# reads back as the placeholder string "N/A".
stats = defaultdict(
    lambda: defaultdict(lambda: "N/A")
)

def get_text(result):
    """Return the message text stored in a result record, or None if it cannot be found.

    Two layouts are tried, in this order:

    1. ``result["result"]["message"]["content"]``
    2. ``result["result"]["choices"][0]["message"]["content"]``

    The first lookup that completes without raising supplies the return value
    (whatever that value is). If both lookups raise, None is returned.
    """
    lookups = (
        lambda record: record["result"]["message"]["content"],
        lambda record: record["result"]["choices"][0]["message"]["content"],
    )
    for lookup in lookups:
        try:
            return lookup(result)
        except Exception:
            # This layout does not match; fall through to the next one.
            continue
    return None

# For every (runner, model) pair, record how many task results were collected.
for run_key, value in runs.items():
    runner, model = run_key
    stats[runner][model] = len(value)
    # Disabled alternative metric, kept for reference: for pairs that cover every task,
    # compute characters (prompt + response text) per unit of elapsed_time and store
    # either the 10th-90th percentile range or the median.
    # if len(value) == len(task_ids):
        # perfs = [len(prompts[task_id] + get_text(result)) / result.get("elapsed_time") for task_id, result in value.items()]
        # stats[runner][model] = f"{int(np.percentile(perfs, 10))}-{int(np.percentile(perfs, 90))}"
        # stats[runner][model] = int(np.percentile(perfs, 50))

# Dump the nested mapping as indented JSON, leaving non-ASCII characters unescaped.
print(json.dumps(stats, ensure_ascii=False, indent=2))

In [None]:
import pandas as pd

# Tabulate the stats: one column per runner, one row per entry of ollama_models.
# The rows are labelled with the model names; without an explicit index they would be
# numbered 0..N-1 and could only be read by cross-referencing the list order.
pd.DataFrame(
    {runner: [data[model] for model in ollama_models] for runner, data in stats.items()},
    index=ollama_models,
)