In [None]:
from collections import defaultdict
import pandas as pd
import numpy as np
import json

# Task identifiers whose results are included in the comparison.
task_ids = {
    2, 4, 7, 8, 15, 17, 18, 21, 46, 84,
    85, 105, 127, 135, 136, 137, 138, 152, 159, 160,
    161, 164, 181, 196, 203, 209, 217, 230, 235, 239,
    246, 249, 251, 255, 269, 276, 286, 352, 378, 397,
}

# Model tags, grouped by the list they are declared in.
ollama_models = [
    "llama2-uncensored:7b",
    "deepseek-r1:7b",
    "qwen2.5:14b",
    "phi4:14b",
    "deepseek-r1:32b",
    "qwen2.5:32b",
    "command-r:latest",
    "dolphin-mixtral:8x7b",
    "deepseek-r1:70b",
    "llama3.3:70b",
]
oai_models = [
    "gpt-4o",
    "o1-mini",
    "o1-preview",
]

# Every model name accepted when filtering log records.
models = set(ollama_models) | set(oai_models)

# Load the prompts that the throughput computation later indexes by task id.
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform's locale-dependent default encoding.
with open("in-prompts.json", "r", encoding="utf-8") as file:
    prompts = json.load(file)

# Load the run log (one JSON document per line), keeping only records for a
# selected task and model that carry both a "result" and an "elapsed_time".
with open("out-results.json.log", "r", encoding="utf-8") as file:
    results = []

    for line in file:
        # Blank lines are not valid JSON; skip them rather than abort the load.
        if not line.strip():
            continue

        item = json.loads(line)

        if (
            item.get("task_id") in task_ids
            and item.get("model") in models
            and "result" in item
            and "elapsed_time" in item
        ):
            results.append(item)

def _response_text(record):
    """Return the reply text stored in a log record, or None if absent.

    Two payload layouts are recognised under ``record["result"]``:
    ``choices[0].message.content`` and ``message.content``. The ``choices``
    layout is tried first so that it wins when both are present.
    """
    payload = record.get("result")

    for path in (("choices", 0, "message", "content"), ("message", "content")):
        node = payload
        try:
            for step in path:
                node = node[step]
        except (KeyError, IndexError, TypeError):
            # This layout does not apply to the record; try the next one.
            continue
        if isinstance(node, str):
            return node

    return None


# items[runner][model][task_id] -> characters (prompt + response) divided by
# the record's elapsed_time. A later record for the same key overwrites an
# earlier one.
items = defaultdict(lambda: defaultdict(dict))

# Records that could not be scored (no response text or no usable timing) are
# kept here for inspection instead of crashing the loop or vanishing silently.
skipped_results = []

for result in results:
    model = result.get("model", "not-a-model")
    # Every model listed in oai_models is reported under the "oai" runner,
    # regardless of what the record itself says.
    runner = "oai" if model in oai_models else result.get("runner", "unknown")
    task_id = result.get("task_id")
    response_text = _response_text(result)
    elapsed_time = result.get("elapsed_time")

    # Without text there is nothing to measure, and a missing or zero duration
    # cannot be divided by.
    if response_text is None or not elapsed_time:
        skipped_results.append(result)
        continue

    # Same value as len(prompt + response) without building the joined string.
    # NOTE(review): assumes prompts can be indexed by the record's task_id
    # as-is and yields a str - confirm against in-prompts.json.
    items[runner][model][task_id] = (len(prompts[task_id]) + len(response_text)) / elapsed_time

# Collapse each model's per-task values into a "p10-p90" string, dropping any
# model that does not have exactly one value per selected task.
expected_count = len(task_ids)

for runner, per_model in list(items.items()):
    summary = {}

    for model, per_task in per_model.items():
        if len(per_task) != expected_count:
            continue

        # 10th and 90th percentiles, truncated to whole numbers for display.
        low, high = np.percentile(list(per_task.values()), [10, 90])
        summary[model] = f"{int(low)}-{int(high)}"

    items[runner] = summary

# Debug aid: print(json.dumps(items, ensure_ascii=False, indent=2))
# One row per runner, one column per model (the notebook displays this value).
pd.DataFrame.from_dict(items, orient='index')