In [10]:
import pandas as pd
import wandb
import json
from collections import defaultdict

# Scratch accumulator; nothing in this cell populates it.
DATA = defaultdict(list)

api = wandb.Api()
entity, project = "itl", "plancraft"
runs = api.runs(entity + "/" + project)

# One record per downloaded "outputs" JSON file, tagged with the run it came from.
data = []
for run in runs:
    # Only completed runs contribute records.
    if run.state != "finished":
        continue
    #  We call ._json_dict to omit large files
    summary = run.summary._json_dict
    # .config contains the hyperparameters.
    #  We remove special values that start with _.
    config = {k: v for k, v in run.config.items() if not k.startswith("_")}
    # .name is the human-readable name of the run.
    name = run.name

    # Each run gets its own download directory. With a shared root and
    # exist_ok=True, a file whose relative path was already downloaded for
    # another run would be silently reused and attributed to this run.
    download_root = f"wandb_downloads/{run.id}"

    for file in run.files():
        # Keep only JSON files whose path mentions "outputs".
        if ".json" not in file.name or "outputs" not in file.name:
            continue

        # download() returns an open handle to the local copy; reading through
        # it in a `with` block closes it instead of leaking one handle per file.
        with file.download(root=download_root, exist_ok=True) as f:
            example = json.load(f)

        example["run_name"] = name
        example["run_id"] = run.id
        example["config"] = config
        example["summary"] = summary
        data.append(example)

In [9]:
import pandas as pd

# Show every column when a frame is displayed.
pd.set_option("display.max_columns", None)

df = pd.DataFrame(data)

# Expand the nested config dicts into columns, one level at a time:
# config -> config["plancraft"] -> config["plancraft"]["environment"].
config_df = df["config"].apply(pd.Series)
plancraft_df = config_df["plancraft"].apply(pd.Series)
env_df = plancraft_df["environment"].apply(pd.Series)

df = pd.concat([df, plancraft_df, env_df], axis=1)
# The nested dict columns are redundant once expanded.
df = df.drop(columns=["config", "environment"])
# Drop rows produced in "dummy" mode. The explicit copy makes the filtered
# frame independent, so later cells can add columns to it without
# SettingWithCopyWarning or ambiguous view-vs-copy behaviour.
df = df[df["mode"] != "dummy"].copy()

# Success rate and number of examples per (mode, model, split).
df.groupby(["mode", "model", "split"]).agg(
    {
        "success": ["mean", "count"],
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,success,success
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,count
mode,model,split,Unnamed: 3_level_2,Unnamed: 4_level_2
oracle,gpt-4o,val,0.994641,933
react,CohereForAI/c4ai-command-r-v01,val.small.easy,0.211765,85
react,gpt-4o,val.small,0.360825,97
react,gpt-4o,val.small.easy,1.0,17
react,meta-llama/Meta-Llama-3-8B-Instruct,val.small.easy,0.388235,85
react,microsoft/Phi-3-mini-128k-instruct,val.small.easy,0.0,68
react,mistralai/Mistral-7B-Instruct-v0.2,val.small.easy,0.0,34


In [166]:
# Map example_id -> number_of_steps taken by the "oracle" mode rows.
# If an example_id appears more than once, the last row wins.
oracle_rows = df[df["mode"] == "oracle"]
oracle_lengths = dict(zip(oracle_rows["example_id"], oracle_rows["number_of_steps"]))

def normalise_opt_path(row, lengths=None):
    """Return a row's step count divided by the oracle step count for the same example.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with "example_id" and
            "number_of_steps" entries.
        lengths: Optional mapping of example_id -> oracle step count. Defaults
            to the notebook-level ``oracle_lengths`` dict.

    Returns:
        ``number_of_steps / oracle_steps``, or None when no oracle entry exists
        for the example or the oracle step count is zero (a ratio would be
        undefined and would otherwise raise ZeroDivisionError).
    """
    if lengths is None:
        lengths = oracle_lengths
    baseline = lengths.get(row["example_id"])
    # A missing (None) or zero baseline cannot be used as a divisor.
    if not baseline:
        return None
    return row["number_of_steps"] / baseline

# Apply the normalisation row by row and store the result as a new column.
df["normed_steps"] = df.apply(normalise_opt_path, axis="columns")

In [167]:
# Mean normalised step count, and number of rows with a value, per
# (mode, model, split), restricted to rows where `success` is true.
(
    df.loc[df["success"]]
    .groupby(["mode", "model", "split"])
    .agg({"normed_steps": ["mean", "count"]})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,normed_steps,normed_steps
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,count
mode,model,split,Unnamed: 3_level_2,Unnamed: 4_level_2
oracle,gpt-4o,val,1.0,928
react,gpt-4o,val.small,1.093878,35
react,gpt-4o,val.small.easy,1.098039,17
react,meta-llama/Meta-Llama-3-8B-Instruct,val.small.easy,1.636364,33


In [126]:
# df[df.impossible].groupby(["mode", "model", "split"]).agg(
#     {
#         "normed_steps": ["mean", "count"],
#     }
# )
# df[df["mode"]=="oracle"][["example_id", "number_of_steps", "normed_steps"]]

Unnamed: 0,example_id,number_of_steps,normed_steps
17,VAL0101,3,3.0
18,VAL0115,1,1.0
19,VAL0147,1,1.0
20,VAL0159,1,1.0
21,VAL0402,1,1.0
22,VAL0424,1,1.0
23,VAL0468,1,1.0
24,VAL0521,1,1.0
25,VAL0639,3,3.0
26,VAL0642,3,3.0


In [66]:
# df.model_trace

In [70]:
# Non-null counts of every column per (mode, model, run_name).
group_keys = ["mode", "model", "run_name"]
df.groupby(by=group_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,success,number_of_steps,model_trace,example_id,run_id,summary,split,resume,quantize,max_steps,batch_size,output_dir,num_generations,max_message_window,symbolic,resolution,preferred_spawn_biome,symbolic_action_space,symbolic_observation_space
mode,model,run_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
react,meta-llama/Meta-Llama-3-8B-Instruct,avid-universe-20,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17
react,meta-llama/Meta-Llama-3-8B-Instruct,feasible-snowball-18,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17
react,meta-llama/Meta-Llama-3-8B-Instruct,lilac-voice-16,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17
react,meta-llama/Meta-Llama-3-8B-Instruct,trim-forest-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17
react,meta-llama/Meta-Llama-3-8B-Instruct,wild-terrain-19,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17
react,microsoft/Phi-3-mini-128k-instruct,happy-shadow-21,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17
react,microsoft/Phi-3-mini-128k-instruct,neat-shadow-24,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17
react,microsoft/Phi-3-mini-128k-instruct,silver-paper-25,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17
react,microsoft/Phi-3-mini-128k-instruct,sweet-galaxy-23,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17
react,microsoft/Phi-3-mini-128k-instruct,swept-pond-22,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17


In [41]:
# Non-null counts of every column per example_id, limited to the
# "val.small.easy" split.
(
    df.loc[df["split"] == "val.small.easy"]
    .groupby(["example_id"])
    .count()
)

Unnamed: 0_level_0,success,number_of_steps,model_trace,run_name,run_id,summary,mode,model,split,resume,quantize,max_steps,batch_size,output_dir,num_generations,max_message_window,symbolic,resolution,preferred_spawn_biome,symbolic_action_space,symbolic_observation_space
example_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VAL0101,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
VAL0115,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
VAL0147,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
VAL0159,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
VAL0402,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
VAL0424,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
VAL0468,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
VAL0521,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
VAL0639,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
VAL0642,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9


In [105]:
class FakeH:
    """Minimal stand-in object that records appended values in ``obs``."""

    def __init__(self, **kwargs):
        # Keyword arguments are accepted but not used.
        self.obs = list()

    def append(self, obs):
        """Add ``obs`` to the end of the recorded list."""
        self.obs += [obs]


class FakeM:
    """Minimal stand-in object holding a list ``hs`` with one ``FakeH``."""

    def __init__(self, **kwargs):
        # Keyword arguments are forwarded unchanged to the single FakeH.
        first = FakeH(**kwargs)
        self.hs = [first]

    def fake(self):
        """Append the value 1 to every element of ``hs``."""
        for history in self.hs:
            history.append(1)


# Check that fake() mutates the FakeH stored inside the FakeM in place:
# after one call, the first element's `obs` holds the appended value.
M = FakeM()
M.fake()
first_h = M.hs[0]
first_h.obs

[1]