In [65]:
import pandas as pd
import wandb
import json
from collections import defaultdict

DATA = defaultdict(list)

api = wandb.Api()
entity, project = "itl", "plancraft"
runs = api.runs(entity + "/" + project)

summary_list, config_list, name_list = [], [], []
for run in runs:
    #  We call ._json_dict to omit large files
    summary_list.append(run.summary._json_dict)
    # .config contains the hyperparameters.
    #  We remove special values that start with _.
    config_list.append({k: v for k, v in run.config.items() if not k.startswith("_")})
    # .name is the human-readable name of the run.
    name_list.append(run.name)

    for artifact in run.logged_artifacts():
        table_name = artifact.name.split(":")[0].split("-")[-1]
        id_name = artifact.id
        if "table" not in artifact.type:
            continue

        dir_name = artifact.download()
        table_path = f"{dir_name}/{table_name}.table.json"
        with open(table_path, "r") as f:
            json_dict = json.load(f)
        df = pd.DataFrame(json_dict["data"], columns=json_dict["columns"])
        for key, value in config_list[-1].items():
            df[key] = value

        df["run_id"] = run.id
        DATA[table_name].append(df)

runs_df = pd.DataFrame(
    {"summary": summary_list, "config": config_list, "name": name_list}
)

[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m: 

In [64]:
config_list[0]

{'mode': 'react',
 'model': 'CohereForAI/c4ai-command-r-plus-4bit',
 'guidance': False,
 'quantize': 'int4',
 'max_steps': 20,
 'num_generations': 5}

In [66]:
results_df = pd.concat(DATA["results"])
group_results_df = pd.concat(DATA["group_results"])
results_df

Unnamed: 0,hash_key,target,group,question,plan,logs,message_history,errors,success,number_of_steps,number_of_thinking_steps,model_name,tokens_used,mode,model,guidance,quantize,max_steps,num_generations,run_id
0,react_CohereForAI/c4ai-command-r-v01_planks_4,planks,MT1,How to obtain planks?,"[{'output': 'log', 'quantity_needed': 4, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,"{'missing_materials': 2, 'missing_tools': 1}",True,5,5,CohereForAI/c4ai-command-r-v01,14778,react,CohereForAI/c4ai-command-r-v01,False,int4,20,5,fbn70g0a
1,react_CohereForAI/c4ai-command-r-v01_stick_4,stick,MT1,How to obtain stick?,"[{'output': 'log', 'quantity_needed': 3, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,{'missing_materials': 1},True,4,4,CohereForAI/c4ai-command-r-v01,11664,react,CohereForAI/c4ai-command-r-v01,False,int4,20,5,fbn70g0a
2,react_CohereForAI/c4ai-command-r-v01_wooden_sl...,wooden_slab,MT1,How to obtain wooden_slab?,"[{'output': 'log', 'quantity_needed': 4, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,{'missing_materials': 2},True,5,5,CohereForAI/c4ai-command-r-v01,14630,react,CohereForAI/c4ai-command-r-v01,False,int4,20,5,fbn70g0a
3,react_CohereForAI/c4ai-command-r-v01_wooden_bu...,wooden_button,MT1,How to obtain wooden_button?,"[{'output': 'log', 'quantity_needed': 2, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,{},True,3,3,CohereForAI/c4ai-command-r-v01,8718,react,CohereForAI/c4ai-command-r-v01,False,int4,20,5,fbn70g0a
4,react_CohereForAI/c4ai-command-r-v01_wooden_pr...,wooden_pressure_plate,MT1,How to obtain wooden_pressure_plate?,"[{'output': 'log', 'quantity_needed': 3, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,{},True,3,3,CohereForAI/c4ai-command-r-v01,8670,react,CohereForAI/c4ai-command-r-v01,False,int4,20,5,fbn70g0a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,react_mistralai/Mistral-7B-Instruct-v0.2_item_...,item_frame,MT3,How to obtain item_frame?,"[{'output': 'log', 'quantity_needed': 3, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'You are a helper AI agent in Min...,"{'action_type_mismatch': 5, 'insufficient_mate...",False,20,20,mistralai/Mistral-7B-Instruct-v0.2,94248,react,mistralai/Mistral-7B-Instruct-v0.2,True,,20,5,g4m3a9ug
66,react_mistralai/Mistral-7B-Instruct-v0.2_cooke...,cooked_porkchop,MT3,How to obtain cooked_porkchop?,"[{'output': 'porkchop', 'quantity_needed': 1, ...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'You are a helper AI agent in Min...,{'action_type_mismatch': 1},True,3,3,mistralai/Mistral-7B-Instruct-v0.2,9952,react,mistralai/Mistral-7B-Instruct-v0.2,True,,20,5,g4m3a9ug
67,react_mistralai/Mistral-7B-Instruct-v0.2_cooke...,cooked_beef,MT3,How to obtain cooked_beef?,"[{'output': 'log', 'quantity_needed': 4, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'You are a helper AI agent in Min...,"{'action_type_mismatch': 6, 'missing_materials...",False,20,20,mistralai/Mistral-7B-Instruct-v0.2,93739,react,mistralai/Mistral-7B-Instruct-v0.2,True,,20,5,g4m3a9ug
68,react_mistralai/Mistral-7B-Instruct-v0.2_cooke...,cooked_mutton,MT3,How to obtain cooked_mutton?,"[{'output': 'wool', 'quantity_needed': 1, 'too...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'You are a helper AI agent in Min...,{'action_type_mismatch': 18},False,20,20,mistralai/Mistral-7B-Instruct-v0.2,92205,react,mistralai/Mistral-7B-Instruct-v0.2,True,,20,5,g4m3a9ug


In [77]:
results_df.groupby(["model_name", "group"]).agg(
    {"success": ["mean", "std", "count"], "number_of_steps": ["mean", "std"], "tokens_used": ["mean", "std"]}
).reset_index()

Unnamed: 0_level_0,model_name,group,success,success,success,number_of_steps,number_of_steps,tokens_used,tokens_used
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,count,mean,std,mean,std
0,CohereForAI/c4ai-command-r-v01,MT1,0.853846,0.354627,130,10.038462,5.589863,29033.553846,16098.775563
1,CohereForAI/c4ai-command-r-v01,MT2,0.566667,0.497613,120,15.108333,5.365304,44368.416667,15989.037224
2,CohereForAI/c4ai-command-r-v01,MT3,0.514286,0.503405,70,16.714286,4.025797,48260.771429,11520.513171
3,CohereForAI/c4ai-command-r-v01,MT4,0.275,0.449331,80,18.4875,2.933843,53232.1125,8622.727958
4,CohereForAI/c4ai-command-r-v01,MT5,0.166667,0.374766,90,19.055556,2.400192,56148.977778,7472.714798
5,CohereForAI/c4ai-command-r-v01,MT6,0.157143,0.366563,70,19.485714,1.442048,57881.571429,4630.584893
6,CohereForAI/c4ai-command-r-v01,MT7,0.038462,0.193052,130,19.792308,1.118527,58394.330769,3958.316593
7,CohereForAI/c4ai-command-r-v01,MT8,0.0,0.0,10,20.0,0.0,58497.2,1226.494988
8,meta-llama/Llama-2-13b-chat-hf,MT1,0.061538,0.241245,130,19.407692,2.517216,61303.392308,8427.770759
9,meta-llama/Llama-2-13b-chat-hf,MT2,0.016667,0.128556,120,19.841667,1.263554,61953.208333,5019.702095


In [80]:
commandr = results_df[results_df.model_name.str.contains("Cohere")].copy()

In [83]:
commandr.groupby(["guidance", "group"]).agg(
    {"success": ["mean", "std", "count"], "number_of_steps": ["mean", "std"], "tokens_used": ["mean", "std"]}
).reset_index()

Unnamed: 0_level_0,guidance,group,success,success,success,number_of_steps,number_of_steps,tokens_used,tokens_used
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,count,mean,std,mean,std
0,False,MT1,0.846154,0.363609,65,9.676923,5.750293,28145.984615,16597.531666
1,False,MT2,0.616667,0.490301,60,14.3,5.591458,42484.833333,16842.521126
2,False,MT3,0.314286,0.471008,35,17.514286,4.245215,50388.971429,12059.048517
3,False,MT4,0.125,0.334932,40,19.125,2.503203,55421.075,7718.503048
4,False,MT5,0.177778,0.386646,45,19.0,2.584921,56144.244444,8011.134045
5,False,MT6,0.2,0.40584,35,19.228571,1.864304,58115.171429,5923.196005
6,False,MT7,0.030769,0.174036,65,19.861538,0.826834,59254.8,3414.018569
7,False,MT8,0.0,0.0,5,20.0,0.0,58863.8,1304.982644
8,True,MT1,0.861538,0.348072,65,10.4,5.445181,29921.123077,15662.453694
9,True,MT2,0.516667,0.503939,60,15.916667,5.046535,46252.0,14991.50107


In [86]:
# commandr

In [111]:
def aggregate_errors_dict(rows):
    errors = defaultdict(int)
    for error in rows["errors"]:
        for key, value in error.items():
            errors[key] += value
    return errors


errors_commandr = (
    commandr.groupby(["guidance", "group"])
    .apply(aggregate_errors_dict)
    .apply(pd.Series)
    .reset_index()
)

errors_columns = [
    "missing_materials",
    "missing_tools",
    "insufficient_materials",
    "parsing_error",
    "action_type_mismatch",
    "unknown_item",
]

errors_commandr["total"] = errors_commandr[errors_columns].sum(axis=1)
errors_commandr[errors_columns] = errors_commandr[errors_columns].div(
    errors_commandr["total"], axis=0
)
errors_commandr.reset_index()

Unnamed: 0,index,guidance,group,missing_materials,missing_tools,insufficient_materials,parsing_error,action_type_mismatch,unknown_item,total
0,0,False,MT1,0.31348,0.200627,0.206897,0.087774,0.087774,0.103448,319.0
1,1,False,MT2,0.422535,0.217304,0.118712,0.090543,0.112676,0.038229,497.0
2,2,False,MT3,0.296296,0.143519,0.071759,0.134259,0.196759,0.157407,432.0
3,3,False,MT4,0.320721,0.214414,0.072072,0.131532,0.21982,0.041441,555.0
4,4,False,MT5,0.313458,0.214651,0.124361,0.044293,0.199319,0.103918,587.0
5,5,False,MT6,0.464126,0.226457,0.103139,0.053812,0.100897,0.05157,446.0
6,6,False,MT7,0.349943,0.268403,0.088335,0.105323,0.133635,0.05436,883.0
7,7,False,MT8,0.3,0.271429,0.057143,0.157143,0.157143,0.057143,70.0
8,8,True,MT1,0.350453,0.308157,0.241692,,0.099698,,331.0
9,9,True,MT2,0.365385,0.438811,0.08042,,0.115385,,572.0


In [131]:
commandr_guidance = commandr[commandr["guidance"]].copy()
commandr_guidance.head()


Unnamed: 0,hash_key,target,group,question,plan,logs,message_history,errors,success,number_of_steps,number_of_thinking_steps,model_name,tokens_used,mode,model,guidance,quantize,max_steps,num_generations,run_id
0,react_CohereForAI/c4ai-command-r-v01_planks_4,planks,MT1,How to obtain planks?,"[{'output': 'log', 'quantity_needed': 3, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,{},True,2,2,CohereForAI/c4ai-command-r-v01,5630,react,CohereForAI/c4ai-command-r-v01,True,int4,20,5,c2eshror
1,react_CohereForAI/c4ai-command-r-v01_stick_4,stick,MT1,How to obtain stick?,"[{'output': 'log', 'quantity_needed': 4, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,{'missing_materials': 2},True,6,6,CohereForAI/c4ai-command-r-v01,17254,react,CohereForAI/c4ai-command-r-v01,True,int4,20,5,c2eshror
2,react_CohereForAI/c4ai-command-r-v01_wooden_sl...,wooden_slab,MT1,How to obtain wooden_slab?,"[{'output': 'log', 'quantity_needed': 3, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,"{'insufficient_materials': 1, 'missing_materia...",True,7,7,CohereForAI/c4ai-command-r-v01,20332,react,CohereForAI/c4ai-command-r-v01,True,int4,20,5,c2eshror
3,react_CohereForAI/c4ai-command-r-v01_wooden_bu...,wooden_button,MT1,How to obtain wooden_button?,"[{'output': 'log', 'quantity_needed': 4, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,"{'insufficient_materials': 1, 'missing_materia...",True,5,5,CohereForAI/c4ai-command-r-v01,14380,react,CohereForAI/c4ai-command-r-v01,True,int4,20,5,c2eshror
4,react_CohereForAI/c4ai-command-r-v01_wooden_pr...,wooden_pressure_plate,MT1,How to obtain wooden_pressure_plate?,"[{'output': 'log', 'quantity_needed': 9, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,"{'action_type_mismatch': 1, 'missing_materials...",True,5,5,CohereForAI/c4ai-command-r-v01,14421,react,CohereForAI/c4ai-command-r-v01,True,int4,20,5,c2eshror


In [149]:
commandr_guidance.tail()

Unnamed: 0,hash_key,target,group,question,plan,logs,message_history,errors,success,number_of_steps,number_of_thinking_steps,model_name,tokens_used,mode,model,guidance,quantize,max_steps,num_generations,run_id
65,react_CohereForAI/c4ai-command-r-v01_item_frame_0,item_frame,MT3,How to obtain item_frame?,"[{'output': 'log', 'quantity_needed': 3, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,"{'insufficient_materials': 1, 'missing_materia...",True,13,13,CohereForAI/c4ai-command-r-v01,37028,react,CohereForAI/c4ai-command-r-v01,True,int4,20,5,jw3ndyj2
66,react_CohereForAI/c4ai-command-r-v01_cooked_po...,cooked_porkchop,MT3,How to obtain cooked_porkchop?,"[{'output': 'porkchop', 'quantity_needed': 1, ...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,"{'action_type_mismatch': 1, 'missing_materials...",True,15,15,CohereForAI/c4ai-command-r-v01,44276,react,CohereForAI/c4ai-command-r-v01,True,int4,20,5,jw3ndyj2
67,react_CohereForAI/c4ai-command-r-v01_cooked_be...,cooked_beef,MT3,How to obtain cooked_beef?,"[{'output': 'beef', 'quantity_needed': 1, 'too...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,"{'action_type_mismatch': 1, 'insufficient_mate...",False,20,20,CohereForAI/c4ai-command-r-v01,57717,react,CohereForAI/c4ai-command-r-v01,True,int4,20,5,jw3ndyj2
68,react_CohereForAI/c4ai-command-r-v01_cooked_mu...,cooked_mutton,MT3,How to obtain cooked_mutton?,"[{'output': 'log', 'quantity_needed': 3, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,"{'missing_materials': 3, 'missing_tools': 6}",True,17,17,CohereForAI/c4ai-command-r-v01,51644,react,CohereForAI/c4ai-command-r-v01,True,int4,20,5,jw3ndyj2
69,react_CohereForAI/c4ai-command-r-v01_diamond_0,diamond,MT8,How to obtain diamond?,"[{'output': 'log', 'quantity_needed': 4, 'tool...","Step 1 inventory: defaultdict(<class 'int'>, {...",[{'content': 'TASK: How to obtain iron_ingot? ...,"{'insufficient_materials': 1, 'missing_materia...",False,20,20,CohereForAI/c4ai-command-r-v01,58088,react,CohereForAI/c4ai-command-r-v01,True,int4,20,5,jw3ndyj2


In [150]:
idx = 69

print("Task Success: ", commandr_guidance.success.values[idx])
print("Task Target: ", commandr_guidance.target.values[idx])
print("Task Errors: ", dict(commandr_guidance.errors.values[idx]))
print("messages:")
# skip the first 44 messages (prompt)
for m in commandr_guidance.message_history.values[idx][44:]:
    print(m)
print("logs:")
print(commandr_guidance.logs.values[idx])

Task Success:  False
Task Target:  diamond
Task Errors:  {'action_type_mismatch': 1, 'insufficient_materials': 2, 'missing_materials': 1, 'missing_tools': 7}
messages:
{'content': 'TASK: How to obtain diamond?\ninventory = diamond_axe', 'role': 'user'}
{'content': '{"thought":"I already have a diamond axe, so I have obtained a diamond."}', 'role': 'assistant'}
{'content': 'OK', 'role': 'user'}
{'content': '{"type":"mine","output":"diamond","quantity":1,"tool":"diamond_shovel","materials":[]}', 'role': 'assistant'}
{'content': "ERROR: missing_tools ['iron_pickaxe']\ninventory = {'diamond_axe': 1}", 'role': 'user'}
{'content': '{"thought":"I need to craft an iron pickaxe to mine diamond. To craft an iron pickaxe, I first need to smelt iron ore to get iron ingots."}', 'role': 'assistant'}
{'content': 'OK', 'role': 'user'}
{'content': '{"type":"smelt","output":"iron_ingot","quantity":2,"tool":null,"materials":["iron_ore"]}', 'role': 'assistant'}
{'content': "ERROR: missing_tools ['furnace'