# Minecraft Text Planning Task

In [2]:
import json

# Read the task definitions (task name -> metadata such as question and group),
# then show the raw mapping and the number of tasks it contains.
path = "data/task_info.json"
with open(path, "r") as f:
    data = json.loads(f.read())

print(data)
len(data)

{'obtain_planks': {'question': 'How to obtain planks?', 'group': 'MT1', 'alias': 'basic', 'episode': 3000, 'object': 'planks'}, 'obtain_stick': {'question': 'How to obtain stick?', 'group': 'MT1', 'alias': 'basic', 'episode': 3000, 'object': 'stick'}, 'obtain_wooden_slab': {'question': 'How to obtain wooden_slab?', 'group': 'MT1', 'alias': 'basic', 'episode': 3000, 'object': 'wooden_slab'}, 'obtain_wooden_button': {'question': 'How to obtain wooden_button?', 'group': 'MT1', 'alias': 'basic', 'episode': 3000, 'object': 'wooden_button'}, 'obtain_wooden_pressure_plate': {'question': 'How to obtain wooden_pressure_plate?', 'group': 'MT1', 'alias': 'basic', 'episode': 3000, 'object': 'wooden_pressure_plate'}, 'obtain_chest': {'question': 'How to obtain chest?', 'group': 'MT1', 'alias': 'basic', 'episode': 3000, 'object': 'chest'}, 'obtain_oak_stairs': {'question': 'How to obtain oak_stairs?', 'group': 'MT1', 'alias': 'basic', 'episode': 3000, 'object': 'oak_stairs'}, 'obtain_sign': {'questi

70

In [3]:
import json
import math

# TECH_TREE maps an item name to its entry from the goal library. The goal
# names carry an action marker ("smelt_", "craft_", "mine_") that is removed to
# obtain the item name; the item name is also stored on the entry as "name".
TECH_TREE = {}

with open("data/goal_lib.json", "r") as f:
    goal_lib = json.load(f)

for goal_name, recipe in goal_lib.items():
    item_name = goal_name
    for action_marker in ("smelt_", "craft_", "mine_"):
        item_name = item_name.replace(action_marker, "")
    TECH_TREE[item_name] = recipe
    recipe["name"] = item_name

print(len(TECH_TREE))


def is_tool(item):
    """Return True when ``item`` contains one of the tool keywords
    ("pickaxe", "furnace", "crafting_table")."""
    tool_keywords = ("pickaxe", "furnace", "crafting_table")
    return any(keyword in item for keyword in tool_keywords)


def get_plan(target, need=1) -> list[dict]:
    """
    Build the gold plan for obtaining ``need`` units of ``target``.

    The tech tree is walked recursively from ``target`` through every
    precondition and tool. Each visited item becomes one plan step: a copy of
    its TECH_TREE entry extended with

    - ``quantity_needed``: the amount accumulated over all requests for the
      item (an already registered tool is not counted again), and
    - ``depth``: the largest distance from ``target`` at which it was requested.

    The steps are shallow copies, so TECH_TREE itself is never modified and
    plans returned by earlier calls are not affected by later calls. The nested
    ``output`` / ``precondition`` / ``tool`` dicts are still shared with
    TECH_TREE and must be treated as read-only.

    Parameters:
    - target: Key of the item in TECH_TREE.
    - need: Number of units of ``target`` required.

    Returns:
    - The plan steps sorted by decreasing depth (``target``, at depth 0, last).

    Raises:
    - KeyError if ``target`` or one of its requirements is not in TECH_TREE.
    """
    tree = {target: {**TECH_TREE[target], "quantity_needed": need, "depth": 0}}

    def travel_tech_tree(current: str, quantity_needed: int, depth=1):
        """
        Register every requirement of ``current`` in ``tree``, recursively.
        """
        recipe = TECH_TREE[current]
        requirements = recipe["precondition"] | recipe["tool"]
        quantity_to_produce = recipe["output"][current]

        for r, cost_to_produce in requirements.items():
            # One recipe application does not cover the request: scale the
            # material cost with the requested amount (tools are not scaled).
            if quantity_to_produce < quantity_needed and not is_tool(r):
                cost_to_produce = math.ceil(
                    cost_to_produce * (quantity_needed / quantity_to_produce)
                )

            if r in tree:
                tree[r]["depth"] = max(tree[r]["depth"], depth)
                if is_tool(r):
                    # Tools are multi-use, so nothing more is needed for this
                    # one. Move on to the next requirement; returning here
                    # would silently drop the remaining requirements.
                    continue
                tree[r]["quantity_needed"] += cost_to_produce
            else:
                tree[r] = {
                    **TECH_TREE[r],
                    "quantity_needed": cost_to_produce,
                    "depth": depth,
                }

            travel_tech_tree(r, cost_to_produce, depth=depth + 1)

    travel_tech_tree(target, need)

    # Deepest prerequisites first; the sort is stable for equal depths.
    return sorted(tree.values(), key=lambda step: step["depth"], reverse=True)


def parse_target_for_wood_type(name: str):
    """
    Map a variant item name onto the generic name.

    Names containing "_log" become "log", names containing "_planks" become
    "planks" and names containing "diamond_ore" become "diamond" (checked in
    that order); any other name is returned unchanged.
    """
    generic_by_marker = (
        ("_log", "log"),
        ("_planks", "planks"),
        ("diamond_ore", "diamond"),
    )
    for marker, generic_name in generic_by_marker:
        if marker in name:
            return generic_name
    return name
    

def process_step(goal, current_inventory, tech_tree):
    """
    Validate a single plan step against the tech tree and apply it to the inventory.

    The produced item is the first key of ``goal["output"]`` after name
    normalisation with ``parse_target_for_wood_type``. The recipe is applied
    repeatedly until ``goal["quantity_needed"]`` units have been produced.

    A single craft is atomic: every material is checked before any of them is
    consumed, so a failing craft never removes materials from the inventory.
    Crafts that already completed earlier in the same step stay applied.

    Parameters:
    - goal: A dictionary representing the goal to achieve in this step
      (keys used: "output", "type", "tool", "quantity_needed").
    - current_inventory: A dictionary representing the current inventory of
      items; it is updated in place. Plain dicts and defaultdicts both work.
    - tech_tree: A dictionary representing the tech tree, which includes
      information about each item.

    Returns:
    - A tuple (success: bool, error_type: str or None, error_value: any).
      error_type is one of "parsing_error", "unknown_item",
      "action_type_mismatch", "missing_tools", "missing_materials" or
      "insufficient_materials".
    """
    # Nothing usable was parsed (None, empty step, or a step without output).
    if not goal or not goal.get("output"):
        return False, "parsing_error", goal

    target = parse_target_for_wood_type(next(iter(goal["output"])))
    if target not in tech_tree:
        return False, "unknown_item", target

    recipe = tech_tree[target]
    if goal["type"] != recipe["type"]:
        return False, "action_type_mismatch", goal["type"]

    # NOTE(review): tools are only compared with the tools the step declares in
    # goal["tool"], not with current_inventory - confirm this is intended.
    required_tools = set(recipe["tool"].keys())
    if not required_tools.issubset(goal["tool"].keys()):
        return False, "missing_tools", required_tools

    required_materials = set(recipe["precondition"].keys())
    if not required_materials.issubset(current_inventory.keys()):
        return False, "missing_materials", required_materials

    produced_per_craft = recipe["output"][target]
    quantity_needed = goal["quantity_needed"]
    while quantity_needed > 0:
        # Check everything first so a failed craft consumes nothing.
        for item, cost in recipe["precondition"].items():
            if current_inventory[item] < cost:
                return False, "insufficient_materials", item

        for item, cost in recipe["precondition"].items():
            current_inventory[item] -= cost

        # Add the outcome to the inventory
        current_inventory[target] = current_inventory.get(target, 0) + produced_per_craft
        quantity_needed -= produced_per_craft

    return True, None, None


def evaluate_generated_plan(parsed_plan, tech_tree):
    """
    Execute a parsed plan step by step, starting from an inventory that holds
    a single diamond_axe.

    Parameters:
    - parsed_plan: Iterable of step dictionaries as accepted by process_step.
    - tech_tree: A dictionary representing the tech tree.

    Returns:
    - (True, "", None) when every step succeeds (including an empty plan).
    - (False, error_type, error_value) from the first failing step otherwise.
    """
    # Imported locally: the import block of the defining cell does not provide
    # defaultdict, so the function would otherwise depend on a later cell
    # having been run first.
    from collections import defaultdict

    current_inventory = defaultdict(int)
    current_inventory["diamond_axe"] = 1

    for goal in parsed_plan:
        success, error_type, error_value = process_step(
            goal, current_inventory, tech_tree
        )
        if not success:
            return False, error_type, error_value

    return True, "", None

88


In [4]:
# Tech-tree items that never occur as the goal of an "obtain_<item>" task.
task_targets = {task_name.replace("obtain_", "") for task_name in data}
print("Objects not seen as targets in tasks")
print(set(TECH_TREE) - task_targets)

Objects not seen as targets in tasks
{'cobblestone', 'quartz_block', 'log', 'leather', 'iron_ingot', 'armor_stand', 'diamond_shovel', 'mutton', 'beef', 'stonebrick', 'iron_ore', 'anvil', 'iron_block', 'porkchop', 'jukebox', 'wool', 'stone_button', 'stone_brick_stairs'}


In [5]:
# Number of steps in the gold plan of every task, followed by summary statistics.
lengths = [len(get_plan(task_name.replace("obtain_", ""))) for task_name in data]

print("Average length of gold plans")
print(sum(lengths) / len(lengths))
print("Min and max length of gold plans")
print(min(lengths))
print(max(lengths))

Average length of gold plans
7.785714285714286
Min and max length of gold plans
2
14


# One-shot planning

In [147]:
import time
import json

from tqdm import tqdm
from baseline_llm import OneShotOpenAILLM

# Model names to benchmark in the one-shot setting; the generation loop below
# queries every task N times for each entry of this list.
models = ["gpt-4-turbo-preview"]


def load_openai_key(openai_key_file) -> str:
    """Read ``openai_key_file`` and return the text before its first newline."""
    with open(openai_key_file, "r") as f:
        first_line, _, _ = f.read().partition("\n")
    return first_line


api_key = load_openai_key("data/openai_keys.txt")
N = 5  # generations sampled per task and model

# Resume from an earlier run when a results file exists, otherwise start empty
# (previously a missing file aborted the cell).
try:
    with open("data/results.json", "r") as f:
        results = json.load(f)
except FileNotFoundError:
    results = []

with tqdm(total=len(models) * len(data) * N) as pbar:
    for model_name in models:
        for k, v in data.items():
            question = v["question"]
            target = question.split()[-1].replace("?", "")
            if target == "iron_pickaxe":
                # iron_pickaxe is excluded from the benchmark
                # (NOTE(review): presumably because it is the worked example
                # shown to the model - confirm). Its N iterations still count
                # towards the progress bar total, so advance the bar here.
                pbar.update(N)
                continue

            # The gold plan only depends on the target: compute it once.
            gold_plan = get_plan(target)

            for i in range(N):
                # A fresh client per sample, so token_used is per generation.
                model = OneShotOpenAILLM(api_key=api_key, model=model_name)
                generation = model.generate(question, temperature=1.0, max_tokens=512)
                parsed_plan = model.parse_generated_plan(generation)

                results.append(
                    {
                        "target": target,
                        "question": question,
                        "plan": parsed_plan,
                        "gold_plan": gold_plan,
                        "generation": generation,
                        "group": v["group"],
                        "model": model_name,
                        "tokens_used": model.token_used,
                    }
                )
                time.sleep(1)
                pbar.update(1)

            # save results after every task so an interrupted run keeps its progress
            with open("data/results.json", "w") as f:
                json.dump(results, f)


In [150]:
from collections import defaultdict

# The client is only used for its plan parser here. `model_name` is the loop
# variable left behind by the generation cell.
model = OneShotOpenAILLM(api_key=api_key, model=model_name)

with open("data/results.json", "r") as f:
    results = json.load(f)

success = 0
errors = defaultdict(int)
one_shot_data = []
for record in results:
    # Re-parse the raw generation and replay it against the tech tree.
    reparsed_plan = model.parse_generated_plan(record["generation"])
    is_valid, error_type, error_value = evaluate_generated_plan(
        reparsed_plan, TECH_TREE
    )

    if not is_valid:
        errors[error_type] += 1
    else:
        success += 1

    summary = {
        "target": record["target"],
        "group": record["group"],
        "success": is_valid,
        "error": error_type,
    }
    for field in ("plan", "gold_plan"):
        summary[field] = record[field]
    for field in ("plan", "gold_plan"):
        summary[field + "_length"] = len(record[field])
    summary["missing"] = error_value
    summary["model"] = record["model"]
    summary["tokens_used"] = record["tokens_used"]
    one_shot_data.append(summary)


Error parsing ['obtain_cobblestone_wall', "{'diamond_axe'})"]
Error parsing ['craft', "{'stick': 1, 'cobblestone': 1}, {'diamond_axe': 1})"]
Error parsing ['mine', "{'cobblestone': 1, 'stick': 1}, {'diamond_axe': 1})"]
Error parsing ['mine', "{'coal': 1, 'stick': 1}, {'diamond_axe': 1})"]
Error parsing ['mine', "{'coal': 1, 'stick': 1}, {'diamond_axe': 1})"]
Error parsing ['obtain_wooden_axe', "{'diamond_axe'})"]
Error parsing ['obtain_wooden_axe', "{'diamond_axe'})"]
Error parsing ['obtain_wooden_hoe', "{'diamond_axe'})"]
Error parsing ['obtain_wooden_sword', "{'diamond_axe': 1})"]
Error parsing ['craft', "{'bucket': 1, 'iron_pickaxe': 1}, {'iron_ingot': 3, 'wooden_shovel': 1, 'crafting_table': 1})"]
Error parsing ['craft', "{'iron_ingot': 3, 'bucket': 1}, {'iron_ore': 3, 'crafting_table': 1})"]
Error parsing ['craft', "{'iron_ingot': 3, 'stick': 2, 'crafting_table': 1}, {'diamond_axe': 1})"]
Error parsing ['mine', "{'cow': 3})"]
Error parsing ['mine', "{'leather': 8, 'log': 3}, {'dia

# Reactive Planning

In [6]:
import time
import json

from collections import defaultdict
from tqdm import tqdm
from baseline_llm import ReactOpenAILLM

# Model evaluated in the reactive setting. This matches the model name that the
# loop below passes to ReactOpenAILLM and that is stored with every result.
models = ["gpt-3.5-turbo"]


def load_openai_key(openai_key_file) -> str:
    """Read ``openai_key_file`` and return the text before its first newline."""
    with open(openai_key_file, "r") as f:
        first_line, _, _ = f.read().partition("\n")
    return first_line


api_key = load_openai_key("data/openai_keys.txt")
N = 1
MAX_STEPS = 20

# Resume from an earlier run when a results file exists, otherwise start empty
# (previously a missing file aborted the cell).
try:
    with open("data/react_results.json", "r") as f:
        react_results = json.load(f)
except FileNotFoundError:
    react_results = []

for k, v in data.items():
    question = v["question"]
    print(question)
    target = question.split()[-1].replace("?", "")

    # iron_pickaxe is excluded from the benchmark; skip before building a client.
    if target == "iron_pickaxe":
        continue

    model = ReactOpenAILLM(api_key=api_key, model="gpt-3.5-turbo")
    step = 1

    inventory = defaultdict(int)
    inventory["diamond_axe"] = 1
    print(f"Initial inventory: {inventory}")

    plan = []
    history = ""
    errors = defaultdict(int)
    task_success = False

    action_step = model.generate_initial_step(question, temperature=1.0, max_tokens=512)

    while not task_success and step < MAX_STEPS:
        history += f"Step {step} inventory: {inventory}\n"
        parsed_action_step = model.parse_step(action_step)
        success, error_type, error_value = process_step(
            parsed_action_step, inventory, TECH_TREE
        )
        if success:
            print(f"Step {step} successful")
            history += f"Step {step} successful: {parsed_action_step}\n"
            plan.append(parsed_action_step)

            # process_step credits the normalised name of the first output key,
            # so compare that name with the target as well as the raw keys.
            output = parsed_action_step["output"]
            produced = parse_target_for_wood_type(list(output.keys())[0])
            if target in output or produced == target:
                # Stop before asking for another action: a further request
                # would only add tokens to a task that is already solved.
                task_success = True
                break

            observation = f"Success\ninventory = {dict(inventory)}"
        else:
            print(f"Step {step} failed: {error_type} {error_value}")
            history += f"Step {step} failed: {error_type} {error_value}\n"
            errors[error_type] += 1
            observation = (
                f"ERROR: {error_type} {error_value}\ninventory = {dict(inventory)}"
            )
            print(f"Step {step} observation: {observation}")

        # Feed the observation back and obtain the next action.
        action_step = model.generate_step(
            observation, temperature=1.0, max_tokens=512
        )
        step += 1

    react_results.append(
        {
            "target": target,
            "question": question,
            "plan": plan,
            "logs": history,
            "message_history": model.history[len(model.example) + 1 :],
            "errors": errors,
            "success": task_success,
            "number_of_steps": step,
            "number_of_thinking_steps": model.num_thinking_steps,
            "model": model.model,
            "tokens_used": model.token_used,
            "group": v["group"],
        }
    )

    # Persist after every task so an interrupted run keeps its progress.
    with open("data/react_results.json", "w") as f:
        json.dump(react_results, f)

How to obtain planks?
Initial inventory: defaultdict(<class 'int'>, {'diamond_axe': 1})
Step 0 successful
Step 1 successful
How to obtain stick?
Initial inventory: defaultdict(<class 'int'>, {'diamond_axe': 1})
Step 0 successful
Step 1 successful
Step 2 successful
How to obtain wooden_slab?
Initial inventory: defaultdict(<class 'int'>, {'diamond_axe': 1})
Step 0 failed: missing_materials {'planks'}
Step 0 observation: ERROR: missing_materials {'planks'}
inventory = {'diamond_axe': 1}
Step 1 successful
Step 2 successful
Step 3 successful
How to obtain wooden_button?
Initial inventory: defaultdict(<class 'int'>, {'diamond_axe': 1})
Step 0 successful
Step 1 failed: missing_materials {'planks'}
Step 1 observation: ERROR: missing_materials {'planks'}
inventory = {'diamond_axe': 1, 'log': 1}
Step 2 successful
Step 3 successful
How to obtain wooden_pressure_plate?
Initial inventory: defaultdict(<class 'int'>, {'diamond_axe': 1})
Step 0 failed: missing_materials {'planks'}
Step 0 observation: 

In [13]:
# parsed_action_step
# action_step
# model.history
import pandas as pd

# Build new records instead of editing react_results in place, so re-running
# this cell does not increment number_of_steps a second time.
# NOTE(review): the +1 presumably converts a zero-based step index into a
# count of steps - confirm against the run that produced the stored results.
processed = [
    {
        **r,
        "group": data["obtain_" + r["target"]]["group"],
        "number_of_steps": r["number_of_steps"] + 1,
    }
    for r in react_results
]

# react_results
# results[-1]


In [21]:
# One row per reactive run; success rate of each task group.
df = pd.DataFrame(processed)
df.groupby("group")["success"].mean().reset_index()

Unnamed: 0,group,success
0,MT1,1.0
1,MT2,0.75
2,MT3,0.285714
3,MT4,0.428571
4,MT5,0.222222
5,MT6,0.166667
6,MT7,0.076923
7,MT8,0.0


In [24]:
# Mean number of tokens used per run in each task group.
df.groupby("group")["tokens_used"].mean().reset_index()

Unnamed: 0,group,tokens_used
0,MT1,21389.423077
1,MT2,40843.333333
2,MT3,82124.571429
3,MT4,81304.857143
4,MT5,91246.222222
5,MT6,83022.0
6,MT7,84609.0
7,MT8,142862.0


In [43]:
# Expanding the per-run error dictionaries into a frame yields one column per
# error type; only the column labels are kept.
error_cols = df["errors"].apply(pd.Series).columns
df.head()

Unnamed: 0,target,question,plan,logs,message_history,errors,success,number_of_steps,number_of_thinking_steps,model,tokens_used,group,missing_materials,missing_tools,insufficient_materials,unknown_item,action_type_mismatch,parsing_error
0,planks,How to obtain planks?,"[{'output': {'log': 1}, 'quantity_needed': 1, ...","Step 0 inventory: defaultdict(<class 'int'>, {...","[{'role': 'user', 'content': 'How to obtain pl...",{},True,2,3,gpt-3.5-turbo,9104,MT1,,,,,,
1,stick,How to obtain stick?,"[{'output': {'log': 1}, 'quantity_needed': 1, ...","Step 0 inventory: defaultdict(<class 'int'>, {...","[{'role': 'user', 'content': 'How to obtain st...",{'missing_materials': 1},True,4,3,gpt-3.5-turbo,12538,MT1,1.0,,,,,
2,wooden_slab,How to obtain wooden_slab?,"[{'output': {'log': 1}, 'quantity_needed': 1, ...","Step 0 inventory: defaultdict(<class 'int'>, {...","[{'role': 'user', 'content': 'How to obtain wo...","{'missing_tools': 4, 'missing_materials': 1, '...",True,18,6,gpt-3.5-turbo,53185,MT1,1.0,4.0,4.0,,,
3,wooden_button,How to obtain wooden_button?,"[{'output': {'log': 2}, 'quantity_needed': 2, ...","Step 0 inventory: defaultdict(<class 'int'>, {...","[{'role': 'user', 'content': 'How to obtain wo...",{},True,3,7,gpt-3.5-turbo,16178,MT1,,,,,,
4,wooden_pressure_plate,How to obtain wooden_pressure_plate?,"[{'output': {'log': 2}, 'quantity_needed': 2, ...","Step 0 inventory: defaultdict(<class 'int'>, {...","[{'role': 'user', 'content': 'How to obtain wo...",{},True,3,7,gpt-3.5-turbo,16089,MT1,,,,,,


In [35]:
# df[df.number_of_steps > 20]
# df.errors.apply(pd.Series)

# df = pd.merge(df, df.errors.apply(pd.Series), left_index=True, right_index=True)

# Mean count of every error type per task group. The error dictionaries are
# expanded here, so the cell does not rely on error columns having been merged
# into df beforehand. Runs in which an error type did not occur are NaN for
# that column and are skipped by the mean.
error_counts = df["errors"].apply(pd.Series)
error_counts.groupby(df["group"]).mean()

Unnamed: 0,group,missing_materials,missing_tools,insufficient_materials,unknown_item,action_type_mismatch,parsing_error
0,MT1,1.0,1.428571,1.4,1.5,,
1,MT2,2.555556,5.5,2.0,,,1.0
2,MT3,5.2,2.333333,3.5,9.333333,19.0,2.75
3,MT4,2.75,2.615385,2.454545,3.333333,3.0,13.0
4,MT5,5.333333,4.142857,4.0,9.0,,8.666667
5,MT6,3.0,3.2,2.2,,1.0,
6,MT7,5.384615,3.0,2.888889,,1.5,2.666667
7,MT8,,,,10.0,,10.0


In [48]:
# df[df.group == "MT8"].message_history.values[0]
# df

# Analysis

In [151]:
import pandas as pd

# One row per one-shot generation. iron_pickaxe is excluded, and a generation
# that produced no plan steps is never counted as a success.
df = pd.DataFrame(one_shot_data)
# .copy() makes the filtered frame independent, so the assignment below
# modifies df itself rather than a slice of the unfiltered frame.
df = df[df.target != "iron_pickaxe"].copy()
df.loc[df.plan_length == 0, "success"] = False

# success_df = df.groupby(["model", "group", "target"]).success.mean().to_frame()
# rotate to have models as columns
# success_df = success_df.unstack()
# success_df.to_csv("out.csv")
# success_df
# error_df[error_df["error"] != ""].to_csv("errors.csv")
# df[df.tokens_used > 0].groupby(["model", "group"]).tokens_used.mean().to_frame()

In [153]:
# df[(df.model == "gpt-4-turbo-preview")&(df.group == "MT1")&(df.error == "")]

# Failed runs without a recorded error are labelled "plan_failed"; then show,
# per model and group, the share of each error type among the failed runs.
unlabelled_failure = df["error"].eq("") & df["success"].eq(False)
df.loc[unlabelled_failure, "error"] = "plan_failed"
df[~df["success"]].groupby(["model", "group"])["error"].value_counts(normalize=True).unstack()

Unnamed: 0_level_0,error,action_type_mismatch,insufficient_materials,missing_materials,missing_tools,plan_failed,unknown_item
model,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gpt-3.5-turbo,MT1,,0.07971,0.210145,0.463768,,0.246377
gpt-3.5-turbo,MT2,0.022059,0.242647,0.213235,0.507353,0.014706,
gpt-3.5-turbo,MT3,0.161905,0.009524,0.114286,0.438095,,0.27619
gpt-3.5-turbo,MT4,0.033333,0.008333,0.025,0.908333,,0.025
gpt-3.5-turbo,MT5,0.051852,0.074074,0.148148,0.474074,,0.251852
gpt-3.5-turbo,MT6,0.144444,0.155556,0.066667,0.611111,0.022222,
gpt-3.5-turbo,MT7,0.425641,0.015385,0.035897,0.425641,0.010256,0.087179
gpt-3.5-turbo,MT8,0.333333,,,0.133333,,0.533333
gpt-4-turbo-preview,MT1,0.025,0.175,,0.775,0.025,
gpt-4-turbo-preview,MT2,0.065217,0.456522,0.043478,0.326087,0.108696,


In [129]:
# df.missing.value_counts()
# average error over error types
# error_df = df.groupby(["model", "group"]).error.value_counts(normalize=True).unstack()
# rotate to have models as columns

# error_df
# df.groupby(["model", "group"]).error.value_counts(normalize=True).unstack()

# Share of each error type among the failed runs, per model and group.
failed_runs = df[~df["success"]]
failed_runs.groupby(["model", "group"])["error"].value_counts(normalize=True).unstack()

Unnamed: 0_level_0,error,action_type_mismatch,insufficient_materials,missing_materials,missing_tools,plan_failed,unknown_item
model,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gpt-3.5-turbo,MT1,,0.07971,0.210145,0.463768,,0.246377
gpt-3.5-turbo,MT2,0.022059,0.242647,0.213235,0.507353,0.014706,
gpt-3.5-turbo,MT3,0.161905,0.009524,0.114286,0.438095,,0.27619
gpt-3.5-turbo,MT4,0.033333,0.008333,0.025,0.908333,,0.025
gpt-3.5-turbo,MT5,0.051852,0.074074,0.148148,0.474074,,0.251852
gpt-3.5-turbo,MT6,0.144444,0.155556,0.066667,0.611111,0.022222,
gpt-3.5-turbo,MT7,0.425641,0.015385,0.035897,0.425641,0.010256,0.087179
gpt-3.5-turbo,MT8,0.333333,,,0.133333,,0.533333
gpt-4-turbo-preview,MT1,0.025,0.175,,0.775,0.025,
gpt-4-turbo-preview,MT2,0.065217,0.456522,0.043478,0.326087,0.108696,


In [None]:
# For missing_tools errors the "missing" value is the set of required tools.
# Sets are unhashable and cannot be counted directly, so each one is rendered
# as a sorted, comma-separated string first.
missing_tools = df.loc[df["error"] == "missing_tools", "missing"]
missing_tools.map(lambda tools: ", ".join(sorted(tools))).value_counts()

In [None]:
# Number of runs per error label.
df["error"].value_counts()
#             {
#                 "role": "assistant",
#                 "content": """def obtain_iron_pickaxe(inventory):
# \tmine({'log': 5}, {'diamond_axe': 1})
# \tcraft({'planks': 13}, {'log': 4})
# \tcraft({'stick': 6}, {'planks': 2})
# \tcraft({'crafting_table': 1}, {'planks': 4})
# \tcraft({'wooden_pickaxe': 1}, {'planks': 3, 'stick': 2, 'crafting_table': 1})
# \tmine({'cobblestone': 11}, {'wooden_pickaxe': 1})
# \tcraft({'stone_pickaxe': 1}, {'cobblestone': 3, 'stick': 2, 'crafting_table': 1})
# \tmine({'iron_ore': 3}, {'stone_pickaxe': 1})
# \tcraft({'furnace': 1}, {'cobblestone': 8})
# \tsmelt({'iron_ingot': 3}, {'iron_ore': 3, 'furnace': 1})
# \tcraft({'iron_pickaxe': 1}, {'iron_ingot': 3, 'stick': 2, 'crafting_table': 1})
# \treturn 'iron_pickaxe'""",

In [None]:
# df[(df.plan_length == df.gold_plan_length)&(df.success == True)]

# !pip install haystack-ai==2.0.0b8

# from haystack.document_stores.in_memory import InMemoryDocumentStore

# document_store = InMemoryDocumentStore()


In [None]:
from baseline_llm import OneShotOpenAILLM, ReactOpenAILLM
def load_openai_key(openai_key_file) -> str:
    """Read ``openai_key_file`` and return the text before its first newline."""
    with open(openai_key_file, "r") as f:
        first_line, _, _ = f.read().partition("\n")
    return first_line

# Load the key and build one client per planning strategy; no `model` argument
# is given to either constructor.
api_key = load_openai_key("data/openai_keys.txt")
# NOTE(review): the key is passed positionally here but as `api_key=` in the
# experiment cells - presumably it is the first parameter; confirm.
one_shot_llm = OneShotOpenAILLM(api_key)
react_llm = ReactOpenAILLM(api_key)

In [None]:
# Ad-hoc check: send a single question to the one-shot client.
response = one_shot_llm.generate("How to obtain wooden_slab?")

In [None]:
# NOTE(review): this treats `response` as a raw chat-completion object, while
# the one-shot experiment cell passes the result of generate() directly to
# parse_generated_plan and json.dump - confirm what generate() returns.
response.choices[0].message.content.strip()

In [None]:
# NOTE(review): the reactive experiment drives ReactOpenAILLM through
# generate_initial_step / generate_step - confirm it also exposes generate().
response_react = react_llm.generate("How to obtain wooden_slab?")

In [None]:
# Display the response obtained from the reactive client.
response_react