In [1]:
import json
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel
import torch

# Load the SigLIP checkpoint and its processor from a single identifier so the
# model and its preprocessing can never drift apart.
SIGLIP_CHECKPOINT = "google/siglip-so400m-patch14-384"
model = AutoModel.from_pretrained(SIGLIP_CHECKPOINT)
processor = AutoProcessor.from_pretrained(SIGLIP_CHECKPOINT)

# Fetch the demo image. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces an HTTP error here rather
# than as an obscure image-decoding failure inside PIL.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Score the image against each candidate caption.
texts = ["a photo of 2 cats", "a photo of 2 dogs"]
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)

# Each caption's logit is passed through a sigmoid independently, so the
# resulting probabilities are not normalised across captions.
logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image)  # these are the probabilities
print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")

  from .autonotebook import tqdm as notebook_tqdm


40.5% that image 0 is 'a photo of 2 cats'


In [3]:
# Build the training/validation datasets and the batch collate function using
# the project's dataset helper.
# NOTE(review): the positional arguments ("idefics2", 16000, 1000, "oa") are
# not explained anywhere in this notebook — confirm their meaning against the
# signature of get_dataset_and_collate before changing them.
from plancraft.train.dataset import get_dataset_and_collate
dataset, val_dataset, collate_fn = get_dataset_and_collate("idefics2", 16000, 1000, "oa")

  import distutils.spawn
Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.


Loading dialogue dataset
Loading images
Loading dialogue dataset
Loading images


In [5]:
# Display the first training example. The rendered output below starts with a
# list of chat messages, each a dict with 'role' and 'content' keys.
dataset[0]

([{'role': 'system',
   'content': [{'text': 'You are crafting in Minecraft. You need to decide on the next action.\n\nYou must output an action like the following:\nact: move from slot X to slot Y with quantity Z\n\nThere are two types of actions\n- move\n- smelt\n\nThe first 10 slots in the inventory are reserved for crafting and correspond to the minecraft crafting table. \n\n[1, 2, 3] \n[4, 5, 6] -> [0]\n[7, 8, 9]\n\nThe crafting matrix is a 3x3 grid, and the output is sent to slot 0.\nYou cannot move or smelt items into output slot 0.\nThe remaining slots (10-45) are for storing items.\n',
     'type': 'text'}]},
  {'role': 'user',
   'content': [{'type': 'image'},
    {'text': 'Craft an item of type: mojang_banner_pattern', 'type': 'text'}]},
  {'role': 'assistant',
   'content': [{'text': 'act: move from slot 22 to slot 1 with quantity 1',
     'type': 'text'}]},
  {'role': 'user',
   'content': [{'type': 'image'},
    {'text': 'Craft an item of type: mojang_banner_pattern', 'ty

In [24]:
# inventory = [
#     {"slot": 13, "type": "stick", "quantity": 2},
#     {"slot": 20, "type": "acacia_log", "quantity": 1},
#     {"slot": 43, "type": "dead_fire_coral", "quantity": 55},
#     {"slot": 27, "type": "acacia_leaves", "quantity": 11},
#     {"slot": 28, "type": "brown_mushroom", "quantity": 23},
#     {"slot": 14, "type": "llama_spawn_egg", "quantity": 22},
#     {"slot": 45, "type": "bat_spawn_egg", "quantity": 6},
#     {"slot": 23, "type": "oak_leaves", "quantity": 8},
#     {"slot": 34, "type": "diorite_slab", "quantity": 38},
#     {"slot": 22, "type": "dark_prismarine_slab", "quantity": 54},
# ]

class TypeEmbedding(nn.Module):
    """Embed item-type names with a text encoder plus one learnable offset.

    Each distinct type string is tokenized and passed through ``model`` once
    (under ``torch.no_grad()``); the mean of its token states is cached in
    ``self.cache``. The embedding returned for a type is that cached vector
    plus the shared trainable vector ``self.learnable_params``.

    Args:
        model: An already-loaded encoder *instance* exposing
            ``config.hidden_size`` and returning an object with a
            ``last_hidden_state`` attribute when called on the tokenizer
            output. The class-valued default is kept only for signature
            compatibility; it cannot be used as-is.
        tokenizer: A tokenizer *instance* callable as
            ``tokenizer(list_of_str, return_tensors="pt", padding=True)``.
    """

    def __init__(self, model=AutoModel, tokenizer=AutoTokenizer):
        super().__init__()
        self.embedding_dim = model.config.hidden_size
        self.learnable_params = nn.Parameter(torch.randn(self.embedding_dim))
        self.model = model
        self.tokenizer = tokenizer
        # Maps type name -> pooled encoder output (no autograd history).
        self.cache = {}

    def _encode_new_types(self, new_types, device):
        """Run the encoder on ``new_types`` and store one pooled vector each."""
        inputs = self.tokenizer(new_types, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        hidden = outputs.last_hidden_state
        mask = inputs.get("attention_mask")
        if mask is None:
            # The tokenizer gave no mask, so every position is averaged.
            pooled = hidden.mean(dim=1)
        else:
            # Average real tokens only. Including padding would make a cached
            # vector depend on which other strings shared its batch.
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        for row, object_type in enumerate(new_types):
            self.cache[object_type] = pooled[row]

    def forward(self, object_types: list[str]):
        """Return a ``(len(object_types), embedding_dim)`` tensor of embeddings.

        An empty list yields an empty ``(0, embedding_dim)`` tensor.
        """
        # Follow whatever device the module currently lives on instead of
        # assuming CUDA is available.
        device = self.learnable_params.device
        # dict.fromkeys de-duplicates while preserving first-seen order, so a
        # type repeated in the input is encoded only once.
        new_types = [
            object_type
            for object_type in dict.fromkeys(object_types)
            if object_type not in self.cache
        ]
        if new_types:
            self._encode_new_types(new_types, device)
        if not object_types:
            return self.learnable_params.new_empty((0, self.embedding_dim))
        embeddings = [
            # .to(device) keeps older cache entries usable after the module
            # has been moved to another device.
            self.cache[object_type].to(device) + self.learnable_params
            for object_type in object_types
        ]
        return torch.stack(embeddings)

class InventoryEncoder(nn.Module):
    """Encode an inventory (list of slot/type/quantity dicts) into one vector.

    Every item is represented by the concatenation of its type embedding, a
    learned quantity embedding and a learned slot embedding; a linear layer
    projects that back to ``hidden_size`` and the items are mean-pooled.

    Args:
        model: Encoder instance forwarded to ``TypeEmbedding``; must expose
            ``config.hidden_size``.
        tokenizer: Tokenizer instance forwarded to ``TypeEmbedding``.
        max_quantity: Largest quantity value an item may carry (inclusive).
        max_slot: Number of slots; valid slot indices are ``0..max_slot - 1``.
    """

    def __init__(
        self,
        model=AutoModel,
        tokenizer=AutoTokenizer,
        max_quantity=64,
        max_slot=46,
    ):
        super().__init__()
        hidden_size = model.config.hidden_size
        self.type_embedding = TypeEmbedding(model, tokenizer)
        # +1 because quantities run from 0 to max_quantity inclusive; a table
        # of only max_quantity rows cannot index a full stack (quantity ==
        # max_quantity).
        self.quantity_embedding = nn.Embedding(max_quantity + 1, hidden_size)
        self.slot_embedding = nn.Embedding(max_slot, hidden_size)
        self.combine = nn.Linear(
            hidden_size * 3,
            hidden_size,
        )

    def forward(self, inventory: list[dict]):
        """Return a 1-D ``(hidden_size,)`` embedding of ``inventory``.

        Each entry must provide the keys ``"type"``, ``"quantity"`` and
        ``"slot"``. An empty inventory maps to the all-zero vector rather than
        the NaN a mean over zero items would produce.
        """
        # Use the device the module lives on instead of assuming CUDA.
        device = self.combine.weight.device
        if not inventory:
            return self.combine.weight.new_zeros(self.combine.out_features)

        type_embeddings = self.type_embedding([item["type"] for item in inventory])
        quantities = torch.tensor(
            [item["quantity"] for item in inventory], dtype=torch.long, device=device
        )
        slots = torch.tensor(
            [item["slot"] for item in inventory], dtype=torch.long, device=device
        )

        quantity_embeddings = self.quantity_embedding(quantities)
        slot_embeddings = self.slot_embedding(slots)
        x_concat = torch.cat(
            [type_embeddings, quantity_embeddings, slot_embeddings], dim=-1
        )
        # Mean over the item axis collapses the inventory to a single vector.
        return self.combine(x_concat).mean(dim=0)


# Build the inventory encoder and move it to the GPU in one step.
# NOTE(review): `tokenizer` is not assigned in any cell visible in this
# notebook, and `model` was last bound to the full SigLIP model in cell 1 —
# confirm which text encoder/tokenizer pair this cell is meant to receive so
# that the notebook survives Restart Kernel -> Run All.
encoder = InventoryEncoder(model, tokenizer).cuda()

In [31]:
class InventoryGenerator(nn.Module):
    """Decode inventory embeddings back into ``{"type", "quantity", "slot"}`` dicts.

    A linear layer expands each embedding into three ``hidden_size`` chunks
    (type, quantity, slot). Quantity and slot are decoded by linear
    classification heads followed by ``argmax``; the type is decoded by
    nearest-neighbour lookup among the type embeddings known to
    ``self.type_decoder``.

    Args:
        model: Encoder instance forwarded to ``TypeEmbedding``; must expose
            ``config.hidden_size``.
        tokenizer: Tokenizer instance forwarded to ``TypeEmbedding``.
        max_quantity: Largest quantity value that can be predicted (inclusive).
        max_slot: Number of slots; predicted indices are ``0..max_slot - 1``.
    """

    def __init__(
        self,
        model=AutoModel,
        tokenizer=AutoTokenizer,
        max_quantity=64,
        max_slot=46,
    ):
        super().__init__()
        hidden_size = model.config.hidden_size
        self.max_quantity = max_quantity
        self.max_slot = max_slot
        self.hidden_size = hidden_size

        self.fc = nn.Linear(hidden_size, hidden_size * 3)
        self.type_decoder = TypeEmbedding(model, tokenizer)
        # +1 so that class index max_quantity (a full stack) is predictable.
        self.quantity_decoder = nn.Linear(hidden_size, max_quantity + 1)
        self.slot_decoder = nn.Linear(hidden_size, max_slot)

    def _decode_types(self, type_embeds):
        """Map each row of ``type_embeds`` to the closest known type name.

        The vocabulary is the set of names in ``self.type_decoder.cache``,
        i.e. the types previously passed through ``self.type_decoder``. If
        that vocabulary is empty there is nothing to match against, so each
        row decodes to ``None``.
        """
        cache = self.type_decoder.cache
        if not cache:
            return [None] * type_embeds.shape[0]
        names = list(cache)
        device = type_embeds.device
        with torch.no_grad():
            # Rebuild the vectors TypeEmbedding.forward would return:
            # cached encoder output plus the shared learnable offset.
            candidates = torch.stack([cache[name].to(device) for name in names])
            candidates = candidates + self.type_decoder.learnable_params.to(device)
            # (batch, 1, hidden) vs (1, vocab, hidden) -> (batch, vocab)
            similarity = nn.functional.cosine_similarity(
                type_embeds.unsqueeze(1), candidates.unsqueeze(0), dim=-1
            )
            best = similarity.argmax(dim=-1).tolist()
        return [names[index] for index in best]

    def forward(self, inventory_embedding):
        """Decode embeddings into a list of item dicts, one per embedding row.

        Args:
            inventory_embedding: Tensor of shape ``(hidden_size,)`` or
                ``(batch, hidden_size)``.

        Returns:
            A list of dicts with keys ``"type"``, ``"quantity"`` and ``"slot"``.
        """
        # A single pooled vector becomes a batch of one, so the per-row
        # argmax results below stay iterable.
        if inventory_embedding.dim() == 1:
            inventory_embedding = inventory_embedding.unsqueeze(0)

        x = self.fc(inventory_embedding)

        type_embeds, quantity_embeds, slot_embeds = torch.split(
            x, self.hidden_size, dim=-1
        )

        # Decode type embeddings
        decoded_types = self._decode_types(type_embeds)

        # Decode quantity and slot embeddings, then convert logits to indices
        quantities = torch.argmax(self.quantity_decoder(quantity_embeds), dim=-1)
        slots = torch.argmax(self.slot_decoder(slot_embeds), dim=-1)

        # Create the decoded inventory list
        return [
            {
                "type": obj_type,
                "quantity": quantity.item(),
                "slot": slot.item(),
            }
            for obj_type, quantity, slot in zip(decoded_types, quantities, slots)
        ]


# Example of how to use the InventoryEncoder and InventoryGenerator together
class InventoryAutoencoder(nn.Module):
    """Pair an ``InventoryEncoder`` with an ``InventoryGenerator``.

    Both halves are constructed from the same ``model``, ``tokenizer``,
    ``max_quantity`` and ``max_slot`` values. Calling the module encodes an
    inventory and immediately decodes the resulting embedding.
    """

    def __init__(
        self,
        model=AutoModel,
        tokenizer=AutoTokenizer,
        max_quantity=64,
        max_slot=46,
    ):
        super().__init__()
        # The encoder and decoder take identical construction arguments.
        shared_args = (model, tokenizer, max_quantity, max_slot)
        self.encoder = InventoryEncoder(*shared_args)
        self.decoder = InventoryGenerator(*shared_args)

    def forward(self, inventory: list[dict]):
        """Encode ``inventory`` and return whatever the decoder reconstructs."""
        return self.decoder(self.encoder(inventory))


# Example usage
# Sample inventory, built from (slot, type, quantity) rows.
inventory = [
    {"slot": slot, "type": item_type, "quantity": quantity}
    for slot, item_type, quantity in (
        (13, "stick", 2),
        (20, "acacia_log", 1),
        (43, "dead_fire_coral", 55),
        (27, "acacia_leaves", 11),
        (28, "brown_mushroom", 23),
        (14, "llama_spawn_egg", 22),
        (45, "bat_spawn_egg", 6),
        (23, "oak_leaves", 8),
        (34, "diorite_slab", 38),
        (22, "dark_prismarine_slab", 54),
    )
]

# Build the autoencoder on the GPU, then run only its encoder half to obtain
# the pooled inventory embedding.
autoencoder = InventoryAutoencoder(model, tokenizer).to("cuda")
encoded_inventory = autoencoder.encoder(inventory)

In [None]:
# Run the decoder half on the embedding produced by the previous cell and
# display the reconstructed list of item dicts.
autoencoder.decoder(encoded_inventory)


In [8]:
import json

# Read the training examples. The encoding is given explicitly because the
# default used by open() depends on the platform locale, whereas JSON files
# are conventionally UTF-8.
with open("data/train.json", encoding="utf-8") as f:
    data = json.load(f)

In [25]:
# Encode the first training example's "slotted_inventory" into a single
# mean-pooled embedding vector and display it.
encoder(data[0]["slotted_inventory"])


tensor([ 0.5024,  0.7074,  1.2459,  ...,  0.2150, -0.3527, -0.0361],
       device='cuda:0', grad_fn=<MeanBackward1>)

In [None]:
# load data/train.json

# with open('data/train.json') as f:
#     data = json.load(f)