In [None]:
# Clone the project only when it is not already present, so re-running this cell is a no-op.
!test -d /content/prompting/.git || git clone https://github.com/dcxjn/prompting.git /content/prompting

In [None]:
import os
# Work from inside the cloned repository so the relative paths used later
# in the notebook (e.g. "images/...") resolve against it.
os.chdir('/content/prompting')

In [None]:
import sys

# Make the repository's `src` package importable. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate entries.
if '/content/prompting' not in sys.path:
    sys.path.append('/content/prompting')

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# A single invocation lets pip resolve all packages together.
%pip install -q tiktoken einops accelerate xformers triton sentencepiece

In [None]:
from transformers import AutoModelForCausalLM, LlamaTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
import torch

import requests
from PIL import Image

from src.utils.image_util import load_image, resize_image

In [None]:
def _generate_reply(model, tokenizer, prompt: str, history: list, image=None, max_length: int = 2048) -> str:
    """Run one conversation turn through the model and return the decoded reply.

    Args:
        model: Loaded model exposing ``build_conversation_input_ids`` and ``generate``.
        tokenizer: Tokenizer handed to the model's input builder and used for decoding.
        prompt: Text of the current user turn.
        history: Earlier ``(query, response)`` pairs of the conversation.
        image: Optional image attached to this turn; ``None`` sends a text-only turn.
        max_length: ``max_length`` value forwarded to ``model.generate``.

    Returns:
        The decoded newly generated tokens (prompt tokens are sliced off first),
        decoded with the tokenizer's default settings.
    """
    encoded = model.build_conversation_input_ids(
        tokenizer,
        query=prompt,
        history=history,
        images=None if image is None else [image],
    )
    model_inputs = {
        'input_ids': encoded['input_ids'].unsqueeze(0).to('cuda'),
        'token_type_ids': encoded['token_type_ids'].unsqueeze(0).to('cuda'),
        'attention_mask': encoded['attention_mask'].unsqueeze(0).to('cuda'),
        'images': None if image is None else [[encoded['images'][0].to('cuda').to(torch.bfloat16)]],
    }

    with torch.no_grad():
        generated = model.generate(**model_inputs, max_length=max_length, do_sample=False)

    # generate() returns prompt + continuation; keep only the continuation.
    new_tokens = generated[:, model_inputs['input_ids'].shape[1]:]
    return tokenizer.decode(new_tokens[0])


def query(inputs: dict) -> dict:
    """Turn an image and a task description into robot-arm instructions.

    Holds a three-turn conversation with the model:
      1. image + task -> human step-by-step instructions,
      2. robot command list -> robot guide based on those instructions,
      3. request to verify the instructions.
    Each completed turn is appended to the conversation history so the
    follow-up prompts can actually refer to what was generated before.

    Args:
        inputs: Dict with the keys ``"image_path"`` (path of the image to open),
            ``"task"`` (task description inserted into the first prompt) and
            ``"bot_commands"`` (command list inserted into the second prompt).

    Returns:
        ``{"bot_inst": <decoded reply of the third turn>}``.
    """

    # Set tokenizer
    tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')

    # Load model
    # NOTE: the model is (re)loaded on every call to query().
    model = AutoModelForCausalLM.from_pretrained(
        'THUDM/cogvlm-chat-hf',
        do_sample=False,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True
    ).to('cuda').eval()

    # Load image
    image = Image.open(inputs['image_path']).convert('RGB')

    # The end-of-sequence marker is cut from replies before they are stored in
    # the history, so it does not end up in the middle of the next prompt.
    eos_marker = getattr(tokenizer, 'eos_token', None) or '</s>'
    history = []

    prompt1 = f"""
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: {inputs["task"]}.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    """

    output1 = _generate_reply(model, tokenizer, prompt1, history, image=image)
    history.append((prompt1, output1.split(eos_marker)[0].strip()))
    print("\n=== OUTPUT 1 ===\n") # for debugging
    print(output1)

    prompt2 = f"""
    Imagine you are in control of a robotic arm with the following commands: {inputs["bot_commands"]}
    Given the human instructions you have generated, provide a guide on how the robot would complete the task.
    """

    # Follow-up turns are text-only (no image attached), as in the original flow;
    # they rely on the history for context.
    output2 = _generate_reply(model, tokenizer, prompt2, history)
    history.append((prompt2, output2.split(eos_marker)[0].strip()))
    print("\n=== OUTPUT 2 ===\n") # for debugging
    print(output2)

    prompt3 = """
    By referencing an observation in the image, ensure each instruction is accurate. Do not make assumptions.
    Check that each instruction is logical.
    """

    output3 = _generate_reply(model, tokenizer, prompt3, history)

    return {"bot_inst": output3}


In [None]:
# Commands the robot arm accepts, rendered as a numbered list (one per line)
# that starts and ends with a newline.
bot_commands = "\n" + "".join(
    f"{number}. {signature}\n"
    for number, signature in enumerate(
        (
            "move_to(x, y)",
            "grab(object)",
            "release(object)",
            "push(object)",
            "pull(object)",
            "rotate(angle)",
        ),
        start=1,
    )
)

In [None]:
# Image to analyse, given relative to the current working directory.
# Other candidate paths that were tried (swap one in, or prompt with input()):
#   images/fridge_lefthandle.jpg
#   images/housedoor_knob_push.jpg
#   images/browndoor_knob_pull.jpg
#   images/labdoor_straighthandle_pull.jpg
#   images/whitetable.jpg
image_path = "/".join(("images", "bluedoor_knob_push.jpg"))

In [None]:
# NOTE(review): the same path is passed as both arguments (presumably input and
# output), which would overwrite the file on disk — confirm that re-running
# this cell on an already processed image is harmless.
resize_image(image_path, image_path)

In [None]:
# Define the task to be performed.
# The cell waits for typed input, so an unattended "Run All" stalls here.
task = input("Enter the task to be performed: ")

In [None]:
# Run the full prompting pipeline for the selected image, task and command set.
result = query(dict(image_path=image_path, task=task, bot_commands=bot_commands))

In [None]:
# Show the final robot instructions below a separator line.
print("\n==========\n", result["bot_inst"], sep="\n")