In [None]:
!git clone https://github.com/dcxjn/prompting.git /content/prompting

In [None]:
import os
# Switch the working directory to the cloned repository; relative paths used
# later in the notebook (e.g. "images/...") are resolved against it.
os.chdir("/content/prompting")

In [None]:
import sys
# Put the cloned repository on the import path (needed for the `src.utils...`
# import used later, assuming `src` lives in the repository root).
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '/content/prompting' not in sys.path:
    sys.path.append('/content/prompting')

In [None]:
!pip install torch>=2.0.0
!pip install transformers>=4.32.0
!pip install auto-gptq
!pip install torchvision
!pip install accelerate
!pip install tiktoken
!pip install einops
!pip install transformers_stream_generator>=0.0.4

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

from src.utils.image_util import resize_image

In [None]:
def query(inputs: dict, model=None, tokenizer=None) -> dict:
    """Generate robot instructions for a task shown in an image.

    Runs a three-turn chat with the vision-language model, carrying the chat
    history from one turn to the next:
      1. ask for human step-by-step instructions linked to observations in the image;
      2. ask for a guide on how a robot with the given commands would do the task;
      3. ask the model to check each instruction against the image.

    Args:
        inputs: Mapping with the required keys
            "image_path"   - image reference handed to the tokenizer's
                             ``from_list_format`` as the ``image`` entry,
            "task"         - task description interpolated into the first prompt,
            "bot_commands" - command list text interpolated into the second prompt.
        model: Optional, already-loaded chat model exposing
            ``chat(tokenizer, query, history=...)``. When ``None`` (default),
            "Qwen/Qwen-VL-Chat-Int4" is loaded with ``device_map="cuda"`` and
            put in eval mode, exactly as before.
        tokenizer: Optional, already-loaded tokenizer exposing
            ``from_list_format``. When ``None`` (default) it is loaded from the
            same model id.

    Returns:
        ``{"bot_inst": <reply to the third prompt>}``.

    Raises:
        KeyError: If a required key is missing from ``inputs``. This is checked
            up front so a bad call fails before the expensive model load.
    """
    required_keys = ("image_path", "task", "bot_commands")
    missing = [key for key in required_keys if key not in inputs]
    if missing:
        raise KeyError(f"inputs is missing required key(s): {', '.join(missing)}")

    model_id = "Qwen/Qwen-VL-Chat-Int4"

    # NOTE(security): trust_remote_code=True executes Python code shipped in the
    # model repository; only use it with repositories you trust.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if model is None:
        # Loading is the slow part; pass `model`/`tokenizer` in to reuse them
        # across several calls instead of reloading each time.
        model = AutoModelForCausalLM.from_pretrained(
            model_id, device_map="cuda", trust_remote_code=True
        ).eval()

    prompt1 = f"""
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: {inputs["task"]}.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    """

    # First turn combines the image with the text prompt.
    image_query = tokenizer.from_list_format([
        {'image': inputs['image_path']},
        {'text': prompt1},
    ])

    output1, history = model.chat(tokenizer, query=image_query, history=None)
    print("\n=== OUTPUT 1 ===\n") # for debugging
    print(output1)

    prompt2 = f"""
    Imagine you are in control of a robotic arm with the following commands: {inputs["bot_commands"]}
    Given the human instructions you have generated, provide a guide on how the robot would complete the task.
    """

    output2, history = model.chat(tokenizer, prompt2, history=history)
    print("\n=== OUTPUT 2 ===\n") # for debugging
    print(output2)

    # No placeholders here, so a plain string is used (same text as before).
    prompt3 = """
    By referencing an observation in the image, ensure each instruction is accurate. Do not make assumptions.
    Check that each instruction is logical.
    """

    output3, history = model.chat(tokenizer, prompt3, history=history)

    return {"bot_inst": output3}


In [None]:
# Numbered list of the commands the robot arm understands; this text is
# interpolated into the prompt sent to the model.
bot_commands = "\n" + "".join(
    f"{number}. {command}\n"
    for number, command in enumerate(
        (
            "move_to(x, y)",
            "grab(object)",
            "release(object)",
            "push(object)",
            "pull(object)",
            "rotate(angle)",
        ),
        start=1,
    )
)

In [None]:
# Image to analyse, given as a relative path (resolved against the current
# working directory).
# Alternative paths kept for reference:
#   images/fridge_lefthandle.jpg
#   images/housedoor_knob_push.jpg
#   images/browndoor_knob_pull.jpg
#   images/labdoor_straighthandle_pull.jpg
#   images/whitetable.jpg
# or ask interactively: input("Enter the path of the image: ")
image_path = "images/bluedoor_knob_push.jpg"

In [None]:
# Resize the selected image before it is passed to the model.
# The same path is passed for both arguments of resize_image.
# NOTE(review): presumably (source, destination), which would overwrite the file
# in place and re-apply the resize on every re-run — confirm against
# src.utils.image_util.resize_image.
resize_image(image_path, image_path)

In [None]:
# Define the task to be performed (read interactively from the user).
# Surrounding whitespace is dropped and a blank answer is rejected here, so an
# empty task never reaches the prompt after the expensive model load.
task = input("Enter the task to be performed: ").strip()
if not task:
    raise ValueError("The task description must not be empty.")

In [None]:
# Run the prompting pipeline on the chosen image, task and command list.
result = query(
    dict(
        image_path=image_path,
        task=task,
        bot_commands=bot_commands,
    )
)

In [None]:
# Print a separator followed by the final robot instructions.
print("\n==========\n", result["bot_inst"], sep="\n")