In [1]:
!git clone https://github.com/dcxjn/prompting.git /content/prompting

Cloning into '/content/prompting'...
remote: Enumerating objects: 283, done.[K
remote: Counting objects: 100% (283/283), done.[K
remote: Compressing objects: 100% (215/215), done.[K
remote: Total 283 (delta 88), reused 241 (delta 56), pack-reused 0[K
Receiving objects: 100% (283/283), 2.11 MiB | 6.89 MiB/s, done.
Resolving deltas: 100% (88/88), done.


In [2]:
import os

# Work from the cloned repo root; the relative paths used further down
# (e.g. "images/...") resolve against the current working directory.
os.chdir("/content/prompting")

In [3]:
import sys

# Make the cloned repo importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
if '/content/prompting' not in sys.path:
    sys.path.append('/content/prompting')

In [4]:
!pip install bitsandbytes
!pip install accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (41

In [5]:
from transformers import pipeline, BitsAndBytesConfig
import torch
import accelerate

from src.utils.image_util import load_image, resize_image

In [6]:
# Loaded pipelines keyed by model id, so repeated query() calls reuse the
# already-loaded weights instead of reloading the checkpoint on every call.
_PIPELINE_CACHE = {}

# Generation budget applied to each of the three stages.
_MAX_NEW_TOKENS = 1024


def _get_pipeline(model_id: str):
    """Return the image-to-text pipeline for ``model_id``, loading it on first use."""
    if model_id not in _PIPELINE_CACHE:
        model_kwargs = {}
        # 4-bit quantization is only requested when a CUDA device is present;
        # otherwise the model is loaded without any quantization config.
        if torch.cuda.is_available():
            model_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
            )
        _PIPELINE_CACHE[model_id] = pipeline(
            "image-to-text",
            model=model_id,
            model_kwargs=model_kwargs,
            # device=0,  # commented out for Google Colab
        )
    return _PIPELINE_CACHE[model_id]


def _generate(pipe, image, prompt: str) -> str:
    """Run one generation for ``prompt`` and return the pipeline's ``generated_text``."""
    outputs = pipe(
        image, prompt=prompt, generate_kwargs={"max_new_tokens": _MAX_NEW_TOKENS}
    )
    return outputs[0]["generated_text"]


def query(inputs: dict) -> dict:
    """Generate robot instructions for a task through a three-stage prompt chain.

    Stage 1 asks for a human step-by-step guide grounded in the image, stage 2
    asks to turn it into a guide for a robotic arm restricted to the given
    commands, and stage 3 asks the model to check each instruction against the
    image. Each later stage is fed the previous stage's ``generated_text`` as
    conversation history (in the recorded runs that text echoes the prompt as
    well as the answer, so the history accumulates).

    Args:
        inputs: Mapping with the keys
            ``"image"`` - the image handed to the pipeline,
            ``"task"`` - text describing the task to perform,
            ``"bot_commands"`` - text listing the commands the robot supports.

    Returns:
        ``{"bot_inst": <generated_text of the third stage>}``.

    Raises:
        KeyError: if any of the three keys is missing from ``inputs``. This is
            raised before the model is loaded.
    """
    # Read every required key up front so a malformed `inputs` fails fast,
    # before the slow model load rather than after it.
    image = inputs["image"]
    task = inputs["task"]
    bot_commands = inputs["bot_commands"]

    # Alternative checkpoint: "llava-hf/llava-v1.6-vicuna-13b-hf"
    model_id = "llava-hf/llava-1.5-7b-hf"
    pipe = _get_pipeline(model_id)

    prompt1 = f"""
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: {task}.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    """

    prompt2 = f"""
    Imagine you are in control of a robotic arm with the following commands: {bot_commands}
    Given the human instructions you have generated, provide a guide on how the robot would complete the task.
    """

    prompt3 = """
    By referencing an observation in the image, ensure each instruction is accurate. Do not make assumptions.
    Check that each instruction is logical.
    """

    # Cue that hands the turn to the model. The original literal carried an
    # invisible zero-width space (U+200B) in front of the newline, which was
    # sent to the model as part of every prompt; it is dropped here.
    assistant_cue = "\nASSISTANT: "

    user_prompt1 = "<image>\n" + "USER: " + prompt1 + assistant_cue
    output1 = _generate(pipe, image, user_prompt1)

    print("\n=== OUTPUT 1 ===\n") # for debugging
    print(output1)

    # Later stages prepend the previous stage's text as conversation history.
    user_prompt2 = "<image>\n" + output1 + "\n\nUSER: " + prompt2 + assistant_cue
    output2 = _generate(pipe, image, user_prompt2)

    print("\n=== OUTPUT 2 ===\n") # for debugging
    print(output2)

    user_prompt3 = "<image>\n" + output2 + "\n\nUSER: " + prompt3 + assistant_cue
    output3 = _generate(pipe, image, user_prompt3)

    return {"bot_inst": output3}

In [7]:
# Command vocabulary of the robotic arm. The text is interpolated verbatim
# into the second prompt, so the leading newline and the four-space indent
# of every entry are part of the value.
bot_commands = (
    "\n"
    "    1. move_to(x, y)\n"
    "    2. grab(object)\n"
    "    3. release(object)\n"
    "    4. push(object)\n"
    "    5. pull(object)\n"
    "    6. rotate(angle)\n"
)

In [36]:
# Image to analyse, relative to the current working directory.
# Other candidate paths (swap the assignment below to use one of them):
#   images/fridge_lefthandle.jpg
#   images/housedoor_knob_push.jpg
#   images/browndoor_knob_pull.jpg
#   images/bluedoor_knob_push.jpg
#   images/whitetable.jpg
# To ask for the path interactively instead:
#   image_path = input("Enter the path of the image: ")
image_path = "images/labdoor_straighthandle_pull.jpg"

In [37]:
# Both arguments are the same path, so the resized image presumably replaces
# the original file on disk (TODO confirm against src.utils.image_util).
resize_image(image_path, image_path)
# load_image is called with a dict and the image is read from the "image"
# key of what it returns.
image = load_image(dict(image_path=image_path))["image"]

In [38]:
# Define the task to be performed. Surrounding whitespace is dropped, and a
# blank answer falls back to the task used in the recorded run of this
# notebook so the prompt never contains an empty task.
task = input("Enter the task to be performed: ").strip() or "open the door"

Enter the task to be performed: open the door


In [39]:
# Run the prompting chain for the loaded image, the requested task and the
# robot's command list; the returned dict carries the text under "bot_inst".
result = query(dict(image=image, task=task, bot_commands=bot_commands))

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



=== OUTPUT 1 ===


USER: 
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: open the door.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    ​
ASSISTANT: 1. The door is closed.
2. The handle of the door is located on the right side.
3. The door is made of metal.
4. The door is gray in color.
5. The door is located in a room.

Instructions:

1. To open the door, first, locate the handle on the right side of the door.
2. Grip the handle firmly and pull it towards you.
3. As you pull, the door should open, revealing the room inside.

Observations:

1. The door is closed.
2. The handle of the door is located on the right side.
3. The door is made of metal.
4. The door is gray in color.
5. The door is located in a room.

=== OUTPUT 2 ===



USER: 
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would comp

In [35]:
# Print a separator line followed by the final stage's text.
print("\n==========\n", result["bot_inst"], sep="\n")






USER: 
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: open the door.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    ​
ASSISTANT: 1. Observe the given image and its details.
2. Identify the blue door with a knob and a potted plant next to it.
3. Approach the door and prepare to open it.
4. Observe the knob on the door.
5. Gently turn the knob to the left, which will open the door.
6. As the door opens, observe the interior of the house.

The image shows a blue door with a knob and a potted plant next to it. The door is open, revealing the interior of the house.

USER: 
    Imagine you are in control of a robotic arm with the following commands: 
    1. move_to(x, y)
    2. grab(object)
    3. release(object)
    4. push(object)
    5. pull(object)
    6. rotate(angle)

    Given the human instructions you have generated, provide a guide on how 