In [1]:
!pip install transformers
!pip install bitsandbytes
!pip install torch
!pip install accelerate



In [2]:
!git clone https://github.com/dcxjn/prompting.git /content/prompting

fatal: destination path '/content/prompting' already exists and is not an empty directory.


In [3]:
import os
os.chdir('/content/prompting')

In [4]:
import sys
sys.path.append('/content/prompting')

In [5]:
from transformers import pipeline, BitsAndBytesConfig
import torch
import accelerate

from src.utils.image_util import load_image, resize_image

In [13]:
def query(inputs: dict) -> dict:
        
    model_id = "llava-hf/llava-v1.6-vicuna-13b-hf"

    # Configure BitsAndBytesConfig based on GPU availability
    if torch.cuda.is_available():
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
        )
    else:
        quantization_config = None  # or configure for CPU if necessary

    # Initialize pipeline based on GPU availability
    if torch.cuda.is_available():
        pipe = pipeline(
            "image-to-text",
            model=model_id,
            model_kwargs={"quantization_config": quantization_config},
            # device=0,  # commented out for Google Colab
        )
    else:
        pipe = pipeline(
            "image-to-text",
            model=model_id,
            model_kwargs={"quantization_config": quantization_config},
        )

    image = inputs["image"]

    prompt1 = f"""
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: {inputs["task"]}.
    Link each instruction to an observation in the image in this format: "Observation - Instruction".
    """

    prompt2 = f"""
    Imagine you are in control of a robotic arm with the following commands: {inputs["bot_commands"]}
    Given the human instructions you have generated, provide a guide on how the robot would complete the task.
    """

    prompt3 = f"""
    By referencing an observation in the image, ensure each instruction is accurate. Do not make assumptions.
    Check that each instruction is logical.
    """

    user_prompt1 = "USER: <image>\n" + prompt1 + "​\nASSISTANT: "

    output1 = pipe(
        image, prompt=user_prompt1, generate_kwargs={"max_new_tokens": 4096}
    )

    print("\n=== OUTPUT 1 ===\n")
    print(output1[0]["generated_text"])

    user_prompt2 = (
        user_prompt1
        + output1[0]["generated_text"]
        + "\nUSER: "
        + prompt2
        + "​\nASSISTANT: "
    )

    output2 = pipe(
        image, prompt=user_prompt2, generate_kwargs={"max_new_tokens": 4096}
    )

    print("\n=== OUTPUT 2 ===\n")
    print(output2[0]["generated_text"])

    user_prompt3 = (
        user_prompt2
        + output2[0]["generated_text"]
        + "\nUSER: "
        + prompt3
        + "​\nASSISTANT: "
    )

    output3 = pipe(
        image, prompt=user_prompt3, generate_kwargs={"max_new_tokens": 4096}
    )

    return {"bot_inst": output3[0]["generated_text"]}

In [7]:
# Robot commands available
bot_commands = """
    1. move_to(x, y)
    2. grab(object)
    3. release(object)
    4. push(object)
    5. pull(object)
    6. rotate(angle)
"""

In [8]:
# image_path = input("Enter the path of the image: ")
# image_path = r"images\fridge_lefthandle.jpg"
# image_path = r"images\housedoor_knob_push.jpg"
# image_path = r"images\browndoor_knob_pull.jpg"
# image_path = r"images\labdoor_straighthandle_pull.jpg"
image_path = r"images/bluedoor_knob_push.jpg"
# image_path = r"images\whitetable.jpg"

In [9]:
resize_image(image_path, image_path)
image = load_image({"image_path": image_path})["image"]

In [10]:
# Define the task to be performed
task = input("Enter the task to be performed: ")

Enter the task to be performed: open the door


In [14]:
result = query(
    {
        "image": image,
        "task": task,
        "bot_commands": bot_commands,
    }
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



=== OUTPUT 1 ===

USER:  

        Observe the given image and its details.
        Provide a detailed step-by-step guide on how a human would complete the task of: open the door.
        Link each instruction to an observation in the image in this format: "Observation - Instruction".
        ​
ASSISTANT: 1. Observe the given image and its details.
2. Identify the blue door with a knob and a lock.
3. Locate the knob on the door.
4. Gently turn the knob to the left.
5. As the knob turns, the lock mechanism will engage, and the door will begin to open.
6. Once the door is open, continue to hold the knob to keep it open.
7. If the door is heavy or requires additional support, use the door frame or a wall to provide additional support.
8. If the door is particularly heavy, consider using a door stop or a door wedge to prevent it from closing.

These instructions are based on the image and the given context.

=== OUTPUT 2 ===

USER:  

        Observe the given image and its details.
     

In [12]:
print("\n==========\n")
print(result["bot_inst"])



USER:  

        Observe the given image and its details.
        Provide a detailed step-by-step guide on how a human would complete the task of: open the door.
        Link each instruction to an observation in the image in this format: "Observation: Instruction".
        ​
ASSISTANT: USER:  

        Observe the given image and its details.
        Provide a detailed step-by-step guide on how a human would complete the task of: open the door.
        Link each instruction to an observation in the image in this format: "Observation: Instruction".
        ​
ASSISTANT: 1. Observing the image, we can see that the door is blue and has a knob on it.
2. To open the door, one would first need to find the knob on the door.
3. Once found, the person would grip the knob firmly and turn it in the desired direction.
4. As the knob is turned, the door will begin to open.
5. The person should continue to turn the knob until the door is fully open.
6. After the door is open, the person can then p