In [1]:
!git clone https://github.com/dcxjn/prompting.git /content/prompting

fatal: destination path '/content/prompting' already exists and is not an empty directory.


In [2]:
import os
# Make the cloned repository the working directory so that relative paths used
# later in the notebook (e.g. "images/...") resolve against the checkout.
os.chdir('/content/prompting')

In [3]:
import sys

# Make the repository importable (needed for `from src.utils... import ...`).
# Guarded so that re-running the cell does not append a duplicate entry.
if '/content/prompting' not in sys.path:
    sys.path.append('/content/prompting')

In [4]:
!pip install bitsandbytes
!pip install accelerate



In [5]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, BitsAndBytesConfig
import torch
from PIL import Image
import accelerate

from src.utils.image_util import resize_image

In [6]:
def query(inputs: dict) -> dict:
    """Run a three-turn LLaVA-NeXT conversation about one image.

    Turn 1 asks for step-by-step human instructions for the task, each linked to
    an observation in the image. Turn 2 asks for a guide using the given robot
    commands. Turn 3 asks the model to check the instructions against the image.
    Every turn after the first is prompted with the full decoded text of the
    previous turn followed by a new USER message.

    The processor and the 4-bit quantized model are loaded on every call.

    Args:
        inputs: Mapping with the keys
            "image_path"   - path of the image file to open,
            "task"         - task description inserted into the first prompt,
            "bot_commands" - robot command list inserted into the second prompt.

    Returns:
        ``{"bot_inst": text}`` where ``text`` is the decoded output of the third
        turn. The decoded sequence contains the prompt as well as the reply, so
        the earlier turns are part of the returned text.
    """
    model_id = "llava-hf/llava-v1.6-vicuna-7b-hf"

    processor = LlavaNextProcessor.from_pretrained(model_id)

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # No `temperature` here: it is a generation-time option, not a model-loading
    # one. generate() below is called without sampling options, so decoding
    # follows the checkpoint's generation defaults exactly as before. To sample,
    # pass e.g. do_sample=True, temperature=0.2 to model.generate().
    model = LlavaNextForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="cuda",
    )

    # Image.open is lazy; copy() forces the pixel data to be read so the file
    # handle can be closed while the image is still used for all three turns.
    with Image.open(inputs["image_path"]) as image_file:
        image = image_file.copy()

    def run_turn(prompt: str) -> str:
        """Tokenize prompt + image, generate, and return the decoded sequence."""
        # Keyword arguments: the positional order of text/images is not the same
        # across transformers releases.
        model_inputs = processor(
            text=prompt, images=image, return_tensors="pt"
        ).to("cuda")
        generated = model.generate(**model_inputs, max_new_tokens=1024)
        return processor.decode(generated[0], skip_special_tokens=True)

    prompt1 = f"""
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: {inputs["task"]}.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    """

    output1 = run_turn("<image>" + f"USER: {prompt1}\nASSISTANT:")
    print("\n=== OUTPUT 1 ===\n") # for debugging
    print(output1)

    prompt2 = f"""
    Imagine you are in control of a robotic arm with the following commands: {inputs["bot_commands"]}   Given the human instructions you have generated, provide a guide on how the robot would complete the task.
    """

    # The decoded text no longer contains the image placeholder (special tokens
    # are skipped), so it is prepended again for each follow-up turn.
    output2 = run_turn("<image>" + output1 + f"\n\nUSER: {prompt2}\nASSISTANT:")
    print("\n=== OUTPUT 2 ===\n") # for debugging
    print(output2)

    prompt3 = """
    By referencing an observation in the image, ensure each instruction is accurate. Do not make assumptions.
    Check that each instruction is logical.
    """

    output3 = run_turn("<image>" + output2 + f"\n\nUSER: {prompt3}\nASSISTANT:")

    return {"bot_inst": output3}

In [7]:
# Commands the robot arm accepts, one numbered entry per line.
bot_commands = (
    "\n"
    "    1. move_to(x, y)\n"
    "    2. grab(object)\n"
    "    3. release(object)\n"
    "    4. push(object)\n"
    "    5. pull(object)\n"
    "    6. rotate(angle)\n"
)

In [8]:
# Image to analyse, given relative to the current working directory.
# Other sample images that can be swapped in:
#   images/fridge_lefthandle.jpg
#   images/housedoor_knob_push.jpg
#   images/labdoor_straighthandle_pull.jpg
#   images/bluedoor_knob_push.jpg
#   images/whitetable.jpg
# Or ask for one interactively:
#   image_path = input("Enter the path of the image: ")
image_path = "images/browndoor_knob_pull.jpg"

In [9]:
# The same path is passed as both arguments, so the image file on disk is
# presumably replaced by its resized version.
# NOTE(review): confirm the (source, destination) argument order and the target
# size in src.utils.image_util.
resize_image(image_path, image_path)

In [10]:
# Define the task to be performed
# (free text; it is passed to query() and inserted into the first prompt).
task = input("Enter the task to be performed: ")

Enter the task to be performed: open the door


In [11]:
# Run the prompting pipeline on the selected image, task and command list.
result = query(
    dict(image_path=image_path, task=task, bot_commands=bot_commands)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


=== OUTPUT 1 ===

USER: 
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: open the door.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    
ASSISTANT: Observation - The door is closed and locked.

Instruction - To open the door, you would need to unlock it first.

Observation - The door has a keyhole.

Instruction - Locate the keyhole and insert the key to unlock the door.

Observation - The door has a visible lock mechanism.

Instruction - Once the door is unlocked, you can open it by pushing or pulling it open.

Observation - The door is made of wood and has a natural finish.

Instruction - Be gentle when opening the door to avoid damaging it.

Observation - The door is closed and there is no indication of it being open.

Instruction - If the door is not open, you would need to find the key to unlock it.

Observation - The keyhole is located on the t

In [12]:
# Separator line, then the text returned by query().
print("\n==========\n", result["bot_inst"], sep="\n")



USER: 
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: open the door.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    
ASSISTANT: Observation - The door is closed and locked.

Instruction - To open the door, you would need to unlock it first.

Observation - The door has a keyhole.

Instruction - Locate the keyhole and insert the key to unlock the door.

Observation - The door has a visible lock mechanism.

Instruction - Once the door is unlocked, you can open it by pushing or pulling it open.

Observation - The door is made of wood and has a natural finish.

Instruction - Be gentle when opening the door to avoid damaging it.

Observation - The door is closed and there is no indication of it being open.

Instruction - If the door is not open, you would need to find the key to unlock it.

Observation - The keyhole is located on the top half of the do