In [1]:
!git clone https://github.com/dcxjn/prompting.git /content/prompting

Cloning into '/content/prompting'...
remote: Enumerating objects: 533, done.[K
remote: Counting objects: 100% (533/533), done.[K
remote: Compressing objects: 100% (404/404), done.[K
remote: Total 533 (delta 158), reused 473 (delta 110), pack-reused 0[K
Receiving objects: 100% (533/533), 9.28 MiB | 9.09 MiB/s, done.
Resolving deltas: 100% (158/158), done.


In [2]:
import os
os.chdir('/content/prompting')

In [3]:
import sys
sys.path.append('/content/prompting')

In [4]:
!pip install bitsandbytes
!pip install accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (41

In [12]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, BitsAndBytesConfig
import torch
from PIL import Image
import accelerate

from src.utils.image_util import resize_image

In [27]:
def query(inputs: dict) -> dict:

    # Load processor
    processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf")

    # Set quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Load model
    model = LlavaNextForConditionalGeneration.from_pretrained(
        "llava-hf/llava-v1.6-vicuna-7b-hf",
        quantization_config=quantization_config,
        device_map="cuda"
    )

    # Load image
    image = Image.open(inputs["image_path"])

    prompt1 = f"""
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: {inputs["task"]}.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    """

    prompt = "USER: <image>\n" + prompt1 + "ASSISTANT:"
    input1 = processor(prompt, image, return_tensors="pt").to("cuda")
    outputs = model.generate(**input1, max_new_tokens=2048)

    output1 = processor.decode(outputs[0], skip_special_tokens=True)
    print("\n=== OUTPUT 1 ===\n") # for debugging
    print(output1)

    prompt2 = f"""
    Imagine you are in control of a robotic arm with the following commands: {inputs["bot_commands"]}
    Given the human instructions you have generated, provide a guide on how the robot would complete the task.
    """

    prompt3 = f"""
    By referencing an observation in the image, ensure each instruction is accurate. Do not make assumptions.
    Check that each instruction is logical.
    """

    return {"bot_inst": output1}

In [7]:
# Robot commands available
bot_commands = """
    1. move_to(x, y)
    2. grab(object)
    3. release(object)
    4. push(object)
    5. pull(object)
    6. rotate(angle)
"""

In [8]:
# image_path = input("Enter the path of the image: ")
# image_path = r"images/fridge_lefthandle.jpg"
# image_path = r"images/housedoor_knob_push.jpg"
# image_path = r"images/browndoor_knob_pull.jpg"
image_path = r"images/labdoor_straighthandle_pull.jpg"
# image_path = r"images/bluedoor_knob_push.jpg"
# image_path = r"images/whitetable.jpg"

In [9]:
resize_image(image_path, image_path)

In [10]:
# Define the task to be performed
task = input("Enter the task to be performed: ")

Enter the task to be performed: open the door


In [28]:
result = query(
    {
        "image_path": image_path,
        "task": task,
        "bot_commands": bot_commands,
    }
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


=== OUTPUT 1 ===

USER: 

    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: open the door.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    ASSISTANT: Observ


In [29]:
print("\n==========\n")
print(result["bot_inst"])



USER: 

    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: open the door.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    ASSISTANT: Observ
