In [None]:
!git clone https://github.com/dcxjn/prompting.git /content/prompting

In [None]:
import os

# Switch the working directory to the cloned checkout so that relative paths
# used later in the notebook are resolved against it.
REPO_DIR = '/content/prompting'
os.chdir(REPO_DIR)

In [None]:
import sys

# Make the cloned checkout importable. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
if '/content/prompting' not in sys.path:
    sys.path.append('/content/prompting')

In [None]:
!pip install bitsandbytes
!pip install accelerate

In [None]:
import accelerate
import requests
import torch
from PIL import Image
from transformers import (
    BitsAndBytesConfig,
    LlavaNextForConditionalGeneration,
    LlavaNextProcessor,
)

from src.utils.image_util import resize_image

In [None]:
def _load_image(source):
    """Open an image from either an http(s) URL or a local file path."""
    location = str(source)
    if location.startswith(("http://", "https://")):
        # Remote image: stream it and fail loudly on HTTP errors instead of
        # handing an error page to PIL.
        response = requests.get(location, stream=True, timeout=30)
        response.raise_for_status()
        return Image.open(response.raw)
    # Anything else is treated as a path on the local filesystem.
    return Image.open(location)


def query(inputs: dict) -> dict:
    """Ask LLaVA-NeXT for step-by-step instructions grounded in an image.

    Args:
        inputs: Dictionary with the keys
            "image_path"   - local path or http(s) URL of the image,
            "task"         - description of the task to be performed,
            "bot_commands" - text listing the commands the robot supports.

    Returns:
        A dictionary ``{"bot_inst": <str>}`` holding the decoded text of the
        first generated sequence.

    Note:
        The processor and the 4-bit quantized model are loaded on every call.
    """
    model_id = "llava-hf/llava-v1.6-vicuna-7b-hf"

    processor = LlavaNextProcessor.from_pretrained(model_id)

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    model = LlavaNextForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",
    )

    # prepare image and text prompt, using the appropriate prompt template
    image = _load_image(inputs["image_path"])

    prompt1 = f"""
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: {inputs["task"]}.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    """

    # Must be an f-string, otherwise the literal text "{prompt1}" is sent to the model.
    prompt = f"USER: <image>\n{prompt1} ASSISTANT:"

    # Keyword arguments keep the call independent of the processor's positional
    # order; a separate name avoids clobbering the `inputs` dict that is still
    # needed below.
    model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    outputs = model.generate(**model_inputs, max_new_tokens=1024)

    output1 = processor.decode(outputs[0], skip_special_tokens=True)
    print("\n=== OUTPUT 1 ===\n") # for debugging
    print(output1)

    # TODO: prompt2 and prompt3 are follow-up prompts that are built here but
    # are not yet sent to the model.
    prompt2 = f"""
    Imagine you are in control of a robotic arm with the following commands: {inputs["bot_commands"]}
    Given the human instructions you have generated, provide a guide on how the robot would complete the task.
    """

    prompt3 = """
    By referencing an observation in the image, ensure each instruction is accurate. Do not make assumptions.
    Check that each instruction is logical.
    """

    # `processor.decode` returns a plain string, so it is returned directly.
    return {"bot_inst": output1}

In [None]:
# Command vocabulary of the robotic arm, passed to the model as prompt text.
# Each entry sits on its own line, indented by four spaces.
bot_commands = (
    "\n"
    "    1. move_to(x, y)\n"
    "    2. grab(object)\n"
    "    3. release(object)\n"
    "    4. push(object)\n"
    "    5. pull(object)\n"
    "    6. rotate(angle)\n"
)

In [None]:
# Image selection: exactly one assignment below should be active.
# Relative paths are resolved against the current working directory.
#
# Alternatives (uncomment one and comment out the active line to switch):
# image_path = input("Enter the path of the image: ")
# image_path = "images/fridge_lefthandle.jpg"
# image_path = "images/housedoor_knob_push.jpg"
# image_path = "images/browndoor_knob_pull.jpg"
# image_path = "images/bluedoor_knob_push.jpg"
# image_path = "images/whitetable.jpg"
image_path = "images/labdoor_straighthandle_pull.jpg"

In [None]:
# Run the project's resize helper on the selected image, passing the same path
# for both arguments.
# NOTE(review): presumably the arguments are (source, destination), i.e. the
# file is overwritten in place — confirm against src/utils/image_util.py.
resize_image(image_path, image_path)

In [None]:
# Ask the user interactively which task the instructions should be generated for.
task_question = "Enter the task to be performed: "
task = input(task_question)

In [None]:
# Bundle the image, the task and the robot command list, then run the query.
query_inputs = dict(
    image_path=image_path,
    task=task,
    bot_commands=bot_commands,
)
result = query(query_inputs)

In [None]:
# Show the generated instructions below a visual separator.
print("\n==========\n", result["bot_inst"], sep="\n")