In [1]:
!git clone https://github.com/dcxjn/prompting.git /content/prompting

fatal: destination path '/content/prompting' already exists and is not an empty directory.


In [1]:
import os
os.chdir('/content/prompting')

In [2]:
import sys
sys.path.append('/content/prompting')

In [4]:
!pip install transformers>=4.36.2
!pip install torch>=2.0.1
!pip install torchvision
!pip install bitsandbytes
!pip install accelerate
!pip install sentencepiece

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import torch

from PIL import Image

from src.utils.image_util import resize_image

In [5]:
def query(inputs: dict) -> dict:

    # Set tokenizer
    tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-VL-6B")

    # Load model
    model_id = AutoModelForCausalLM.from_pretrained(
        "01-ai/Yi-VL-6B",
        low_cpu_mem_usage=True,
        device_map="cuda",
        trust_remote_code=True
    ).eval()

    # Set quantization configuration
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    # Load image
    image = Image.open(inputs['image_path']).convert('RGB')

    # Define the pipe
    pipe = pipeline(
        "image-to-text",
        model=model_id,
        model_kwargs={"quantization_config": quantization_config},
    )

    prompt1 = f"""
    Observe the given image and its details.
    Provide a detailed step-by-step guide on how a human would complete the task of: {inputs["task"]}.
    Link each instruction to an observation in the image in this format: Observation - Instruction.
    """

    messages = [
        {"role": "user", "content": image},
        {"role": "user", "content": prompt1},
    ]

    output1 = pipe(
        image,
        prompt=messages,
        generate_kwargs={"max_new_tokens": 1024}
    )[0]["generated_text"]

    messages.append[
        {"role": "assistant", "content": output1}
    ]

    print("\n=== OUTPUT 1 ===\n") # for debugging
    print(output1)





    prompt2 = f"""
    Imagine you are in control of a robotic arm with the following commands: {inputs["bot_commands"]}
    Given the human instructions you have generated, provide a guide on how the robot would complete the task.
    """



    prompt3 = f"""
    By referencing an observation in the image, ensure each instruction is accurate. Do not make assumptions.
    Check that each instruction is logical.
    """



    return {"bot_inst": output1}


In [6]:
# Robot commands available
bot_commands = """
1. move_to(x, y)
2. grab(object)
3. release(object)
4. push(object)
5. pull(object)
6. rotate(angle)
"""

In [7]:
# image_path = input("Enter the path of the image: ")
# image_path = r"images/fridge_lefthandle.jpg"
# image_path = r"images/housedoor_knob_push.jpg"
# image_path = r"images/browndoor_knob_pull.jpg"
# image_path = r"images/labdoor_straighthandle_pull.jpg"
image_path = r"images/bluedoor_knob_push.jpg"
# image_path = r"images/whitetable.jpg"

In [8]:
resize_image(image_path, image_path)

In [9]:
# Define the task to be performed
task = input("Enter the task to be performed: ")

Enter the task to be performed: open the door


In [10]:
result = query(
    {
        "image_path": image_path,
        "task": task,
        "bot_commands": bot_commands,
    }
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 

In [None]:
print(torch.cuda.is_available())

In [None]:
print("\n==========\n")
print(result["bot_inst"])