In [1]:
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig
from PIL import Image
import requests

In [2]:
# Checkpoint identifier passed to every from_pretrained() call in this notebook.
CHECK_POINT: str = "Salesforce/instructblip-vicuna-13b"

In [3]:
# Fetch only the configuration for the checkpoint.
config = InstructBlipConfig.from_pretrained(CHECK_POINT)

# Instantiate the architecture under accelerate's init_empty_weights() so the
# module tree exists for device-map planning without loading the checkpoint
# weights here (the real weights are loaded later via from_pretrained).
with init_empty_weights():
    model = InstructBlipForConditionalGeneration(config=config)



In [4]:
# accelerate asks for tie_weights() to be called before inferring a device map
# (see the message it prints when it considers the weights untied).
model.tie_weights()

# Per-GPU memory budget used for planning. One spelling of the unit is used for
# every device instead of a hand-written mix of "10GiB" and "10Gib".
NUM_GPUS = 8
MAX_MEMORY_PER_GPU = "10GiB"
max_memory = {gpu_index: MAX_MEMORY_PER_GPU for gpu_index in range(NUM_GPUS)}

# Plan the placement of each submodule across the GPUs, sized for float32
# weights. Modules of the classes listed in no_split_module_classes are kept
# whole on a single device.
device_map = infer_auto_device_map(
    model,
    no_split_module_classes=["LlamaDecoderLayer", "VisionTransformer"],
    dtype="float32",
    max_memory=max_memory,
)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


In [5]:
# Display the inferred placement: module name -> device index.
device_map

OrderedDict([('query_tokens', 0),
             ('vision_model', 0),
             ('qformer', 0),
             ('language_projection', 0),
             ('language_model.model.embed_tokens', 0),
             ('language_model.model.layers.0', 0),
             ('language_model.model.layers.1', 0),
             ('language_model.model.layers.2', 0),
             ('language_model.model.layers.3', 1),
             ('language_model.model.layers.4', 1),
             ('language_model.model.layers.5', 1),
             ('language_model.model.layers.6', 1),
             ('language_model.model.layers.7', 1),
             ('language_model.model.layers.8', 1),
             ('language_model.model.layers.9', 1),
             ('language_model.model.layers.10', 1),
             ('language_model.model.layers.11', 2),
             ('language_model.model.layers.12', 2),
             ('language_model.model.layers.13', 2),
             ('language_model.model.layers.14', 2),
             ('language_model.model.l

In [6]:
# Put the language model's output head on the same device as its input
# embeddings.
# NOTE(review): presumably a workaround for a cross-device error during
# generation -- confirm it is still needed with the installed library versions.
embed_device = device_map["language_model.model.embed_tokens"]
device_map["language_model.lm_head"] = embed_device

In [7]:
# Display the placement map again after the manual lm_head entry.
device_map

OrderedDict([('query_tokens', 0),
             ('vision_model', 0),
             ('qformer', 0),
             ('language_projection', 0),
             ('language_model.model.embed_tokens', 0),
             ('language_model.model.layers.0', 0),
             ('language_model.model.layers.1', 0),
             ('language_model.model.layers.2', 0),
             ('language_model.model.layers.3', 1),
             ('language_model.model.layers.4', 1),
             ('language_model.model.layers.5', 1),
             ('language_model.model.layers.6', 1),
             ('language_model.model.layers.7', 1),
             ('language_model.model.layers.8', 1),
             ('language_model.model.layers.9', 1),
             ('language_model.model.layers.10', 1),
             ('language_model.model.layers.11', 2),
             ('language_model.model.layers.12', 2),
             ('language_model.model.layers.13', 2),
             ('language_model.model.layers.14', 2),
             ('language_model.model.l

In [8]:
# Load the real checkpoint weights, placing each submodule on the device chosen
# in device_map, and the matching processor for image + text preprocessing.
model = InstructBlipForConditionalGeneration.from_pretrained(CHECK_POINT, device_map=device_map)
processor = InstructBlipProcessor.from_pretrained(CHECK_POINT)

url = "https://gker-love.oss-cn-beijing.aliyuncs.com/Naive/messages/6e6c01ed-29bb-447d-8790-4f068d0b6e8a/da6a1872-5d75-478d-a5ac-8e5e24864df4.jpeg"

# Download the demo image.
# - timeout: a stalled connection fails instead of hanging the cell forever.
# - raise_for_status(): an HTTP error is reported as such rather than being
#   passed to PIL as if it were image bytes.
# - context manager: the streamed connection is released when the block exits.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # convert() forces the image data to be read while the stream is still open.
    image = Image.open(response.raw).convert("RGB")

prompt = "What do you see in the image?"
# Move the processed inputs to the default CUDA device.
# NOTE(review): assumes the modules that consume the inputs were placed on
# device 0 by device_map, as in the displayed map -- confirm if the map changes.
inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")

# Deterministic beam search. The sampling-only options (top_p, temperature)
# are not passed because they have no effect when do_sample=False.
outputs = model.generate(
    **inputs,
    do_sample=False,
    num_beams=5,
    max_length=256,
    min_length=1,
    repetition_penalty=1.5,
    length_penalty=1.0,
)

# Decode the first (only) sequence in the batch and trim surrounding whitespace.
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
print(generated_text)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]



In the image, there is a large brick building with a clock tower on top of it. The building is situated in the middle of a city street, surrounded by trees and other buildings. There is also a car parked on the side of the road near the building.
