In [11]:
# Optional FlashAttention install, kept commented out so the notebook runs without it.
# The pip log recorded below comes from the pinned 1.0.9 variant.
# !pip install flash-attn --no-build-isolation
# !pip install flash-attn==1.0.9

Collecting flash-attn==1.0.9
  Downloading flash_attn-1.0.9.tar.gz (1.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 15.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting ninja (from flash-attn==1.0.9)
  Using cached ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Using cached ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 

In [2]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
from transformers.utils import is_flash_attn_2_available

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Device: {DEVICE}")

Device: cuda


In [6]:
# Fetch the example picture that the model will be asked to describe.
IMAGE_URL = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
image = load_image(IMAGE_URL)

In [12]:
# Initialize processor and model
MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"

# FlashAttention is only requested when it can actually run: on older cards it fails with
# "RuntimeError: FlashAttention only supports Ampere GPUs or newer.", so require
# compute capability >= 8 (Ampere) and a FlashAttention-2 install that transformers accepts.
use_flash_attention = (
    DEVICE == "cuda"
    and torch.cuda.get_device_capability()[0] >= 8
    and is_flash_attn_2_available()
)
# Omit the kwarg otherwise so transformers keeps its own default attention backend.
attention_kwargs = (
    {"attn_implementation": "flash_attention_2"} if use_flash_attention else {}
)

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    **attention_kwargs,
).to(DEVICE)

In [13]:
# Build a single-turn chat: one user message made of an image placeholder
# followed by the text question.
image_part = {"type": "image"}
question_part = {"type": "text", "text": "Can you describe this image?"}
user_turn = {"role": "user", "content": [image_part, question_part]}
messages = [user_turn]

In [14]:
# Render the chat into a prompt string (generation prompt appended), then run the
# processor on the prompt plus the image and move the resulting batch to DEVICE.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)

In [15]:
# Generate up to MAX_NEW_TOKENS new tokens, then decode every returned sequence to text.
# The decoded strings include the prompt turn as well as the reply (see the output below).
MAX_NEW_TOKENS = 500
generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

In [16]:
# Display the raw list of decoded strings (repr form, so newlines appear as "\n").
generated_texts

["User:\n\n\n\nCan you describe this image?\nAssistant: The image depicts a panoramic view of a cityscape, specifically focusing on the iconic Statue of Liberty, a prominent symbol of freedom and democracy. The statue is situated on Liberty Island, which is located in the heart of New York City. The statue is a replica of the original Liberty Bell, which was the first Liberty Bell and was used as a symbol of freedom in the United States.\n\nThe statue is characterized by its green, flowing hair and the crown atop its head, which is a replica of the crown of Queen Victoria. The statue is mounted on a pedestal that is surrounded by water, with a small strip of land extending into the water. The water is calm, reflecting the sky and the statue, creating a serene and picturesque scene.\n\nIn the background, the cityscape is dominated by numerous skyscrapers, which are typical of New York City. The buildings are of various heights and architectural styles, showcasing the city's diverse and 

In [17]:
# Print the first decoded sequence so its line breaks render as real newlines.
first_text = generated_texts[0]
print(first_text)

User:



Can you describe this image?
Assistant: The image depicts a panoramic view of a cityscape, specifically focusing on the iconic Statue of Liberty, a prominent symbol of freedom and democracy. The statue is situated on Liberty Island, which is located in the heart of New York City. The statue is a replica of the original Liberty Bell, which was the first Liberty Bell and was used as a symbol of freedom in the United States.

The statue is characterized by its green, flowing hair and the crown atop its head, which is a replica of the crown of Queen Victoria. The statue is mounted on a pedestal that is surrounded by water, with a small strip of land extending into the water. The water is calm, reflecting the sky and the statue, creating a serene and picturesque scene.

In the background, the cityscape is dominated by numerous skyscrapers, which are typical of New York City. The buildings are of various heights and architectural styles, showcasing the city's diverse and modern skyl