## Predicting from a Local Image Path

In [1]:
# These are the same packages used for the webcam demo.
# Running this cell is optional if you have already run the installation cell at the top of the notebook.
!pip install git+https://github.com/huggingface/transformers.git \
             git+https://github.com/huggingface/accelerate.git \
             pillow torch einops bitsandbytes -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m108.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face Hub ID of the model and tokenizer loaded below
MODEL_ID = "apple/FastVLM-0.5B"
# Placeholder ID spliced into the token sequence at the position of the image
IMAGE_TOKEN_INDEX = -200

# Check for GPU availability once; the dtype choice below reuses this result
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# NOTE: trust_remote_code=True runs Python code downloaded from the model repository.
# Pass `revision="<commit>"` to both calls below to pin a version you have reviewed.

# Load the tokenizer from the pretrained model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the model with appropriate settings for the available device
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    # Use float16 for GPU acceleration, otherwise float32 for CPU.
    # `dtype` replaces the deprecated `torch_dtype` keyword.
    dtype=torch.float16 if device.type == "cuda" else torch.float32,
    device_map="auto",
    trust_remote_code=True,
).eval()

print("Model and tokenizer loaded successfully.")

Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

llava_qwen.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/apple/FastVLM-0.5B:
- llava_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/100 [00:00<?, ?B/s]

Model and tokenizer loaded successfully.


In [3]:
from PIL import Image
import torch

def predict_from_image_path(
    image_path: str,
    prompt_input: str,
    max_new_tokens: int = 128,
    temperature: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.1
) -> str:
    """
    Generates a caption for an image located at a local file path.

    Relies on the module-level ``tokenizer``, ``model`` and ``IMAGE_TOKEN_INDEX``
    defined when the model is loaded.

    Args:
        image_path (str): The path to the local image file.
        prompt_input (str): The text prompt to guide the model's generation.
        max_new_tokens (int, optional): Max tokens to generate. Defaults to 128.
        temperature (float, optional): Sampling temperature. A value <= 0 switches
            to greedy decoding, in which case ``temperature``, ``top_p`` and
            ``top_k`` are not forwarded to the model. Defaults to 0.7.
        top_p (float, optional): Nucleus sampling probability. Defaults to 0.9.
        top_k (int, optional): Top-k sampling. Defaults to 50.
        repetition_penalty (float, optional): Penalty for repeating tokens. Defaults to 1.1.

    Returns:
        str: The generated caption for the image. This function does not raise:
        on failure it returns a human-readable message starting with
        ``"Error:"`` (file not found) or ``"An unexpected error occurred:"``.
    """
    try:
        # Step 1: Load the image and convert to RGB. The context manager closes
        # the underlying file as soon as the converted copy has been made.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        # Step 2: Prepare the inputs for the model
        # Create the chat message structure with the image placeholder
        messages = [{"role": "user", "content": f"<image>\n{prompt_input}"}]
        rendered = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False
        )

        # Split the text around the first placeholder, tokenize both halves, and
        # reassemble them with the image token in between
        pre, post = rendered.split("<image>", 1)
        pre_ids = tokenizer(pre, return_tensors="pt", add_special_tokens=False).input_ids
        post_ids = tokenizer(post, return_tensors="pt", add_special_tokens=False).input_ids
        img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
        input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
        attention_mask = torch.ones_like(input_ids, device=model.device)

        # Process the image using the model's vision tower to get pixel values
        pixel_values = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]
        pixel_values = pixel_values.to(model.device, dtype=model.dtype)

        # Step 3: Generate the caption using the model
        do_sample = temperature > 0
        generation_kwargs = {
            "inputs": input_ids,
            "attention_mask": attention_mask,
            "images": pixel_values,
            "max_new_tokens": max_new_tokens,
            "repetition_penalty": repetition_penalty,
            "do_sample": do_sample,
        }
        if do_sample:
            # Sampling-only knobs; forwarding them for greedy decoding is at best
            # ignored and at worst rejected (e.g. temperature=0).
            generation_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)

        with torch.no_grad():
            out = model.generate(**generation_kwargs)

        # Decode the generated tokens into a string
        response = tokenizer.decode(out[0], skip_special_tokens=True)

        # If the output echoes the prompt, keep only what follows it. Matching the
        # decoded prompt tail (instead of the bare word "assistant") leaves
        # captions that merely mention "assistant" intact.
        prompt_tail = tokenizer.decode(post_ids[0], skip_special_tokens=True)
        if prompt_tail and prompt_tail in response:
            response = response.split(prompt_tail, 1)[1]
        return response.strip()

    except FileNotFoundError:
        return f"Error: The image file was not found at '{image_path}'."
    except Exception as e:
        return f"An unexpected error occurred: {e}"

In [None]:
from PIL import Image, ImageDraw, ImageFont

from pathlib import Path

# --- Example usage of the prediction function ---
# Path to the image to caption. Replace this with the path to your own image.
image_path = "/content/ללא שם.png"

# Define the prompt for the model
prompt = "What does this image show? Describe the color and text."

# --- Fall back to a generated dummy image ---
# Keeps the example self-contained and runnable: when the image above does not
# exist, a small test image is written to the working directory and used instead.
if not Path(image_path).is_file():
    dummy_image_path = "dummy_image.png"

    # Create a blank blue image
    img = Image.new("RGB", (400, 300), color=(73, 109, 137))
    d = ImageDraw.Draw(img)

    # Add some text to the image
    try:
        # Try to load a common font
        font = ImageFont.truetype("LiberationSans-Regular.ttf", 20)
    except OSError:
        # If the font is not available, use a basic default one
        font = ImageFont.load_default()
    d.text((50, 130), "This is a test image.", fill=(255, 255, 0), font=font)

    # Save the image and point the example at it
    img.save(dummy_image_path)
    print(f"Created a dummy image at: {dummy_image_path}")
    image_path = dummy_image_path

# Call the function to get the caption. It reports failures in its return value
# instead of raising, so no extra error handling is needed here.
caption = predict_from_image_path(image_path, prompt)

# Print the results
print("-" * 30)
print(f"Image Path: {image_path}")
print(f"Prompt: {prompt}")
print(f"Generated Caption: {caption}")
print("-" * 30)