In [3]:
!pip install gradio transformers accelerate --upgrade

Collecting gradio
  Downloading gradio-5.29.1-py3-none-any.whl.metadata (16 kB)
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.10-py3-none-manylinux_

In [8]:
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
# Run on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "HuggingFaceTB/SmolVLM-Instruct"
# Load model and processor once at module level so every request reuses them
# instead of reloading the weights inside the handler.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    # Use bfloat16 only on GPU; on CPU load the weights in float32, the same
    # dtype selection the Version 2 loader in this notebook uses.
    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
    _attn_implementation="eager",
).to(DEVICE)
def image_captioning(image_input, prompt):
    """Answer a prompt about an image using the module-level SmolVLM model.

    Args:
        image_input: The image to describe, as passed in by the Gradio image
            component; may be ``None`` if the user clicked Run without
            supplying one.
        prompt: The instruction or question to ask about the image.

    Returns:
        Only the newly generated text (the prompt tokens are sliced off before
        decoding), with special tokens removed.

    Raises:
        gr.Error: If no image was supplied, so the UI shows a readable message
            instead of an opaque failure from the processor.
    """
    if image_input is None:
        raise gr.Error("Please provide an image before running.")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    # Keep the templated string under its own name rather than overwriting the
    # caller-supplied `prompt` argument.
    chat_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=chat_prompt, images=[image_input], return_tensors="pt").to(DEVICE)
    prompt_length = inputs["input_ids"].shape[1]

    # Pure inference: no autograd bookkeeping is needed while generating.
    with torch.inference_mode():
        generated_ids = model.generate(**inputs, max_new_tokens=200)

    # The generated sequence starts with the prompt tokens; decode only what
    # follows them so the caption does not echo the prompt.
    return processor.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=True,
    )
# Build the demo page: an image input and a prompt box feed image_captioning,
# whose return value is shown in the caption box.
with gr.Blocks() as demo:
    gr.Markdown("# SmolVLM Demo")

    image_input = gr.Image(label="Input Image")
    prompt_image = gr.Textbox(value="Describe this image.", label="Prompt")
    caption_output = gr.Textbox(label="Caption Output")
    image_button = gr.Button("Run")

    # Clicking "Run" calls the handler with (image, prompt) in that order.
    image_button.click(
        fn=image_captioning,
        inputs=[image_input, prompt_image],
        outputs=[caption_output],
    )

# share=True asks Gradio for a temporary public URL in addition to the local one.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://74d6f4dfa94bbdc21a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




***Version 2 of SmolVLM with Gradio***

In [2]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

import importlib.util

# Run on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"

# FlashAttention-2 is only usable when the optional `flash_attn` package is
# installed and the GPU is Ampere or newer (compute capability >= 8.0).
# Requesting it otherwise makes from_pretrained fail, so fall back to the
# "eager" implementation that the CPU path already uses.
USE_FLASH_ATTENTION = (
    DEVICE == "cuda"
    and importlib.util.find_spec("flash_attn") is not None
    and torch.cuda.get_device_capability()[0] >= 8
)

# Loaded once at module level so the request handler can reuse them.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
    _attn_implementation="flash_attention_2" if USE_FLASH_ATTENTION else "eager",
).to(DEVICE)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
from PIL import Image
import gradio as gr

def smolvlm_ocr(image: Image.Image, prompt_text: str = "What is in this image?") -> str:
    """Run the module-level SmolVLM model on an image and return its answer.

    Args:
        image: The PIL image to read or describe; may be ``None`` if the UI
            submits without an upload.
        prompt_text: The instruction or question to ask about the image.

    Returns:
        The generated answer with special tokens removed and without the
        echoed chat prompt. On any failure, a string of the form
        ``"Error: <message>"`` is returned instead of raising, so the message
        is displayed in the output text box.
    """
    if image is None:
        return "Error: no image provided."

    try:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": prompt_text}
                ]
            }
        ]

        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
        prompt_length = inputs["input_ids"].shape[1]

        # Pure inference: no autograd bookkeeping is needed while generating.
        with torch.inference_mode():
            generated_ids = model.generate(**inputs, max_new_tokens=300)

        # The generated sequence starts with the prompt tokens; drop them so the
        # output box shows only the model's answer, not the echoed prompt.
        output_text = processor.batch_decode(
            generated_ids[:, prompt_length:],
            skip_special_tokens=True,
        )[0]
        return output_text
    except Exception as e:
        # Deliberate best-effort: report the failure as text in the UI.
        return f"Error: {e}"


In [4]:
# Version 2 UI: an image upload plus an optional prompt go in, a single text box
# comes out, and every submission is handled by smolvlm_ocr.
ocr_app = gr.Interface(
    title="SmolVLM OCR / Image Captioning",
    fn=smolvlm_ocr,
    inputs=[
        gr.Image(label="Upload an Invoice or Handwritten Image", type="pil"),
        gr.Textbox(value="Describe the contents of this image.", label="Prompt (optional)"),
    ],
    outputs=gr.Textbox(label="OCR Output"),
)
# share=True asks Gradio for a temporary public URL in addition to the local one.
ocr_app.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f07f2653cbb6a4e55f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


