In [None]:
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import bitsandbytes

In [None]:
# !pip install torch transformers psutil
# !pip install bitsandbytes

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
import psutil



# Select the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hugging Face Hub identifier of the checkpoint used throughout this notebook.
model_path = "ibm-granite/granite-vision-3.2-2b"

# Processor paired with the checkpoint (loaded from the same `model_path`).
processor = AutoProcessor.from_pretrained(model_path)

# Load the model with 8-bit bitsandbytes quantization.
# The bare `load_in_8bit=True` keyword is deprecated (Transformers emits a
# warning asking for a `BitsAndBytesConfig` passed via `quantization_config`),
# so the quantization settings are supplied through the config object instead.
# NOTE(review): `trust_remote_code=True` allows code shipped in the model repo
# to run locally; keep it only if this checkpoint actually needs it.
model = AutoModelForVision2Seq.from_pretrained(
    model_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    trust_remote_code=True,
)

# Print GPU memory footprint (if available)
def get_gpu_memory():
    """Print the CUDA memory currently allocated and reserved by PyTorch.

    Values are byte counts divided by 1024**2 and printed with an "MB" label.
    When CUDA is unavailable a single notice is printed instead.
    Returns None in both cases.
    """
    if not torch.cuda.is_available():
        print("🔴 CUDA not available.")
        return

    # Wait for queued GPU work to finish before reading the counters.
    torch.cuda.synchronize()

    bytes_per_unit = 1024**2
    allocated_mb = torch.cuda.memory_allocated() / bytes_per_unit
    reserved_mb = torch.cuda.memory_reserved() / bytes_per_unit

    print(f"🟢 GPU Memory Allocated: {allocated_mb:.2f} MB")
    print(f"🟡 GPU Memory Reserved : {reserved_mb:.2f} MB")

# Print CPU memory usage
def get_cpu_memory():
    """Print system-wide used and total RAM as reported by psutil.

    Values are byte counts divided by 1024**2 and printed with an "MB" label.
    Returns None.
    """
    bytes_per_unit = 1024**2
    vm_stats = psutil.virtual_memory()
    used_mb = vm_stats.used / bytes_per_unit
    total_mb = vm_stats.total / bytes_per_unit
    print(f"🧠 CPU Memory Used : {used_mb:.2f} MB")
    print(f"🧠 CPU Total Memory: {total_mb:.2f} MB")

# Emit the memory report: banner first, then the GPU figures, then the CPU figures.
print("\n=== Memory Footprint ===\n")
for report_memory in (get_gpu_memory, get_cpu_memory):
    report_memory()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


=== Memory Footprint ===

🟢 GPU Memory Allocated: 2966.77 MB
🟡 GPU Memory Reserved : 3008.00 MB
🧠 CPU Memory Used : 3081.58 MB
🧠 CPU Total Memory: 12978.96 MB


In [None]:
# Report the model's own footprint estimate (bytes / 1e9, labelled GB),
# falling back to a notice when the loaded object lacks the method.
if not hasattr(model, "get_memory_footprint"):
    print("Model does not support get_memory_footprint().")
else:
    model_gb = model.get_memory_footprint() / 1e9
    print(f"Model memory footprint: {model_gb:.2f} GB")

# On a CUDA machine, also show PyTorch's allocator counters for `device`,
# using the same bytes / 1e9 scaling as above.
if torch.cuda.is_available():
    allocated_gb = torch.cuda.memory_allocated(device) / 1e9
    reserved_gb = torch.cuda.memory_reserved(device) / 1e9
    print("GPU Memory Stats:")
    print(f"  Allocated: {allocated_gb:.2f} GB")
    print(f"  Reserved:  {reserved_gb:.2f} GB")

Model memory footprint: 2.37 GB
GPU Memory Stats:
  Allocated: 2.44 GB
  Reserved:  2.56 GB


In [None]:
import os

# NOTE(review): it is not evident from this notebook which library reads this
# variable — confirm it has an effect before relying on it.
os.environ["FLASH_ATTENTION_2_ENABLED"] = "0"

# Choose which scaled-dot-product-attention kernels PyTorch may use:
# flash and memory-efficient kernels off, the plain math kernel on.
# These are process-wide switches and stay in force for every later cell.
_sdp_switches = (
    (torch.backends.cuda.enable_flash_sdp, False),
    (torch.backends.cuda.enable_mem_efficient_sdp, False),
    (torch.backends.cuda.enable_math_sdp, True),
)
for _set_backend, _enabled in _sdp_switches:
    _set_backend(_enabled)


In [None]:
# pip install transformers==4.47.0
# #

In [None]:
# Run one image + text prompt through the model loaded earlier in this notebook.
#
# This cell reuses `processor` and `model` from the model-loading cell. The
# earlier version re-created the processor from `model_id`, a name that is
# never defined in this notebook, so it failed with NameError on a fresh kernel.
# NOTE(review): the old `<|image_1|>` placeholder and `num_crops=4` kwarg look
# like conventions from a different model family than the checkpoint loaded
# above; the prompt is now built through the processor's own chat template.
# Confirm the message format against the checkpoint's model card.

image_url = "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg"

# Fetch the slide image. The timeout keeps a stalled download from hanging the
# kernel, and raise_for_status() surfaces HTTP errors before PIL tries to parse
# an error page as an image.
http_response = requests.get(image_url, stream=True, timeout=30)
http_response.raise_for_status()
image = Image.open(http_response.raw).convert("RGB")

# One user turn holding an image slot followed by the text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Summarize the deck of slides."},
        ],
    },
]

# Render the conversation to a prompt string (tokenization happens below,
# together with the image preprocessing).
prompt = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Tokenize the text, preprocess the image, and move the tensors to wherever the
# model was placed, rather than assuming "cuda:0".
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

# Greedy decoding (do_sample=False) makes the output repeatable.
generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
    do_sample=False
)

# generate() returns prompt + continuation; keep only the newly generated part.
generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]

# Decode the single sequence in the batch back to text.
response = processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)[0]

print("\n=== Model Output ===\n")
print(response)


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48



=== Model Output ===

The presentation is about Microsoft Azure, focusing on its three types: Hyper-scale, Enterprise, and Hybrid. The presenter is Dinesh Kumar Wickramasinghe, a Senior Software Engineer from CMS Private Limited in Sri Lanka. The design is clean and professional, with a blue color scheme and hexagonal shapes. the word 'Hyper-scale' in the image. the word 'Hybrid' in the image. the word 'Hybrid' in the image.


In [None]:
# Release the model, processor and leftover inference tensors.
#
# Safe to re-run: names are removed with pop(..., None), so the cell does not
# raise NameError when something was already deleted or never created.
import gc

print("\nCleaning up...")

# `inputs` and `generate_ids` come from the inference cell and are dropped
# along with the model and processor.
for _name in ("model", "processor", "inputs", "generate_ids"):
    globals().pop(_name, None)

# Collect unreachable objects first so their CUDA memory is actually free,
# then ask PyTorch to hand its cached blocks back to the driver.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Model and processor removed from memory. GPU cache cleared (if applicable).")


Cleaning up...
Model and processor removed from memory. GPU cache cleared (if applicable).
