## Simple BLIP-2 Demo

In [5]:
from PIL import Image
import requests
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only used on the GPU. float16 inference on CPU is poorly
# supported by PyTorch, so full precision is kept there.
dtype = torch.float16 if device == "cuda" else torch.float32

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=dtype
)
model.to(device)

# Download the sample image. The timeout keeps the cell from hanging forever
# and raise_for_status() surfaces HTTP errors before PIL tries to parse the
# response body as an image.
img_url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
prompt = "Question: how many people are in the photo? Answer:"
# The inputs must live on the same device and use the same dtype as the model.
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, dtype)

generated_ids = model.generate(**inputs)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]


two


In [12]:
# Bare expression: the notebook displays the model's module tree as the cell output.
model

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((

In [90]:
# Overview of the model's direct children: submodule name -> class name.
model_parts = dict(
    (child_name, type(child).__name__)
    for child_name, child in model.named_children()
)
print(model_parts)

{'vision_model': 'Blip2VisionModel', 'qformer': 'Blip2QFormerModel', 'language_projection': 'Linear', 'language_model': 'OPTForCausalLM'}


In [91]:
# Names of the submodules whose Linear layers should be quantized.
target_modules = ["vision_model"]

In [96]:
import torch
import torch.nn as nn
from typing import List, Union

def gptq_quantize_layer(layer: nn.Linear, bits: int = 4, block_size: int = 128, percdamp: float = 0.01):
    """Round a linear layer's weights to a ``bits``-bit grid and add a correction term.

    The layer is modified in place: ``layer.weight.data`` is replaced by
    ``rounded + (W - rounded) @ H^{-1/2}``, where

    * ``rounded`` is ``round(W * scale) / scale`` with
      ``scale = 2 ** (bits - 1) - 1`` (no clamping, so values outside
      ``[-1, 1]`` are rounded but not clipped), and
    * ``H`` is the damped Gram matrix ``W^T W / rows + percdamp * I`` built
      from the weights themselves.

    NOTE(review): despite the name this is only loosely GPTQ-like. No
    calibration activations are used, and because the correction term is added
    after rounding, the stored weights are generally not on the ``bits``-bit
    grid. Confirm this is the intended behaviour before relying on it for
    compression.

    Args:
        layer: Linear layer whose weight matrix is rewritten in place. The
            bias is left untouched.
        bits: Bit width that determines the rounding grid. Must be >= 2,
            because ``bits == 1`` yields ``scale == 0`` and NaN weights.
        block_size: Number of weight rows accumulated into the Gram matrix
            per step. Must be >= 1.
        percdamp: Constant added to the Gram matrix diagonal to keep it well
            conditioned.

    Returns:
        The same ``layer`` object, with its weights replaced and cast back to
        their original dtype.

    Raises:
        ValueError: If ``bits < 2`` or ``block_size < 1``.
    """
    if bits < 2:
        raise ValueError(f"bits must be >= 2, got {bits}")
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    orig_dtype = layer.weight.data.dtype  # restored on the way out
    # All arithmetic is done in float32 so half-precision layers stay accurate.
    W = layer.weight.data.clone().float()
    rows, cols = W.shape

    # Gram matrix of the weights, accumulated block by block. The dtype is
    # pinned to W's so the result does not depend on the global default dtype.
    H = torch.zeros((cols, cols), device=W.device, dtype=W.dtype)
    for start in range(0, rows, block_size):
        W_block = W[start:start + block_size]
        H.add_(W_block.t() @ W_block)
    H.div_(rows)
    H.add_(torch.eye(cols, device=W.device, dtype=W.dtype) * percdamp)

    # H is symmetric, so H^{-1/2} = Q diag(1 / sqrt(lambda)) Q^T.
    eigenvalues, eigenvectors = torch.linalg.eigh(H)
    inv_sqrt_eigenvalues = 1 / torch.sqrt(eigenvalues + 1e-8)
    # Scaling the columns of Q equals Q @ diag(...) without materialising
    # the dense diagonal matrix.
    inv_sqrt_H = (eigenvectors * inv_sqrt_eigenvalues) @ eigenvectors.t()

    scale = (2 ** (bits - 1)) - 1
    rounded = torch.round(W * scale) / scale

    # Apply the correction to every row at once instead of looping in Python.
    residual = W - rounded
    corrected = rounded + residual @ inv_sqrt_H

    layer.weight.data = corrected.to(orig_dtype)
    return layer

def apply_gptq_to_selected_modules(model: nn.Module, target_modules: List[str], bits: int = 4, apply=None):
    """Quantize every ``nn.Linear`` that lives inside one of the named submodules.

    The module tree is walked recursively in one of two modes:

    * search mode (``apply`` is ``None`` or ``False``): children are matched
      against ``target_modules`` by their attribute name. A match switches
      that subtree to apply mode; everything else is searched further without
      being modified.
    * apply mode (``apply`` is ``True``): every ``nn.Linear`` found in the
      subtree is passed to ``gptq_quantize_layer``.

    Args:
        model: Root module to walk. It is modified in place.
        target_modules: Child names (as yielded by ``named_children``) whose
            subtrees should be quantized.
        bits: Bit width forwarded to ``gptq_quantize_layer``.
        apply: Internal recursion flag; callers normally leave it at ``None``.

    Returns:
        The same ``model`` object that was passed in.
    """
    for name, module in model.named_children():
        if apply:
            # Inside a target subtree: quantize Linear layers, descend into the rest.
            if isinstance(module, nn.Linear):
                print(f"Found a layer to quantize {name}")
                gptq_quantize_layer(module, bits)
            else:
                apply_gptq_to_selected_modules(module, target_modules, bits, True)
        elif name in target_modules:
            print(f"Applying GPTQ to {name} module")
            if isinstance(module, nn.Linear):
                # The target itself is a Linear layer (it has no children to visit).
                print(f"Found a layer to quantize {name}")
                gptq_quantize_layer(module, bits)
            else:
                apply_gptq_to_selected_modules(module, target_modules, bits, True)
        else:
            # Not a target: keep searching below, but never quantize here.
            # Treating any non-None flag as "apply" used to quantize
            # every Linear layer in non-target modules as well.
            apply_gptq_to_selected_modules(module, target_modules, bits, False)
    return model

In [None]:
# The layers are rewritten in place and the same object is returned, so
# `quantized_model` is an alias of `model`, not an independent copy.
quantized_model = apply_gptq_to_selected_modules(
    model=model,
    target_modules=target_modules,
    bits=8,
)

Applying GPTQ to vision_model module
Found a layer to quantize qkv
Found a layer to quantize projection
Found a layer to quantize fc1
Found a layer to quantize fc2
Found a layer to quantize qkv
Found a layer to quantize projection
Found a layer to quantize fc1
Found a layer to quantize fc2
Found a layer to quantize qkv
Found a layer to quantize projection
Found a layer to quantize fc1
Found a layer to quantize fc2
Found a layer to quantize qkv
Found a layer to quantize projection
Found a layer to quantize fc1
Found a layer to quantize fc2
Found a layer to quantize qkv
Found a layer to quantize projection
Found a layer to quantize fc1
Found a layer to quantize fc2
Found a layer to quantize qkv
Found a layer to quantize projection
Found a layer to quantize fc1
Found a layer to quantize fc2
Found a layer to quantize qkv
Found a layer to quantize projection
Found a layer to quantize fc1
Found a layer to quantize fc2
Found a layer to quantize qkv
Found a layer to quantize projection
Found a

In [64]:
# Bare expression: the notebook displays the module tree of the model returned by the quantization call.
quantized_model

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((