In [23]:
from transformers import Blip2ForConditionalGeneration, Blip2Config, Blip2Processor

# Load the configuration
config = Blip2Config.from_pretrained('Salesforce/blip2-flan-t5-xl')

# Optionally, modify the configuration to make it smaller (if the architecture allows)
# This part might require knowledge about the internal configuration parameters of BLIP-2
# Initialize the model with the modified configuration
config.vision_config.num_layers = 1
config.text_config.num_layers = 1
config.vision_config.hidden_size = 32  # Reduce the hidden size
config.vision_config.num_attention_heads = 2  # Reduce the number of attention heads
config.vision_config.num_hidden_layers = 1  # Reduce the number of layers

# For the text encoder
config.text_config.hidden_size = 32  # Reduce the hidden size
config.text_config.num_attention_heads = 2  # Reduce the number of attention heads
config.text_config.num_hidden_layers = 1  # Reduce the number of layers
model = Blip2ForConditionalGeneration(config)
print("Model loaded with random initialization:")
print(model)


Model loaded with random initialization:
Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 32, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0): Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=32, out_features=96, bias=True)
            (projection): Linear(in_features=32, out_features=32, bias=True)
          )
          (layer_norm1): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=32, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=32, bias=True)
          )
          (layer_norm2): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernor

In [24]:
tokenizer = Blip2Processor.from_pretrained('Salesforce/blip2-flan-t5-xl')

In [25]:
tokenizer

Blip2Processor:
- image_processor: BlipImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_convert_rgb",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "BlipImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "Blip2Processor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

- tokenizer: T5TokenizerFast(name_or_path='Salesforce/blip2-flan-t5-xl', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_

In [27]:
tokenizer(text="Testfs asf asf")

{'input_ids': [2300, 89, 7, 38, 89, 38, 89, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
model

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 32, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0): Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=32, out_features=96, bias=True)
            (projection): Linear(in_features=32, out_features=32, bias=True)
          )
          (layer_norm1): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=32, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=32, bias=True)
          )
          (layer_norm2): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((32,), eps=1e-06, elementwi