In [7]:
from PIL import Image  # 导入PIL库中的Image模块，用于图像处理
import requests  # 导入requests库，用于发送HTTP请求
from transformers import Blip2Processor, Blip2ForConditionalGeneration  # 导入transformers库中的BLIP2相关模型
import torch  # 导入PyTorch库

# Pick the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the BLIP-2 processor, which prepares image and text inputs for the model.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Load the BLIP-2 model weights in float16 to reduce memory usage.
# NOTE(review): float16 is requested regardless of `device`; confirm this is
# intended when the notebook runs on a CPU-only machine.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
)

# Move the model to the selected device (Module.to returns the same module).
model = model.to(device)

# url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # source URL of the image (not fetched here)
# Open the local copy of the image; nothing is downloaded in this cell.
image = Image.open('000000039769.jpg')

# Show the processor configuration, followed by a separator line.
print(processor)
print('-' * 100)

# Preprocess the image into PyTorch tensors, then move them to the device as float16.
inputs = processor(images=image, return_tensors="pt")
inputs = inputs.to(device, torch.float16)
print(inputs["pixel_values"].shape)  # shape of the preprocessed image tensor

# Generate caption token IDs from the image features.
generated_ids = model.generate(**inputs)

# Separator line, then the full model structure.
print('-' * 100)
print(model)

# Decode the token IDs into text, dropping special tokens and surrounding whitespace.
generated_text = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)[0].strip()
print(generated_text)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  41%|####1     | 7.08G/17.1G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Blip2Processor:
- image_processor: BlipImageProcessor {
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "BlipImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "Blip2Processor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

- tokenizer: GPT2TokenizerFast(name_or_path='Salesforce/blip2-opt-2.7b', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=Fal

In [8]:
from PIL import Image  # 导入PIL库中的Image模块，用于图像处理
import requests  # 导入requests库，用于发送HTTP请求
from transformers import Blip2Processor, Blip2ForConditionalGeneration  # 导入transformers库中的BLIP2相关模型
import torch  # 导入PyTorch库
import types  # 导入types模块，用于方法绑定

# Pick the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
image = Image.open('000000039769.jpg')  # local image file that will be captioned
print(f"Using device: {device}")

# =============================================================================
# Method 2: a safer, hook-based approach (recommended)
# =============================================================================
# Emit the three banner lines in a single call; the text written to stdout is
# the same as printing them one by one.
print(
    "\n" + "=" * 60,
    "Alternative Method: Using Forward Hooks",
    "=" * 60,
    sep="\n",
)

# Load the BLIP-2 processor, which prepares image and text inputs for the model.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
def reset_model_and_use_hooks():
    """Reload a fresh BLIP-2 model and trace Q-Former layer-0 inputs with forward hooks.

    Loads ``Salesforce/blip2-opt-2.7b`` in float16, moves it to the module-level
    ``device``, and registers forward hooks on the ``attention`` and
    ``crossattention`` sub-modules of ``qformer.encoder.layer[0]``. It then
    captions the module-level ``image`` using the module-level ``processor`` and
    prints the tensor shapes seen by the hooks plus the generated caption.

    The hooks are registered with ``with_kwargs=True`` so that tensors are
    reported whether the layer passes them positionally or by keyword; a plain
    forward hook only receives positional arguments.

    Any exception raised while preprocessing, generating or decoding is caught
    and printed; the hooks are removed in every case. Returns ``None``.
    """

    # Reload a clean copy of the model so no earlier patching or hooks are attached.
    model_clean = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-opt-2.7b",
        torch_dtype=torch.float16
    )
    model_clean.to(device)

    def _argument(args, kwargs, position, keyword):
        """Return the call argument found at ``position`` or, failing that, under ``keyword``."""
        if len(args) > position:
            return args[position]
        return kwargs.get(keyword)

    # Hook functions. The keyword names below are assumed to match the forward
    # signature of the Q-Former attention module -- TODO confirm against the
    # installed transformers version.
    def attention_hook(module, args, kwargs, output):
        hidden_states = _argument(args, kwargs, 0, "hidden_states")
        if hidden_states is not None:
            print(f"Q-former attention input shape: {hidden_states.shape}")

    def crossattention_hook(module, args, kwargs, output):
        hidden_states = _argument(args, kwargs, 0, "hidden_states")
        if hidden_states is not None:
            print(f"Q-former crossattention input shape: {hidden_states.shape}")
        # Positional slot 3 is where the original code looked for encoder_hidden_states.
        encoder_hidden_states = _argument(args, kwargs, 3, "encoder_hidden_states")
        if encoder_hidden_states is not None:
            print(f"Encoder hidden states shape: {encoder_hidden_states.shape}")

    # Register the forward hooks on the first Q-Former layer; with_kwargs=True
    # makes PyTorch pass keyword arguments to the hooks as well.
    first_layer = model_clean.qformer.encoder.layer[0]
    attention_handle = first_layer.attention.register_forward_hook(
        attention_hook, with_kwargs=True
    )
    crossattention_handle = first_layer.crossattention.register_forward_hook(
        crossattention_hook, with_kwargs=True
    )

    try:
        print("Using forward hooks method...")

        # Preprocess the image into float16 tensors on the target device.
        inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)

        # Generate caption token IDs without tracking gradients.
        with torch.no_grad():
            generated_ids = model_clean.generate(**inputs, max_length=50)

        # Decode the generated token IDs into text.
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        print(f"Generated caption (using hooks): {generated_text}")

    except Exception as e:
        # Deliberate best-effort: report the failure instead of aborting the cell.
        print(f"Hook method error: {str(e)}")

    finally:
        # Always detach the hooks, even when generation failed.
        attention_handle.remove()
        crossattention_handle.remove()
        print("Hooks removed successfully")

# Run the hook-based method.
# Failures that escape the function itself (for example while loading the model
# or registering the hooks) are reported here instead of stopping the notebook.
try:
    reset_model_and_use_hooks()
except Exception as e:
    print(f"Hook method failed: {str(e)}")



Using device: cpu

Alternative Method: Using Forward Hooks


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using forward hooks method...
Q-former attention input shape: torch.Size([1, 32, 768])
Q-former crossattention input shape: torch.Size([1, 32, 768])
Encoder hidden states shape: torch.Size([1, 257, 1408])
Generated caption (using hooks): two cats laying on a couch
Hooks removed successfully


In [None]:
# Blip2ForConditionalGeneration(
#   (vision_model): Blip2VisionModel(
#     (embeddings): Blip2VisionEmbeddings(
#       (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
#     )
#     (encoder): Blip2Encoder(
#       (layers): ModuleList(
#         (0-38): 39 x Blip2EncoderLayer(
#           (self_attn): Blip2Attention(
#             (dropout): Dropout(p=0.0, inplace=False)
#             (qkv): Linear(in_features=1408, out_features=4224, bias=True)
#             (projection): Linear(in_features=1408, out_features=1408, bias=True)
#           )
#           (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
#           (mlp): Blip2MLP(
#             (activation_fn): GELUActivation()
#             (fc1): Linear(in_features=1408, out_features=6144, bias=True)
#             (fc2): Linear(in_features=6144, out_features=1408, bias=True)
#           )
#           (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
#         )
#       )
#     )
#     (post_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
#   )
#   (qformer): Blip2QFormerModel(
#     (layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#     (dropout): Dropout(p=0.1, inplace=False)
#     (encoder): Blip2QFormerEncoder(
#       (layer): ModuleList(
#         (0): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (crossattention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=1408, out_features=768, bias=True)
#               (value): Linear(in_features=1408, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (1): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (2): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (crossattention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=1408, out_features=768, bias=True)
#               (value): Linear(in_features=1408, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (3): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (4): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (crossattention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=1408, out_features=768, bias=True)
#               (value): Linear(in_features=1408, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (5): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (6): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (crossattention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=1408, out_features=768, bias=True)
#               (value): Linear(in_features=1408, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (7): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (8): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (crossattention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=1408, out_features=768, bias=True)
#               (value): Linear(in_features=1408, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (9): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (10): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (crossattention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=1408, out_features=768, bias=True)
#               (value): Linear(in_features=1408, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#         (11): Blip2QFormerLayer(
#           (attention): Blip2QFormerAttention(
#             (attention): Blip2QFormerMultiHeadAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): Blip2QFormerSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate_query): Blip2QFormerIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output_query): Blip2QFormerOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#       )
#     )
#   )
#   (language_projection): Linear(in_features=768, out_features=2560, bias=True)
#   (language_model): OPTForCausalLM(
#     (model): OPTModel(
#       (decoder): OPTDecoder(
#         (embed_tokens): Embedding(50304, 2560, padding_idx=1)
#         (embed_positions): OPTLearnedPositionalEmbedding(2050, 2560)
#         (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
#         (layers): ModuleList(
#           (0-31): 32 x OPTDecoderLayer(
#             (self_attn): OPTSdpaAttention(
#               (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
#               (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
#               (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
#               (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
#             )
#             (activation_fn): ReLU()
#             (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
#             (fc1): Linear(in_features=2560, out_features=10240, bias=True)
#             (fc2): Linear(in_features=10240, out_features=2560, bias=True)
#             (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
#           )
#         )
#       )
#     )
#     (lm_head): Linear(in_features=2560, out_features=50304, bias=False)
#   )
# )