# BLIP模拟模型搭建理解

In [1]:
import torch  # 导入PyTorch库
import torch.nn as nn  # 导入PyTorch神经网络模块
from transformers import BertModel, ViTModel, BertTokenizer  # 从transformers库导入预训练模型和分词器

class BLIP_MED(nn.Module):
    """Toy BLIP-style multimodal mixture of encoder-decoder.

    Combines a pretrained ViT image encoder and a pretrained BERT text
    encoder with three heads: image-text contrastive (ITC), image-text
    matching (ITM) and language modelling (LM) on top of an
    image-grounded text decoder.
    """

    def __init__(self, config):
        """Create the encoders, the three heads and the text decoder.

        Args:
            config: Object exposing ``hidden_size`` and ``vocab_size``; it is
                also forwarded unchanged to ``ImageGroundedTextDecoder``.
        """
        super().__init__()
        hidden = config.hidden_size

        # Pretrained backbones, fetched by checkpoint name.
        self.image_encoder = ViTModel.from_pretrained("google/vit-base-patch16-224")
        self.text_encoder = BertModel.from_pretrained("bert-base-uncased")

        # ITC head: one projection (linear + layer norm) shared by both modalities.
        self.itc_head = nn.Sequential(nn.Linear(hidden, hidden), nn.LayerNorm(hidden))
        # ITM head: two-way classifier.
        self.itm_head = nn.Linear(hidden, 2)
        # LM head: maps decoder states to vocabulary logits.
        self.lm_head = nn.Linear(hidden, config.vocab_size)
        # Text decoder that cross-attends to the image features.
        self.text_decoder = ImageGroundedTextDecoder(config)

    def forward(self, image, input_ids, attention_mask, decoder_input_ids=None):
        """Run the encoders and every head.

        Args:
            image: Pixel tensor passed to the ViT encoder as ``pixel_values``.
            input_ids: Token ids for the text encoder.
            attention_mask: Attention mask for the text encoder.
            decoder_input_ids: Optional token ids for the text decoder; the LM
                branch is skipped when this is ``None``.

        Returns:
            dict with keys ``"itc_score"``, ``"itm_logits"`` and ``"lm_logits"``
            (the last one is ``None`` when no decoder input was given).
        """
        # Image tokens; the original author notes the layout as [B, num_patches+1, C].
        image_feat = self.image_encoder(pixel_values=image).last_hidden_state

        # Text tokens; position 0 is treated as the [CLS] summary.
        text_feat = self.text_encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        cls_feat = text_feat[:, 0]
        print(f'cls_feat: {cls_feat.shape}')

        # ITC: cosine similarity between the projected text and image [CLS] vectors.
        image_cls = image_feat[:, 0]
        text_proj = self.itc_head(cls_feat)
        image_proj = self.itc_head(image_cls)
        itc_score = torch.cosine_similarity(text_proj, image_proj)

        # ITM: NOTE(author) this should really consume cross-attended image-text
        # features; using the text-only [CLS] here is an acknowledged simplification.
        itm_logits = self.itm_head(cls_feat)

        # LM: decode only when decoder tokens are supplied.
        lm_logits = None
        if decoder_input_ids is not None:
            decoded = self.text_decoder(decoder_input_ids, image_feat)
            lm_logits = self.lm_head(decoded)

        return dict(itc_score=itc_score, itm_logits=itm_logits, lm_logits=lm_logits)

class ImageGroundedTextDecoder(nn.Module):
    """Stack of decoder blocks that turns token ids into image-conditioned states."""

    def __init__(self, config):
        """Build the decoder stack and its embeddings.

        Args:
            config: Object exposing ``num_hidden_layers``, ``vocab_size``,
                ``hidden_size`` and ``max_position_embeddings``; it is also
                forwarded to every ``TransformerDecoderBlock``.
        """
        super().__init__()
        blocks = [TransformerDecoderBlock(config) for _ in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(blocks)
        # Learned token and absolute-position embeddings.
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layernorm = nn.LayerNorm(config.hidden_size)

    def forward(self, decoder_input_ids, image_feats):
        """Embed the tokens and pass them through every decoder block.

        Args:
            decoder_input_ids: 2-D tensor of token ids (batch, sequence).
            image_feats: Image features handed to each block for cross-attention.

        Returns:
            The hidden states produced by the last block.
        """
        bsz, seq_len = decoder_input_ids.shape

        # Positions 0..seq_len-1, repeated for every row of the batch.
        positions = torch.arange(seq_len, dtype=torch.long, device=decoder_input_ids.device)
        positions = positions[None, :].expand(bsz, seq_len)

        # Sum of token and position embeddings, normalised before the stack.
        token_emb = self.embedding(decoder_input_ids)
        hidden = self.layernorm(token_emb + self.position_embedding(positions))

        for block in self.layers:
            hidden = block(hidden, image_feats)

        return hidden

class TransformerDecoderBlock(nn.Module):
    """Post-norm decoder block: causal self-attention, cross-attention, feed-forward."""

    def __init__(self, config):
        """Create the attention modules, the feed-forward net and the norms.

        Args:
            config: Object exposing ``hidden_size``, ``num_attention_heads``
                and ``intermediate_size``.
        """
        super().__init__()
        dim = config.hidden_size
        heads = config.num_attention_heads
        inner = config.intermediate_size

        # Both attention modules take batch-first (batch, seq, dim) tensors.
        self.self_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        # Position-wise feed-forward network with a GELU non-linearity.
        self.feed_forward = nn.Sequential(
            nn.Linear(dim, inner),
            nn.GELU(),
            nn.Linear(inner, dim),
        )
        # One layer norm after each of the three residual sub-layers.
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)

    def forward(self, x, visual_feats):
        """Apply the three residual sub-layers in order.

        Args:
            x: Text hidden states, indexed (batch, sequence, hidden).
            visual_feats: Image features used as keys and values in cross-attention.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Look-ahead mask: True strictly above the diagonal marks the future
        # positions a query is not allowed to attend to.
        steps = x.size(1)
        future = torch.ones(steps, steps, device=x.device).triu(diagonal=1).bool()

        # Causal self-attention over the text, then residual + norm.
        attended, _ = self.self_attn(x, x, x, attn_mask=future)
        x = self.norm1(x + attended)

        # Text queries attend to the visual features, then residual + norm.
        grounded, _ = self.cross_attn(query=x, key=visual_feats, value=visual_feats)
        x = self.norm2(x + grounded)

        # Feed-forward, residual + norm.
        return self.norm3(x + self.feed_forward(x))

# ---------- Inference example ----------
if __name__ == '__main__':
    from transformers import BertConfig  # default BERT hyper-parameters

    # Build the model from a default BertConfig and load the matching tokenizer.
    config = BertConfig()
    model = BLIP_MED(config)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Inputs: a random batch of two images plus two captions.
    dummy_image = torch.randn(2, 3, 224, 224)
    text = ["a girl holding a kitten", "a man riding a horse"]
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Decoder prompt tokens for the language-modelling branch.
    decoder_input = tokenizer(["a little","a small"], return_tensors="pt", padding=True)["input_ids"]
    print(decoder_input.shape)

    # Forward pass through every head.
    outputs = model(
        image=dummy_image,
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        decoder_input_ids=decoder_input,
    )

    # Only the shapes are reported.
    print("ITC score:", outputs['itc_score'].shape)
    print("ITM logits:", outputs['itm_logits'].shape)
    lm_logits = outputs['lm_logits']
    lm_shape = None if lm_logits is None else lm_logits.shape
    print("LM logits:", lm_shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

torch.Size([2, 4])
cls_feat: torch.Size([2, 768])
ITC score: torch.Size([2])
ITM logits: torch.Size([2, 2])
LM logits: torch.Size([2, 4, 30522])


# 理解Image Captioning看这个

In [None]:
# 版本
# torch                                    2.7.1
# transformers                             4.53.2

In [2]:
from PIL import Image
import torch
from transformers import (
    BlipProcessor, BlipForConditionalGeneration,
    BlipForQuestionAnswering, BlipForImageTextRetrieval
)
import requests
from io import BytesIO
import warnings

# Silence two known, unimportant warnings that mention BlipModel.
warnings.filterwarnings(action="ignore", message="Some weights of BlipModel were not initialized")
warnings.filterwarnings(action="ignore", message="`BlipModel` is going to be deprecated")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

def load_demo_image(local_path='demo.jpg',
                    img_url='https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg',
                    timeout=30):
    """Load the demo picture as an RGB PIL image.

    The local file is tried first; when it does not exist the image is
    downloaded from ``img_url`` instead.

    Args:
        local_path: Path of the local image to try first.
        img_url: URL used as a fallback when ``local_path`` is missing.
        timeout: Seconds to wait for the download before giving up, so a
            stalled connection cannot hang the demo indefinitely.

    Returns:
        The image converted to RGB.

    Raises:
        requests.RequestException: If the download times out, fails, or the
            server answers with an HTTP error status.
    """
    try:
        # The context manager closes the file handle; convert() returns an
        # independent in-memory copy, so the image stays usable afterwards.
        with Image.open(local_path) as local_file:
            image = local_file.convert('RGB')
        print(f"Loaded local image: {local_path}")
    except FileNotFoundError:
        response = requests.get(img_url, timeout=timeout)
        # Fail on 4xx/5xx here rather than letting PIL choke on an error page.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert('RGB')
        print("Loaded image from URL")

    print(f"Image size: {image.size}")
    return image

# =============================================================================
# 1. Image Captioning (图像描述生成)
# =============================================================================
def image_captioning_demo():
    """Caption the demo image with BLIP using two decoding strategies.

    Prints the model structure, its trainable parameter count, and one
    caption from beam search and one from nucleus (top-p) sampling.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("1. IMAGE CAPTIONING")
    print(banner)

    image = load_demo_image()

    # Processor and model come from the same checkpoint.
    checkpoint = "Salesforce/blip-image-captioning-base"
    processor = BlipProcessor.from_pretrained(checkpoint)
    model = BlipForConditionalGeneration.from_pretrained(checkpoint)

    # Dump the module tree before moving the model to the target device.
    separator = '-' * 100
    print(separator)
    print(model)
    print(separator)
    model = model.to(device)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Model parameters: {trainable/1e6:.2f}M")

    # Image-only input: captioning is unconditional here.
    inputs = processor(image, return_tensors="pt").to(device)

    # (heading, generate() keyword arguments) for each decoding strategy.
    decoding_runs = (
        ("\nBeam Search Caption:",
         dict(max_length=50, num_beams=3, early_stopping=True)),
        ("\nNucleus Sampling Caption:",
         dict(max_length=50, do_sample=True, top_p=0.9, temperature=0.7)),
    )

    with torch.no_grad():
        for heading, generate_kwargs in decoding_runs:
            print(heading)
            token_ids = model.generate(**inputs, **generate_kwargs)
            caption = processor.decode(token_ids[0], skip_special_tokens=True)
            print(f"Caption: {caption}")

# =============================================================================
# 2. Visual Question Answering (视觉问答)
# =============================================================================
def visual_question_answering_demo():
    """Ask the BLIP VQA model a fixed list of questions about the demo image.

    Prints the trainable parameter count followed by one ``Q:``/``A:`` pair
    per question.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("2. VISUAL QUESTION ANSWERING")
    print(banner)

    image = load_demo_image()

    # Processor and model come from the same VQA checkpoint.
    checkpoint = "Salesforce/blip-vqa-base"
    processor = BlipProcessor.from_pretrained(checkpoint)
    model = BlipForQuestionAnswering.from_pretrained(checkpoint)
    model = model.to(device)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Model parameters: {trainable/1e6:.2f}M")

    questions = (
        "where is the woman sitting?",
        "what is the woman doing?",
        "what animal is in the image?",
        "what is the weather like?",
        "what color is the dog?",
    )

    for question in questions:
        # Image and question are encoded together.
        encoded = processor(image, question, return_tensors="pt").to(device)

        # Generate the answer tokens without tracking gradients.
        with torch.no_grad():
            generated = model.generate(**encoded, max_length=50)

        answer = processor.decode(generated[0], skip_special_tokens=True)
        print(f"Q: {question}")
        print(f"A: {answer}\n")

# =============================================================================
# 3. Feature Extraction (特征提取)
# =============================================================================
def _blip_retrieval_image_features(model, pixel_values):
    """Project the vision tower's first ([CLS]) token with ``model.vision_proj``."""
    vision_hidden = model.vision_model(pixel_values=pixel_values)[0]
    return model.vision_proj(vision_hidden[:, 0, :])


def _blip_retrieval_text_features(model, input_ids, attention_mask):
    """Project the text encoder's first ([CLS]) token with ``model.text_proj``."""
    text_hidden = model.text_encoder(input_ids=input_ids, attention_mask=attention_mask)[0]
    return model.text_proj(text_hidden[:, 0, :])


def feature_extraction_demo():
    """Extract projected image and text features with the BLIP retrieval model.

    Prints feature shapes, the image-text cosine similarity and the feature
    norms, first from one joint processor call and then from separate
    image-only and text-only calls.

    The features are computed from the model's ``vision_model`` /
    ``text_encoder`` towers and their ``vision_proj`` / ``text_proj`` layers.
    As far as the Transformers BLIP documentation shows (checked against the
    4.53 API), ``BlipForImageTextRetrieval`` does not populate
    ``image_embeds``, has no ``text_embeds`` output field and defines no
    ``get_image_features`` / ``get_text_features`` methods, so the earlier
    approach failed at runtime.
    """
    print("\n" + "="*50)
    print("3. FEATURE EXTRACTION")
    print("="*50)

    image = load_demo_image()

    # Retrieval checkpoint: provides both the ITM head and the projection layers.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
    model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
    model = model.to(device)

    print(f"Model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6:.2f}M")

    text = "a woman sitting on the beach with a dog"

    # Joint encoding of the image and the caption.
    inputs = processor(images=image, text=text, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        # Projected image feature, one row per image.
        image_embeds = _blip_retrieval_image_features(model, inputs["pixel_values"])
        print(f"Image embeddings shape: {image_embeds.shape}")

        # Projected text feature, one row per caption.
        text_embeds = _blip_retrieval_text_features(
            model, inputs["input_ids"], inputs["attention_mask"]
        )
        print(f"Text embeddings shape: {text_embeds.shape}")

        # Cosine similarity between the paired image and caption.
        similarity = torch.cosine_similarity(image_embeds, text_embeds, dim=1)
        print(f"Image-Text cosine similarity: {similarity.item():.4f}")

        # Same features again, this time encoding each modality on its own.
        print("\n--- Separate Feature Extraction ---")

        # Image only.
        image_inputs = processor(images=image, return_tensors="pt").to(device)
        image_outputs = _blip_retrieval_image_features(model, image_inputs["pixel_values"])
        print(f"Pure image features shape: {image_outputs.shape}")

        # Text only.
        text_inputs = processor(text=text, return_tensors="pt", padding=True).to(device)
        text_outputs = _blip_retrieval_text_features(
            model, text_inputs["input_ids"], text_inputs["attention_mask"]
        )
        print(f"Pure text features shape: {text_outputs.shape}")

        # L2-normalise both sides so the dot product is the cosine similarity.
        image_features_norm = image_outputs / image_outputs.norm(dim=1, keepdim=True)
        text_features_norm = text_outputs / text_outputs.norm(dim=1, keepdim=True)
        similarity_score = torch.matmul(image_features_norm, text_features_norm.T).squeeze()
        print(f"Normalized similarity score: {similarity_score.item():.4f}")

        print(f"Feature dimension: {image_outputs.shape[1]}")
        print(f"Image feature norm: {image_outputs.norm(dim=1).item():.4f}")
        print(f"Text feature norm: {text_outputs.norm(dim=1).item():.4f}")

# =============================================================================
# 4. Image-Text Matching (图像-文本匹配)
# =============================================================================
def image_text_matching_demo():
    """Score several captions against the demo image with the BLIP ITM model.

    For every caption two scores are printed:

    * the ITM match probability, i.e. the softmax over the two ITM logits
      (index 1 is the "match" class), and
    * the cosine similarity of the projected image and text features.

    Both values are read from ``itm_score``. Per the Transformers BLIP
    documentation, the default forward pass (``use_itm_head=True``) puts the
    two-way ITM logits there, and a pass with ``use_itm_head=False`` puts the
    cosine similarity of the normalised projected features there. The model
    output has no ``text_embeds`` field and leaves ``image_embeds`` unset, so
    the similarity cannot be rebuilt from the output object.
    """
    print("\n" + "="*50)
    print("4. IMAGE-TEXT MATCHING")
    print("="*50)

    image = load_demo_image()

    # Dedicated image-text matching / retrieval checkpoint.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
    model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
    model = model.to(device)

    # Captions ranging from matching to unrelated.
    texts = [
        "a woman sitting on the beach with a dog",  # matching
        "a man playing football in the park",       # not matching
        "a person relaxing by the water",           # partially matching
        "a dog running on the beach",               # partially matching
        "a beautiful sunset over the ocean"         # not matching
    ]

    print("Computing similarity scores for different texts:")
    print("-" * 60)

    for text in texts:
        inputs = processor(images=image, text=text, return_tensors="pt", padding=True).to(device)

        with torch.no_grad():
            # Method 1: ITM head -> logits over (no match, match).
            itm_logits = model(**inputs).itm_score
            itm_score = torch.nn.functional.softmax(itm_logits, dim=1)
            match_probability = itm_score[0, 1].item()  # index 1 means "match"

            # Method 2: contrastive branch -> cosine similarity. With one
            # image and one caption the result holds a single value.
            cosine_similarity = model(**inputs, use_itm_head=False).itm_score.squeeze().item()

        print(f"Text: '{text}'")
        print(f"ITM Match Probability: {match_probability:.4f}")
        print(f"Cosine Similarity: {cosine_similarity:.4f}")
        print("-" * 60)

# =============================================================================
# 5. 综合演示函数
# =============================================================================
def run_all_demos():
    """Run the enabled demos one after another.

    Any exception raised by a demo is caught here so the notebook cell does
    not abort. The exception type, message and full traceback are printed so
    the real cause (network failure, out-of-memory, API mismatch, ...) stays
    visible, followed by the dependency-installation hint.
    """
    import traceback  # local import: only needed on the failure path

    print("BLIP Model Demonstrations using Transformers Library")
    print("=" * 60)

    try:
        # 1. Image captioning
        image_captioning_demo()

        # 2. Visual question answering
        visual_question_answering_demo()

        # 3. Feature extraction (disabled; uncomment to run)
        # feature_extraction_demo()

        # 4. Image-text matching (disabled; uncomment to run)
        # image_text_matching_demo()

        print("\n" + "="*60)
        print("All demonstrations completed successfully!")
        print("="*60)

    except Exception as e:
        # Include the exception type: str(e) alone can be empty or ambiguous.
        print(f"Error occurred: {type(e).__name__}: {e}")
        traceback.print_exc()
        print("Please make sure you have the required dependencies installed:")
        print("pip install transformers torch torchvision pillow requests")

# =============================================================================
# 主函数
# =============================================================================
if __name__ == "__main__":
    # Report the environment, then run the demos. A missing package surfaces
    # as ImportError and is turned into an installation hint.
    try:
        import transformers

        environment_report = [
            f"Transformers version: {transformers.__version__}",
            f"PyTorch version: {torch.__version__}",
            f"Device: {device}",
            "-" * 60,
        ]
        print("\n".join(environment_report))

        # Run the demos.
        run_all_demos()

    except ImportError as e:
        print(f"Missing dependency: {e}")
        print("Please install required packages:")
        print("pip install transformers torch torchvision pillow requests")



Using device: cuda
Transformers version: 4.53.2
PyTorch version: 2.6.0+cu124
Device: cuda
------------------------------------------------------------
BLIP Model Demonstrations using Transformers Library

1. IMAGE CAPTIONING
Loaded image from URL
Image size: (2048, 1365)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

----------------------------------------------------------------------------------------------------
BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Model parameters: 384.67M
Q: where is the woman sitting?
A: on beach

Q: what is the woman doing?
A: petting dog

Q: what animal is in the image?
A: dog

Q: what is the weather like?
A: sunny

Q: what color is the dog?
A: tan


All demonstrations completed successfully!


# 学习 Salesforce/blip-image-captioning-base 模型结构
vision_model是ViT

In [None]:
BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (text_decoder): BlipTextLMHeadModel(
    (bert): BlipTextModel(
      (embeddings): BlipTextEmbeddings(
        (word_embeddings): Embedding(30524, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (encoder): BlipTextEncoder(
        (layer): ModuleList(
          (0-11): 12 x BlipTextLayer(
            (attention): BlipTextAttention(
              (self): BlipTextSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
              (output): BlipTextSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (crossattention): BlipTextAttention(
              (self): BlipTextSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
              (output): BlipTextSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (intermediate): BlipTextIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BlipTextOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
        )
      )
    )
    (cls): BlipTextOnlyMLMHead(
      (predictions): BlipTextLMPredictionHead(
        (transform): BlipTextPredictionHeadTransform(
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (transform_act_fn): GELUActivation()
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
        (decoder): Linear(in_features=768, out_features=30524, bias=True)
      )
    )
  )
)