In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!nvcc --version
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install ftfy regex tqdm
!pip install yacs
!pip install torch transformers pytorch-lightning

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Looking in indexes: https://download.pytorch.org/whl/cu121
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl (780.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.5/780.5 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-

In [6]:
# Cell 1: Install dependencies
!nvcc --version
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install ftfy regex tqdm
!pip install yacs
!pip install torch transformers pytorch-lightning
!pip install --upgrade pytorch-lightning optuna optuna-integration

# Cell 2: Import libraries
import torch
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Device Count:", torch.cuda.device_count())
print("Current Device:", torch.cuda.current_device())
print("Device Name:", torch.cuda.get_device_name(torch.cuda.current_device()))

import os
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
from transformers import Blip2Processor, Blip2ForConditionalGeneration, Blip2VisionConfig
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import pandas as pd
from yacs.config import CfgNode
import numpy as np

if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
else:
    torch.set_default_tensor_type(torch.FloatTensor)

# Cell 3: Custom Dataset
class Custom_Dataset(Dataset):
    def __init__(self, cfg, root_folder, dataset, label, split='train', image_size=224, fast=True):
        super(Custom_Dataset, self).__init__()
        self.cfg = cfg
        self.root_folder = root_folder
        self.dataset = dataset
        self.split = split
        self.label = label
        self.image_size = image_size
        self.fast = fast
        self.info_file = cfg.info_file
        self.df = pd.read_csv(self.info_file)
        self.df = self.df[self.df['split'] == self.split].reset_index(drop=True)
        if self.label == 'target':
            self.df = self.df[self.df['hate'] == 1].reset_index(drop=True)
        float_cols = self.df.select_dtypes(float).columns
        self.df[float_cols] = self.df[float_cols].fillna(-1).astype('Int64')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        if row['text'] == 'None':
            text = 'null'
        else:
            text = row['text']
        image_fn = row['name']
        try:
            image = Image.open(f"{self.cfg.img_folder}/{image_fn}").convert('RGB')
            image = image.resize((self.image_size, self.image_size))
        except Exception as e:
            raise ValueError(f"Error loading image {image_fn}: {e}")
        item = {
            'image': image,
            'text': text,
            'label': row[self.label],
            'idx_meme': row['name'],
        }
        return item

# Cell 4: BLIP-2 Collator
class MemeBLIP_Collator:
    def __init__(self, cfg):
        self.cfg = cfg
        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        self.blip_model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
        ).to(self.cfg.device)
        self.blip_model.eval()

    def split_text_into_chunks(self, text, max_length):
        tokens = self.processor.tokenizer(
            text, return_tensors="pt", truncation=False, add_special_tokens=False
        )["input_ids"].squeeze(0).tolist()
        return [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]

    def __call__(self, batch):
        image_features_list = []
        text_features_list = []
        labels = torch.LongTensor([item['label'] for item in batch]).to(self.cfg.device)
        idx_memes = [item['idx_meme'] for item in batch]
        batch_new = {'labels': labels, 'idx_memes': idx_memes}

        for item in batch:
            inputs = self.processor(
                images=item['image'],
                text=item['text'] if item['text'] != 'null' else "",
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(self.cfg.device, torch.float16)

            with torch.no_grad():
                image_features = self.blip_model.get_image_features(pixel_values=inputs['pixel_values'])
                text_outputs = self.blip_model.language_model(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    output_hidden_states=True
                )
                text_features = text_outputs.hidden_states[-1].mean(dim=1)

            image_features_list.append(image_features.cpu().detach())
            text_features_list.append(text_features.cpu().detach())

        batch_new['image_features'] = torch.cat(image_features_list, dim=0).to(self.cfg.device)
        batch_new['text_features'] = torch.cat(text_features_list, dim=0).to(self.cfg.device)
        return batch_new

# Cell 5: Data Loading Functions
def load_dataset(cfg, split):
    dataset = Custom_Dataset(
        cfg=cfg,
        root_folder=cfg.root_dir,
        dataset=cfg.dataset_name,
        split=split,
        image_size=cfg.image_size,
        label=cfg.label,
        fast=cfg.fast_process
    )
    return dataset

def create_dataloader(cfg, split="train"):
    dataset = load_dataset(cfg, split)
    collator = MemeBLIP_Collator(cfg)
    generator = torch.Generator(device="cuda") if torch.cuda.is_available() else torch.Generator()
    dataloader = DataLoader(
        dataset,
        batch_size=cfg.batch_size,
        shuffle=(split == "train"),
        generator=generator,
        collate_fn=collator
    )
    return dataloader

# Cell 6: Configuration
cfg = CfgNode()
cfg.root_dir = '/content/drive/MyDrive/CLIPMM'
cfg.img_folder = '/content/drive/MyDrive/CLIPMM/PrideMM/Images'
cfg.info_file = '/content/drive/MyDrive/CLIPMM/PrideMM/PrideMM.csv'
cfg.checkpoint_path = os.path.join(cfg.root_dir, 'checkpoints')
cfg.checkpoint_file = os.path.join(cfg.checkpoint_path, 'model.ckpt')
cfg.clip_variant = "ViT-L/14"
cfg.dataset_name = 'Pride'
cfg.name = 'MemeBLIP'
cfg.label = 'hate'
cfg.seed = 42
cfg.test_only = False
cfg.device = 'cuda'
cfg.gpus = [0]
if cfg.label == 'hate':
    cfg.class_names = ['Benign Meme', 'Harmful Meme']
elif cfg.label == 'humour':
    cfg.class_names = ['No Humour', 'Humour']
elif cfg.label == 'target':
    cfg.class_names = ['No particular target', 'Individual', 'Community', 'Organization']
elif cfg.label == 'stance':
    cfg.class_names = ['Neutral', 'Support', 'Oppose']
cfg.batch_size = 64
cfg.image_size = 224
cfg.num_mapping_layers = 1
cfg.unmapped_dim = 768
cfg.map_dim = 1024
cfg.num_pre_output_layers = 2
cfg.drop_probs = [0.4, 0.2, 0.3]
cfg.dropout_rate = 0.5
cfg.hidden_dim = 1024
cfg.lr = 5e-5
cfg.max_epochs = 50
cfg.weight_decay = 1e-4
cfg.num_classes = len(cfg.class_names)
cfg.scale = 30
cfg.print_model = True
cfg.fast_process = True
cfg.reproduce = False
cfg.ratio = 0.7
cfg.num_layers = 3
cfg.activation = 'ReLU'
cfg.hidden_dim1 = 1024
print(cfg)

# Cell 7: Cached Datasets
class CachedDataset(Dataset):
    def __init__(self, path='/content/drive/MyDrive/CLIPMM/cached_features/train.pt'):
        data = torch.load(path)
        self.image_features = data['image_features']
        self.text_features = data['text_features']
        self.labels = data['labels']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'image_features': self.image_features[idx],
            'text_features': self.text_features[idx],
            'labels': self.labels[idx]
        }

class CachedDataset2(Dataset):
    def __init__(self, path='/content/drive/MyDrive/CLIPMM/cached_features/val.pt'):
        data = torch.load(path)
        self.image_features = data['image_features']
        self.text_features = data['text_features']
        self.labels = data['labels']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'image_features': self.image_features[idx],
            'text_features': self.text_features[idx],
            'labels': self.labels[idx]
        }

train_loader = DataLoader(CachedDataset(), batch_size=cfg.batch_size, shuffle=True, generator=torch.Generator(device='cuda'), num_workers=0, pin_memory=False)
val_loader = DataLoader(CachedDataset2(), batch_size=cfg.batch_size, shuffle=False, generator=torch.Generator(device='cuda'), num_workers=0, pin_memory=False)

# Cell 8: Linear Projection
class LinearProjection(nn.Module):
    def __init__(self, input_dim, output_dim, num_layers, drop_probs):
        super(LinearProjection, self).__init__()
        if isinstance(drop_probs, list):
            dropout_prob = drop_probs[0]
        else:
            dropout_prob = drop_probs
        self.input_projection = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.LayerNorm(output_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_prob)
        )
        self.layers = nn.ModuleList()
        for _ in range(num_layers - 1):
            layer = nn.Sequential(
                nn.Linear(output_dim, output_dim),
                nn.LayerNorm(output_dim),
                nn.ReLU(),
                nn.Dropout(p=dropout_prob)
            )
            self.layers.append(layer)

    def forward(self, x):
        x = self.input_projection(x)
        for layer in self.layers:
            residual = x
            x = layer(x)
            x = x + residual
        return x

# Cell 9: Adapter
class Adapter(nn.Module):
    def __init__(self, c_in, reduction=1.5, dropout_rate=0.1):
        super(Adapter, self).__init__()
        reduced_dim = max(16, int(c_in // reduction))
        self.norm1 = nn.LayerNorm(c_in)
        self.fc = nn.Sequential(
            nn.Linear(c_in, reduced_dim, bias=False),
            nn.GELU(),
            nn.Linear(reduced_dim, c_in, bias=False)
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.norm2 = nn.LayerNorm(c_in)
        self.scale = nn.Parameter(torch.tensor(0.1))
        self.apply(self.init_weights)

    def forward(self, x):
        residual = x
        x = self.fc(self.norm1(x))
        x = self.dropout(x)
        x = residual + self.scale * x
        return self.norm2(x)

    @staticmethod
    def init_weights(m):
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight, nonlinearity='linear')

# Cell 10: Cosine Classifier
class CosineClassifierWithBias(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(output_dim, input_dim))
        self.bias = nn.Parameter(torch.zeros(output_dim))

    def forward(self, x):
        x_norm = F.normalize(x, dim=1)
        w_norm = F.normalize(self.weight, dim=1)
        cosine_sim = torch.matmul(x_norm, w_norm.T)
        return cosine_sim + self.bias

    def apply_weight(self, weight):
        with torch.no_grad():
            self.weight.copy_(weight)

# Cell 11: Main Model
from torch.optim.lr_scheduler import SequentialLR, LinearLR, CosineAnnealingLR

class MemeBLIP(pl.LightningModule):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.image_projection = LinearProjection(
            input_dim=1408,  # BLIP-2 vision output dim
            output_dim=cfg.map_dim,
            num_layers=1,
            drop_probs=cfg.drop_probs
        ).to(self.cfg.device)
        self.text_projection = LinearProjection(
          input_dim=768,  # BLIP-2 Q-Former output dim is 768 (not 2560)
          output_dim=cfg.map_dim,
          num_layers=1,
          drop_probs=cfg.drop_probs
      ).to(self.cfg.device)

        self.image_adapter = Adapter(cfg.map_dim, reduction=2).to(self.cfg.device)
        self.text_adapter = Adapter(cfg.map_dim, reduction=2).to(self.cfg.device)
        self.pre_output_layer = nn.Sequential(
            nn.Linear(cfg.map_dim, cfg.hidden_dim),
            nn.LayerNorm(cfg.hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.5)
        )
        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
        ).to(cfg.device)
        self.map_dim = cfg.map_dim
        self.classifier = nn.Sequential(
            nn.Linear(cfg.hidden_dim, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(p=0.5),
            CosineClassifierWithBias(512, cfg.num_classes)
        )
        self.init_head_text_feat()
        self.cross_entropy_loss = nn.CrossEntropyLoss()
        self.acc = torchmetrics.Accuracy(task="multiclass", num_classes=cfg.num_classes)
        self.auroc = torchmetrics.AUROC(task="multiclass", num_classes=cfg.num_classes)
        self.f1 = torchmetrics.F1Score(task="multiclass", num_classes=cfg.num_classes)
        self.gradients = {}

    def save_gradient(self, name):
        def hook(module, grad_input, grad_output):
            self.gradients[name] = grad_output[0].detach()
        return hook

    def print_gradients(self, modules_to_check, batch_idx):
        for mod_name, module in modules_to_check.items():
            for name, param in module.named_parameters():
                if param.grad is not None:
                    grad_mean = param.grad.abs().mean().item()
                    grad_std = param.grad.std().item()
                    print(f"[Batch {batch_idx}] {mod_name} 梯度 {name}: mean_abs={grad_mean:.8f}, std={grad_std:.8f}")
                else:
                    print(f"[Batch {batch_idx}] {mod_name} 梯度 {name}: 无梯度")

    def register_hooks(self):
        self.image_projection.register_backward_hook(self.save_gradient("image_projection"))
        self.text_projection.register_backward_hook(self.save_gradient("text_projection"))
        self.image_adapter.fc.register_backward_hook(self.save_gradient("image_adapter"))
        self.text_adapter.fc.register_backward_hook(self.save_gradient("text_adapter"))
        self.pre_output_layer.register_backward_hook(self.save_gradient("pre_output_layer"))
        for i, layer in enumerate(self.classifier):
            if isinstance(layer, nn.Linear):
                layer.register_backward_hook(self.save_gradient(f"classifier_{i}"))

    def init_head_text_feat(self):
      print("Initialize head with text features")
      template = "a photo of a {}."
      prompts_list = [template.format(c.replace("_", " ")) for c in self.cfg.class_names]
      tokenized_prompts = self.processor.tokenizer(
          prompts_list, return_tensors="pt", padding=True, truncation=True
      ).to(self.cfg.device)
      prompts = {k: v for k, v in tokenized_prompts.items() if k in ["input_ids", "attention_mask"]}

      # Instantiate a BLIP-2 model that supports get_text_features
      from transformers import Blip2Model
      text_model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
      text_model.to(self.cfg.device)

      with torch.no_grad():
          # Force a tuple output by setting return_dict=False
          text_features = text_model.get_text_features(**prompts, return_dict=False)

      # If text_features is a tuple, take the first element
      if isinstance(text_features, tuple):
          text_features = text_features[0]

      # Average over the sequence dimension to get a single vector per prompt
      text_embeds = text_features.mean(dim=1)
      text_embeds = F.normalize(text_embeds, dim=-1)

      target_dim = 512
      if hasattr(self.classifier[-1], "apply_weight"):
          if text_embeds.size(1) != target_dim:
              projection = nn.Linear(text_embeds.size(1), 512).to(self.cfg.device, torch.float16)

              text_embeds_proj = projection(text_embeds)
              self.classifier[-1].apply_weight(text_embeds_proj)
      else:
          print("Warning: Classifier -1 does not have 'apply_weight' method. Skipping initialization.")


    def forward(self, batch):
        image_features = batch['image_features']
        text_features = batch['text_features']
        if isinstance(image_features, tuple):
            image_features = image_features[0].to(self.cfg.device)
        if isinstance(text_features, tuple):
            text_features = text_features[0].to(self.cfg.device)
        image_proj = self.image_projection(image_features).to(self.cfg.device)
        text_proj = self.text_projection(text_features).to(self.cfg.device)
        adapted_image = self.image_adapter(image_proj).to(self.cfg.device)
        adapted_text = self.text_adapter(text_proj).to(self.cfg.device)
        text_adapted_features = self.cfg.ratio * adapted_text + (1 - self.cfg.ratio) * text_proj
        image_adapted_features = self.cfg.ratio * adapted_image + (1 - self.cfg.ratio) * image_proj
        image_adapted_features = image_adapted_features / image_adapted_features.norm(dim=-1, keepdim=True)
        text_adapted_features = text_adapted_features / text_adapted_features.norm(dim=-1, keepdim=True)
        combined_features = torch.mul(image_adapted_features, text_adapted_features).to(self.cfg.device)
        pre_output_features = self.pre_output_layer(combined_features).to(self.cfg.device)
        logits = self.classifier(pre_output_features).squeeze(dim=1).to(self.cfg.device)
        return logits

    def common_step(self, batch):
        logits = self.forward(batch)
        preds_proxy = torch.sigmoid(logits)
        _, preds = logits.data.max(1)
        loss = self.cross_entropy_loss(logits, batch["labels"])
        acc = self.acc(preds, batch["labels"])
        auroc = self.auroc(preds_proxy, batch['labels'])
        f1 = self.f1(preds, batch["labels"])
        return {"loss": loss, "acc": acc, "auroc": auroc, "f1": f1}

    def training_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = self.cross_entropy_loss(logits, batch["labels"])
        preds_proxy = torch.sigmoid(logits)
        _, preds = logits.data.max(1)
        acc = self.acc(preds, batch["labels"])
        auroc = self.auroc(preds_proxy, batch['labels'])
        f1 = self.f1(preds, batch["labels"])
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_acc", acc, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_auroc", auroc, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_f1", f1, on_step=True, on_epoch=True, prog_bar=True)
        if batch_idx > 0 and batch_idx % 20 == 0:
            modules_to_check = {
                "image_projection": self.image_projection,
                "text_projection": self.text_projection,
                "image_adapter": self.image_adapter,
                "text_adapter": self.text_adapter,
                "pre_output_layer": self.pre_output_layer,
                "classifier": self.classifier,
            }
            self.print_gradients(modules_to_check, batch_idx)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW([
            {"params": self.model.parameters(), "weight_decay": 1e-5},
            {"params": list(self.image_projection.parameters()) + list(self.text_projection.parameters()), "weight_decay": 5e-4},
            {"params": list(self.image_adapter.parameters()) + list(self.text_adapter.parameters()), "weight_decay": 1e-3},
            {"params": self.pre_output_layer.parameters(), "weight_decay": 5e-4},
            {"params": self.classifier.parameters(), "weight_decay": 1e-3},
        ], lr=self.cfg.lr)
        warmup_epochs = 3
        total_epochs = self.cfg.max_epochs
        cosine_epochs = total_epochs - warmup_epochs
        scheduler_warmup = LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=warmup_epochs)
        scheduler_cosine = CosineAnnealingLR(optimizer, T_max=cosine_epochs, eta_min=1e-6)
        scheduler = SequentialLR(optimizer, schedulers=[scheduler_warmup, scheduler_cosine], milestones=[warmup_epochs])
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "epoch"}}

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = self.cross_entropy_loss(logits, batch["labels"])
        preds_proxy = torch.sigmoid(logits)
        _, preds = logits.data.max(1)
        acc = self.acc(preds, batch["labels"])
        auroc = self.auroc(torch.softmax(logits, dim=-1), batch["labels"])
        f1 = self.f1(preds, batch["labels"])
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)
        self.log("val_auroc", auroc, prog_bar=True)
        self.log("val_f1", f1, prog_bar=True)
        return {"loss": loss, "acc": acc, "auroc": auroc, "f1": f1}

    def on_train_epoch_end(self):
        torch.cuda.empty_cache()

# Cell 12: Training Setup
def initialize_weights(module):
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    dirpath=cfg.checkpoint_path,
    filename="memeBLIP-{epoch:02d}-{val_loss:.2f}",
    save_top_k=1,
    monitor="val_loss",
    mode="min"
)

early_stop_callback = EarlyStopping(
    monitor="val_loss",
    patience=10,
    verbose=True,
    mode="min"
)

model = MemeBLIP(cfg)
model.register_hooks()
model.apply(initialize_weights)
model.to(cfg.device)

trainer = pl.Trainer(
    max_epochs=cfg.max_epochs,
    accelerator="gpu",
    precision=16,
    gradient_clip_val=1.0,
    gradient_clip_algorithm='norm',
    devices=len(cfg.gpus),
    logger=pl.loggers.TensorBoardLogger("logs/"),
    callbacks=[early_stop_callback, checkpoint_callback]
)

# Cell 13: Training and Validation
trainer.fit(model, train_loader, val_loader)
validation_metrics = trainer.validate(model, val_loader, verbose=True)
print("Validation Metrics:", validation_metrics)
print("Validation Accuracy:", trainer.callback_metrics["val_acc"])
print("Validation AUROC:", trainer.callback_metrics["val_auroc"])
print("Validation F1 Score:", trainer.callback_metrics["val_f1"])

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Looking in indexes: https://download.pytorch.org/whl/cu121
CUDA Available: True
CUDA Device Count: 1
Current Device: 0
Device Name: NVIDIA A100-SXM4-40GB
activation: ReLU
batch_size: 64
checkpoint_file: /content/drive/MyDrive/CLIPMM/checkpoints/model.ckpt
checkpoint_path: /content/drive/MyDrive/CLIPMM/checkpoints
class_names: ['Benign Meme', 'Harmful Meme']
clip_variant: ViT-L/14
dataset_name: Pride
device: cuda
drop_probs: [0.4, 0.2, 0.3]
dropout_rate: 0.5
fast_process: True
gpus: [0]
hidden_dim: 1024
hidden_dim1: 1024
image_size: 224
img_folder: /content/drive/MyDrive/CLIPMM/PrideMM/Images
info_file: /content/drive/MyDrive/CLIPMM/PrideMM/PrideMM.csv
label: hate
lr: 5e-05
map_dim: 1024
max_epochs: 50
name: MemeBLIP
num_classes: 2
num_layers: 3
num_mapping_layers: 1
num_pre_output_

  data = torch.load(path)
  data = torch.load(path)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Initialize head with text features


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
   | Name               | Type                          | Params | Mode 
------------------------------------------------------------------------------
0  | image_projection   | LinearProjection              | 1.4 M  | train
1  | text_projection    | LinearProjection              | 789 K  | train
2  | image_adapter      | Adapter                       | 1.1 M  | train
3  | text_adapter       | Adapter                       | 1.1 M  | train
4  | pre_output_layer   | Sequential                    | 1.1 M  | train
5  | model          

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

  self._maybe_warn_non_full_backward_hook(args, result, grad_fn)


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00000868, std=0.00001699
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00000909, std=0.00001460
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00000942, std=0.00001669
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00001153, std=0.00001853
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001078, std=0.00001880
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00002936, std=0.00004637
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00000927, std=0.00001662
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00001065, std=0.00001686
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00014761, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000071, std=0.00000100
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000083, std=0.00000104
[Batch 20] image_adapter 梯度 fc.0.weight: m

  grad_std = param.grad.std().item()


[Batch 40] image_projection 梯度 input_projection.0.weight: mean_abs=0.00000911, std=0.00001794
[Batch 40] image_projection 梯度 input_projection.0.bias: mean_abs=0.00000935, std=0.00001510
[Batch 40] image_projection 梯度 input_projection.1.weight: mean_abs=0.00000934, std=0.00001725
[Batch 40] image_projection 梯度 input_projection.1.bias: mean_abs=0.00001179, std=0.00001908
[Batch 40] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001115, std=0.00002004
[Batch 40] text_projection 梯度 input_projection.0.bias: mean_abs=0.00002974, std=0.00004920
[Batch 40] text_projection 梯度 input_projection.1.weight: mean_abs=0.00000896, std=0.00001711
[Batch 40] text_projection 梯度 input_projection.1.bias: mean_abs=0.00001095, std=0.00001818
[Batch 40] image_adapter 梯度 scale: mean_abs=0.00006487, std=nan
[Batch 40] image_adapter 梯度 norm1.weight: mean_abs=0.00000076, std=0.00000108
[Batch 40] image_adapter 梯度 norm1.bias: mean_abs=0.00000092, std=0.00000114
[Batch 40] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved. New best score: 0.693


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00000861, std=0.00001730
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00000880, std=0.00001482
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00000947, std=0.00001731
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00001112, std=0.00001874
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001119, std=0.00001973
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00003052, std=0.00004885
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00000910, std=0.00001705
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00001121, std=0.00001804
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00027748, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000070, std=0.00000101
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000079, std=0.00000100
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.002 >= min_delta = 0.0. New best score: 0.690


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00001061, std=0.00002223
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00001092, std=0.00001951
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00001171, std=0.00002549
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00001379, std=0.00002466
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001318, std=0.00002449
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00003598, std=0.00006186
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001112, std=0.00002497
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00001322, std=0.00002279
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00039411, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000090, std=0.00000131
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000103, std=0.00000128
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.016 >= min_delta = 0.0. New best score: 0.674


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00001695, std=0.00004162
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00001660, std=0.00003122
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00001887, std=0.00004109
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00002099, std=0.00003958
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001994, std=0.00003606
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005367, std=0.00008760
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001785, std=0.00003984
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002033, std=0.00003315
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00024377, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000180, std=0.00000253
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000195, std=0.00000244
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.052 >= min_delta = 0.0. New best score: 0.623


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00002280, std=0.00006824
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00002379, std=0.00005956
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00002400, std=0.00006202
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00002940, std=0.00007412
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002159, std=0.00004298
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005824, std=0.00010456
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002031, std=0.00005542
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002316, std=0.00004170
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00448478, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000380, std=0.00000565
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000349, std=0.00000433
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.029 >= min_delta = 0.0. New best score: 0.593


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00002605, std=0.00007642
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00002692, std=0.00006857
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00002827, std=0.00006758
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00003312, std=0.00008513
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001962, std=0.00003992
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005400, std=0.00009859
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002045, std=0.00005699
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002270, std=0.00004161
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01122883, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000694, std=0.00001149
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000708, std=0.00000889
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.004 >= min_delta = 0.0. New best score: 0.590


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00003103, std=0.00009048
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00002949, std=0.00007479
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00003272, std=0.00007368
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00003617, std=0.00009247
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002059, std=0.00004202
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005613, std=0.00010048
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002194, std=0.00005393
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002470, std=0.00004425
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01465031, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000928, std=0.00001491
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000752, std=0.00000952
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00002777, std=0.00007175
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00003469, std=0.00006923
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00003604, std=0.00007719
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00004147, std=0.00008267
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001526, std=0.00003108
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00004370, std=0.00007687
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001686, std=0.00003996
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002062, std=0.00003645
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00590509, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00001106, std=0.00001926
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00001908, std=0.00002385
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00004281, std=0.00009540
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00003878, std=0.00006754
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00004328, std=0.00008577
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00004668, std=0.00008144
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002516, std=0.00005760
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00007184, std=0.00014749
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002520, std=0.00006977
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003295, std=0.00006738
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00966162, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00002002, std=0.00003501
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000840, std=0.00001061
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00003487, std=0.00008356
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00004360, std=0.00008057
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00004163, std=0.00008287
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00005193, std=0.00009587
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001620, std=0.00003258
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00004602, std=0.00008144
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001678, std=0.00003621
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002207, std=0.00003893
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01252119, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00001764, std=0.00003207
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00003141, std=0.00003946
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00003873, std=0.00009394
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00004931, std=0.00009098
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00004493, std=0.00008817
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00005636, std=0.00010421
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001715, std=0.00003872
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005129, std=0.00010138
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001810, std=0.00004402
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002442, std=0.00004848
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01268629, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00001873, std=0.00003078
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00003779, std=0.00004701
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00005209, std=0.00011487
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00005390, std=0.00009686
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00005525, std=0.00010727
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00006205, std=0.00011201
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002288, std=0.00004630
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006410, std=0.00011295
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002338, std=0.00005472
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003008, std=0.00005317
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00413854, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00002697, std=0.00004601
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00002624, std=0.00003227
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00004895, std=0.00010308
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00005777, std=0.00009258
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00005178, std=0.00009291
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00006568, std=0.00010509
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002085, std=0.00004395
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005758, std=0.00010554
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002024, std=0.00004640
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002701, std=0.00004990
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00887285, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00002667, std=0.00004248
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00004383, std=0.00005389
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00006480, std=0.00013234
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00006155, std=0.00010531
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00005900, std=0.00010143
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00007122, std=0.00012107
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002830, std=0.00005701
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00007797, std=0.00013903
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002617, std=0.00005934
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003638, std=0.00006488
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00946456, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00003313, std=0.00005235
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00001376, std=0.00001739
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.007 >= min_delta = 0.0. New best score: 0.583


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00007063, std=0.00015077
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00007067, std=0.00012462
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00006646, std=0.00011682
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00008083, std=0.00014157
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00003087, std=0.00006585
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00008955, std=0.00017019
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002937, std=0.00006464
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003952, std=0.00007583
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01381269, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00003820, std=0.00006504
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00001277, std=0.00001609
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00007012, std=0.00013599
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00006995, std=0.00010830
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00006817, std=0.00011405
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00008099, std=0.00012525
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002791, std=0.00005403
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00007571, std=0.00012971
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002718, std=0.00005691
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003357, std=0.00005777
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00046120, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00004076, std=0.00006332
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00003934, std=0.00004849
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00007559, std=0.00015885
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00007770, std=0.00013547
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00006742, std=0.00011649
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00008874, std=0.00015560
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002895, std=0.00005916
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00008135, std=0.00014805
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002800, std=0.00007485
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003577, std=0.00006522
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00594516, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00004437, std=0.00006977
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00005526, std=0.00006752
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00008838, std=0.00018773
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00009013, std=0.00014837
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00007831, std=0.00013121
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00010223, std=0.00016797
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002788, std=0.00006137
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00007885, std=0.00015200
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002756, std=0.00006410
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003481, std=0.00006766
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01241729, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00005228, std=0.00008391
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00003660, std=0.00004513
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00006372, std=0.00014836
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00007635, std=0.00014872
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00006668, std=0.00011124
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00008622, std=0.00016968
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002293, std=0.00004666
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006469, std=0.00011345
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002310, std=0.00005574
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002877, std=0.00005048
[Batch 20] image_adapter 梯度 scale: mean_abs=0.02630397, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00004072, std=0.00005837
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00007891, std=0.00009538
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00006455, std=0.00014367
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00007714, std=0.00013109
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00006140, std=0.00010231
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00008814, std=0.00014952
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002170, std=0.00004750
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006494, std=0.00012253
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001996, std=0.00004516
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002792, std=0.00005318
[Batch 20] image_adapter 梯度 scale: mean_abs=0.03198014, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00004270, std=0.00005940
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00008245, std=0.00009912
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00005401, std=0.00013127
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00007150, std=0.00011995
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00005737, std=0.00010126
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00008146, std=0.00013640
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001725, std=0.00003950
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005150, std=0.00009773
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001727, std=0.00004269
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002237, std=0.00004264
[Batch 20] image_adapter 梯度 scale: mean_abs=0.03179222, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00004466, std=0.00006273
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00010062, std=0.00012120
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00008765, std=0.00017603
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00009169, std=0.00014788
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00008508, std=0.00014778
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00010718, std=0.00017291
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002999, std=0.00006461
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00008513, std=0.00015691
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002942, std=0.00007564
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003781, std=0.00007053
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01756568, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00006004, std=0.00008863
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00007450, std=0.00009020
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00006172, std=0.00015527
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00007606, std=0.00013794
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00005890, std=0.00012167
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00008568, std=0.00015587
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002063, std=0.00005820
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006329, std=0.00014801
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001798, std=0.00004841
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002665, std=0.00006299
[Batch 20] image_adapter 梯度 scale: mean_abs=0.02527150, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00005466, std=0.00008443
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00010457, std=0.00012658
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00006336, std=0.00017669
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00007658, std=0.00015607
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00005704, std=0.00010850
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00008809, std=0.00018064
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001925, std=0.00004905
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005992, std=0.00012641
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001772, std=0.00004624
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002517, std=0.00005355
[Batch 20] image_adapter 梯度 scale: mean_abs=0.02422483, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00005251, std=0.00007457
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00010164, std=0.00012323
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_loss did not improve in the last 10 records. Best score: 0.583. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

Validation Metrics: [{'val_loss': 0.619696855545044, 'val_acc': 0.7149122953414917, 'val_auroc': 0.7724952697753906, 'val_f1': 0.7149122953414917}]
Validation Accuracy: tensor(0.7149, device='cpu')
Validation AUROC: tensor(0.7725, device='cpu')
Validation F1 Score: tensor(0.7149, device='cpu')
