In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!nvcc --version
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install ftfy regex tqdm
!pip install yacs
!pip install torch transformers pytorch-lightning

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Looking in indexes: https://download.pytorch.org/whl/cu121
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl (780.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.5/780.5 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-

In [3]:
# Cell 1: Install dependencies
!nvcc --version
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install ftfy regex tqdm
!pip install yacs
!pip install torch transformers pytorch-lightning
!pip install --upgrade pytorch-lightning optuna optuna-integration

# Cell 2: Import libraries
import torch
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Device Count:", torch.cuda.device_count())
print("Current Device:", torch.cuda.current_device())
print("Device Name:", torch.cuda.get_device_name(torch.cuda.current_device()))

import os
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
from transformers import Blip2Processor, Blip2ForConditionalGeneration, Blip2VisionConfig
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import pandas as pd
from yacs.config import CfgNode
import numpy as np
import json  # For reading JSONL files

if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
else:
    torch.set_default_tensor_type(torch.FloatTensor)

# Cell 3: Custom Dataset
class Custom_Dataset(Dataset):
    def __init__(self, cfg, root_folder, dataset, label, split='train', image_size=224, fast=True):
        super(Custom_Dataset, self).__init__()
        self.cfg = cfg
        self.root_folder = root_folder
        self.dataset = dataset
        self.split = split
        self.label = label
        self.image_size = image_size
        self.fast = fast

        # If JSONL annotation files are provided in the config, use them.
        if hasattr(cfg, "train_file"):
            if split == 'train':
                ann_file = cfg.train_file
            elif split == 'val':
                ann_file = cfg.val_file
            elif split == 'test':
                ann_file = cfg.test_file
            else:
                raise ValueError(f"Unknown split: {split}")

            self.data = []
            with open(ann_file, "r") as f:
                for line in f:
                    obj = json.loads(line)
                    self.data.append(obj)
        else:
            # Fallback: load from CSV
            self.info_file = cfg.info_file
            self.df = pd.read_csv(self.info_file)
            self.df = self.df[self.df["split"] == self.split].reset_index(drop=True)
            if self.label == "target":
                self.df = self.df[self.df["hate"] == 1].reset_index(drop=True)
            float_cols = self.df.select_dtypes(float).columns
            self.df[float_cols] = self.df[float_cols].fillna(-1).astype("Int64")

    def __len__(self):
        if hasattr(self.cfg, "train_file"):
            return len(self.data)
        else:
            return len(self.df)

    def __getitem__(self, idx):
        if hasattr(self.cfg, "train_file"):
            obj = self.data[idx]
            text = obj.get("text", "")
            image_name = obj.get("image", "")
            image_path = os.path.join(self.cfg.img_folder, image_name)
            try:
                image = Image.open(image_path).convert("RGB")
                image = image.resize((self.image_size, self.image_size))
            except Exception as e:
                raise ValueError(f"Error loading image {image_name}: {e}")
            # For the 'hate' task, we map "not harmful" to label 0; all others to 1.
            labels_list = obj.get("labels", [])
            label_str = labels_list[0] if labels_list else "unknown"
            if self.label == "hate":
                label_idx = 0 if label_str.lower() == "not harmful" else 1
            else:
                label_idx = 0  # default mapping if not specified
            item = {
                "image": image,
                "text": text,
                "label": label_idx,
                "idx_meme": obj.get("id", image_name),
            }
            return item
        else:
            row = self.df.iloc[idx]
            text = row["text"] if row["text"] != "None" else "null"
            image_fn = row["name"]
            try:
                image = Image.open(f"{self.cfg.img_folder}/{image_fn}").convert("RGB")
                image = image.resize((self.image_size, self.image_size))
            except Exception as e:
                raise ValueError(f"Error loading image {image_fn}: {e}")
            item = {
                "image": image,
                "text": text,
                "label": row[self.label],
                "idx_meme": row["name"],
            }
            return item

# Cell 4: BLIP-2 Collator
class MemeBLIP_Collator:
    def __init__(self, cfg):
        self.cfg = cfg
        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        self.blip_model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
        ).to(self.cfg.device)
        self.blip_model.eval()

    def split_text_into_chunks(self, text, max_length):
        tokens = self.processor.tokenizer(
            text, return_tensors="pt", truncation=False, add_special_tokens=False
        )["input_ids"].squeeze(0).tolist()
        return [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]

    def __call__(self, batch):
        image_features_list = []
        text_features_list = []
        labels = torch.LongTensor([item["label"] for item in batch]).to(self.cfg.device)
        idx_memes = [item["idx_meme"] for item in batch]
        batch_new = {"labels": labels, "idx_memes": idx_memes}

        for item in batch:
            inputs = self.processor(
                images=item["image"],
                text=item["text"] if item["text"] != "null" else "",
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(self.cfg.device, torch.float16)

            with torch.no_grad():
                image_features = self.blip_model.get_image_features(pixel_values=inputs["pixel_values"])
                text_outputs = self.blip_model.language_model(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    output_hidden_states=True,
                )
                text_features = text_outputs.hidden_states[-1].mean(dim=1)

            image_features_list.append(image_features.cpu().detach())
            text_features_list.append(text_features.cpu().detach())

        batch_new["image_features"] = torch.cat(image_features_list, dim=0).to(self.cfg.device)
        batch_new["text_features"] = torch.cat(text_features_list, dim=0).to(self.cfg.device)
        return batch_new

# Cell 5: Data Loading Functions
def load_dataset(cfg, split):
    dataset = Custom_Dataset(
        cfg=cfg,
        root_folder=cfg.root_dir,
        dataset=cfg.dataset_name,
        label=cfg.label,
        split=split,
        image_size=cfg.image_size,
        fast=cfg.fast_process,
    )
    return dataset

def create_dataloader(cfg, split="train"):
    dataset = load_dataset(cfg, split)
    collator = MemeBLIP_Collator(cfg)
    generator = torch.Generator(device="cuda") if torch.cuda.is_available() else torch.Generator()
    dataloader = DataLoader(
        dataset,
        batch_size=cfg.batch_size,
        shuffle=(split == "train"),
        generator=generator,
        collate_fn=collator,
    )
    return dataloader

# Cell 6: Configuration
cfg = CfgNode()
cfg.root_dir = '/content/drive/MyDrive/CLIPMM'
cfg.img_folder = '/content/drive/MyDrive/CLIPMM/memeData/images'
cfg.train_file = '/content/drive/MyDrive/CLIPMM/memeData/annotations/train.jsonl'
cfg.val_file   = '/content/drive/MyDrive/CLIPMM/memeData/annotations/val.jsonl'
cfg.test_file  = '/content/drive/MyDrive/CLIPMM/memeData/annotations/test.jsonl'
cfg.checkpoint_path = os.path.join(cfg.root_dir, 'checkpoints')
cfg.checkpoint_file = os.path.join(cfg.checkpoint_path, 'model.ckpt')
cfg.clip_variant = "ViT-L/14"
cfg.dataset_name = 'Pride'
cfg.name = 'MemeBLIP'
cfg.label = 'hate'
cfg.seed = 42
cfg.test_only = False
cfg.device = 'cuda'
cfg.gpus = [0]
if cfg.label == 'hate':
    cfg.class_names = ['Benign Meme', 'Harmful Meme']
elif cfg.label == 'humour':
    cfg.class_names = ['No Humour', 'Humour']
elif cfg.label == 'target':
    cfg.class_names = ['No particular target', 'Individual', 'Community', 'Organization']
elif cfg.label == 'stance':
    cfg.class_names = ['Neutral', 'Support', 'Oppose']
cfg.batch_size = 64
cfg.image_size = 224
cfg.num_mapping_layers = 1
cfg.unmapped_dim = 768
cfg.map_dim = 1024
cfg.num_pre_output_layers = 2
cfg.drop_probs = [0.4, 0.2, 0.3]
cfg.dropout_rate = 0.5
cfg.hidden_dim = 1024
cfg.lr = 5e-5
cfg.max_epochs = 50
cfg.weight_decay = 1e-4
cfg.num_classes = len(cfg.class_names)
cfg.scale = 30
cfg.print_model = True
cfg.fast_process = True
cfg.reproduce = False
cfg.ratio = 0.7
cfg.num_layers = 3
cfg.activation = 'ReLU'
cfg.hidden_dim1 = 1024
print(cfg)

# Cell 7: Cached Datasets
class CachedDataset(Dataset):
    def __init__(self, path='/content/drive/MyDrive/CLIPMM/cached_features/train.pt'):
        data = torch.load(path)
        self.image_features = data['image_features']
        self.text_features = data['text_features']
        self.labels = data['labels']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'image_features': self.image_features[idx],
            'text_features': self.text_features[idx],
            'labels': self.labels[idx]
        }

class CachedDataset2(Dataset):
    def __init__(self, path='/content/drive/MyDrive/CLIPMM/cached_features/val.pt'):
        data = torch.load(path)
        self.image_features = data['image_features']
        self.text_features = data['text_features']
        self.labels = data['labels']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'image_features': self.image_features[idx],
            'text_features': self.text_features[idx],
            'labels': self.labels[idx]
        }

train_loader = DataLoader(CachedDataset(), batch_size=cfg.batch_size, shuffle=True, generator=torch.Generator(device='cuda'), num_workers=0, pin_memory=False)
val_loader = DataLoader(CachedDataset2(), batch_size=cfg.batch_size, shuffle=False, generator=torch.Generator(device='cuda'), num_workers=0, pin_memory=False)

# Cell 8: Linear Projection
class LinearProjection(nn.Module):
    def __init__(self, input_dim, output_dim, num_layers, drop_probs):
        super(LinearProjection, self).__init__()
        if isinstance(drop_probs, list):
            dropout_prob = drop_probs[0]
        else:
            dropout_prob = drop_probs
        self.input_projection = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.LayerNorm(output_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_prob)
        )
        self.layers = nn.ModuleList()
        for _ in range(num_layers - 1):
            layer = nn.Sequential(
                nn.Linear(output_dim, output_dim),
                nn.LayerNorm(output_dim),
                nn.ReLU(),
                nn.Dropout(p=dropout_prob)
            )
            self.layers.append(layer)

    def forward(self, x):
        x = self.input_projection(x)
        for layer in self.layers:
            residual = x
            x = layer(x)
            x = x + residual
        return x

# Cell 9: Adapter
class Adapter(nn.Module):
    def __init__(self, c_in, reduction=1.5, dropout_rate=0.1):
        super(Adapter, self).__init__()
        reduced_dim = max(16, int(c_in // reduction))
        self.norm1 = nn.LayerNorm(c_in)
        self.fc = nn.Sequential(
            nn.Linear(c_in, reduced_dim, bias=False),
            nn.GELU(),
            nn.Linear(reduced_dim, c_in, bias=False)
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.norm2 = nn.LayerNorm(c_in)
        self.scale = nn.Parameter(torch.tensor(0.1))
        self.apply(self.init_weights)

    def forward(self, x):
        residual = x
        x = self.fc(self.norm1(x))
        x = self.dropout(x)
        x = residual + self.scale * x
        return self.norm2(x)

    @staticmethod
    def init_weights(m):
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight, nonlinearity='linear')

# Cell 10: Cosine Classifier
class CosineClassifierWithBias(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(output_dim, input_dim))
        self.bias = nn.Parameter(torch.zeros(output_dim))

    def forward(self, x):
        x_norm = F.normalize(x, dim=1)
        w_norm = F.normalize(self.weight, dim=1)
        cosine_sim = torch.matmul(x_norm, w_norm.T)
        return cosine_sim + self.bias

    def apply_weight(self, weight):
        with torch.no_grad():
            self.weight.copy_(weight)

# Cell 11: Main Model
from torch.optim.lr_scheduler import SequentialLR, LinearLR, CosineAnnealingLR

class MemeBLIP(pl.LightningModule):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.image_projection = LinearProjection(
            input_dim=1408,  # BLIP-2 vision output dim
            output_dim=cfg.map_dim,
            num_layers=1,
            drop_probs=cfg.drop_probs
        ).to(self.cfg.device)
        self.text_projection = LinearProjection(
            input_dim=768,  # BLIP-2 Q-Former output dim is 768
            output_dim=cfg.map_dim,
            num_layers=1,
            drop_probs=cfg.drop_probs
        ).to(self.cfg.device)

        self.image_adapter = Adapter(cfg.map_dim, reduction=2).to(self.cfg.device)
        self.text_adapter = Adapter(cfg.map_dim, reduction=2).to(self.cfg.device)
        self.pre_output_layer = nn.Sequential(
            nn.Linear(cfg.map_dim, cfg.hidden_dim),
            nn.LayerNorm(cfg.hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.5)
        )
        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
        ).to(cfg.device)
        self.map_dim = cfg.map_dim
        self.classifier = nn.Sequential(
            nn.Linear(cfg.hidden_dim, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(p=0.5),
            CosineClassifierWithBias(512, cfg.num_classes)
        )
        self.init_head_text_feat()
        self.cross_entropy_loss = nn.CrossEntropyLoss()
        self.acc = torchmetrics.Accuracy(task="multiclass", num_classes=cfg.num_classes)
        self.auroc = torchmetrics.AUROC(task="multiclass", num_classes=cfg.num_classes)
        self.f1 = torchmetrics.F1Score(task="multiclass", num_classes=cfg.num_classes)
        self.gradients = {}

    def save_gradient(self, name):
        def hook(module, grad_input, grad_output):
            self.gradients[name] = grad_output[0].detach()
        return hook

    def print_gradients(self, modules_to_check, batch_idx):
        for mod_name, module in modules_to_check.items():
            for name, param in module.named_parameters():
                if param.grad is not None:
                    grad_mean = param.grad.abs().mean().item()
                    grad_std = param.grad.std().item()
                    print(f"[Batch {batch_idx}] {mod_name} 梯度 {name}: mean_abs={grad_mean:.8f}, std={grad_std:.8f}")
                else:
                    print(f"[Batch {batch_idx}] {mod_name} 梯度 {name}: 无梯度")

    def register_hooks(self):
        self.image_projection.register_backward_hook(self.save_gradient("image_projection"))
        self.text_projection.register_backward_hook(self.save_gradient("text_projection"))
        self.image_adapter.fc.register_backward_hook(self.save_gradient("image_adapter"))
        self.text_adapter.fc.register_backward_hook(self.save_gradient("text_adapter"))
        self.pre_output_layer.register_backward_hook(self.save_gradient("pre_output_layer"))
        for i, layer in enumerate(self.classifier):
            if isinstance(layer, nn.Linear):
                layer.register_backward_hook(self.save_gradient(f"classifier_{i}"))

    def init_head_text_feat(self):
        print("Initialize head with text features")
        template = "a photo of a {}."
        prompts_list = [template.format(c.replace("_", " ")) for c in self.cfg.class_names]
        tokenized_prompts = self.processor.tokenizer(
            prompts_list, return_tensors="pt", padding=True, truncation=True
        ).to(self.cfg.device)
        prompts = {k: v for k, v in tokenized_prompts.items() if k in ["input_ids", "attention_mask"]}

        from transformers import Blip2Model
        text_model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
        text_model.to(self.cfg.device)

        with torch.no_grad():
            text_features = text_model.get_text_features(**prompts, return_dict=False)
        if isinstance(text_features, tuple):
            text_features = text_features[0]
        text_embeds = text_features.mean(dim=1)
        text_embeds = F.normalize(text_embeds, dim=-1)

        target_dim = 512
        if hasattr(self.classifier[-1], "apply_weight"):
            if text_embeds.size(1) != target_dim:
                projection = nn.Linear(text_embeds.size(1), 512).to(self.cfg.device, torch.float16)
                text_embeds_proj = projection(text_embeds)
                self.classifier[-1].apply_weight(text_embeds_proj)
        else:
            print("Warning: Classifier -1 does not have 'apply_weight' method. Skipping initialization.")

    def forward(self, batch):
        image_features = batch['image_features']
        text_features = batch['text_features']
        if isinstance(image_features, tuple):
            image_features = image_features[0].to(self.cfg.device)
        if isinstance(text_features, tuple):
            text_features = text_features[0].to(self.cfg.device)
        image_proj = self.image_projection(image_features).to(self.cfg.device)
        text_proj = self.text_projection(text_features).to(self.cfg.device)
        adapted_image = self.image_adapter(image_proj).to(self.cfg.device)
        adapted_text = self.text_adapter(text_proj).to(self.cfg.device)
        text_adapted_features = self.cfg.ratio * adapted_text + (1 - self.cfg.ratio) * text_proj
        image_adapted_features = self.cfg.ratio * adapted_image + (1 - self.cfg.ratio) * image_proj
        image_adapted_features = image_adapted_features / image_adapted_features.norm(dim=-1, keepdim=True)
        text_adapted_features = text_adapted_features / text_adapted_features.norm(dim=-1, keepdim=True)
        combined_features = torch.mul(image_adapted_features, text_adapted_features).to(self.cfg.device)
        pre_output_features = self.pre_output_layer(combined_features).to(self.cfg.device)
        logits = self.classifier(pre_output_features).squeeze(dim=1).to(self.cfg.device)
        return logits

    def common_step(self, batch):
        logits = self.forward(batch)
        preds_proxy = torch.sigmoid(logits)
        _, preds = logits.data.max(1)
        loss = self.cross_entropy_loss(logits, batch["labels"])
        acc = self.acc(preds, batch["labels"])
        auroc = self.auroc(preds_proxy, batch['labels'])
        f1 = self.f1(preds, batch["labels"])
        return {"loss": loss, "acc": acc, "auroc": auroc, "f1": f1}

    def training_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = self.cross_entropy_loss(logits, batch["labels"])
        preds_proxy = torch.sigmoid(logits)
        _, preds = logits.data.max(1)
        acc = self.acc(preds, batch["labels"])
        auroc = self.auroc(preds_proxy, batch['labels'])
        f1 = self.f1(preds, batch["labels"])
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_acc", acc, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_auroc", auroc, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_f1", f1, on_step=True, on_epoch=True, prog_bar=True)
        if batch_idx > 0 and batch_idx % 20 == 0:
            modules_to_check = {
                "image_projection": self.image_projection,
                "text_projection": self.text_projection,
                "image_adapter": self.image_adapter,
                "text_adapter": self.text_adapter,
                "pre_output_layer": self.pre_output_layer,
                "classifier": self.classifier,
            }
            self.print_gradients(modules_to_check, batch_idx)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW([
            {"params": self.model.parameters(), "weight_decay": 1e-5},
            {"params": list(self.image_projection.parameters()) + list(self.text_projection.parameters()), "weight_decay": 5e-4},
            {"params": list(self.image_adapter.parameters()) + list(self.text_adapter.parameters()), "weight_decay": 1e-3},
            {"params": self.pre_output_layer.parameters(), "weight_decay": 5e-4},
            {"params": self.classifier.parameters(), "weight_decay": 1e-3},
        ], lr=self.cfg.lr)
        warmup_epochs = 3
        total_epochs = self.cfg.max_epochs
        cosine_epochs = total_epochs - warmup_epochs
        scheduler_warmup = LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=warmup_epochs)
        scheduler_cosine = CosineAnnealingLR(optimizer, T_max=cosine_epochs, eta_min=1e-6)
        scheduler = SequentialLR(optimizer, schedulers=[scheduler_warmup, scheduler_cosine], milestones=[warmup_epochs])
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "epoch"}}

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = self.cross_entropy_loss(logits, batch["labels"])
        preds_proxy = torch.sigmoid(logits)
        _, preds = logits.data.max(1)
        acc = self.acc(preds, batch["labels"])
        auroc = self.auroc(torch.softmax(logits, dim=-1), batch["labels"])
        f1 = self.f1(preds, batch["labels"])
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)
        self.log("val_auroc", auroc, prog_bar=True)
        self.log("val_f1", f1, prog_bar=True)
        return {"loss": loss, "acc": acc, "auroc": auroc, "f1": f1}

    def on_train_epoch_end(self):
        torch.cuda.empty_cache()

# Cell 12: Training Setup
def initialize_weights(module):
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    dirpath=cfg.checkpoint_path,
    filename="memeBLIP-{epoch:02d}-{val_loss:.2f}",
    save_top_k=1,
    monitor="val_loss",
    mode="min"
)

early_stop_callback = EarlyStopping(
    monitor="val_loss",
    patience=10,
    verbose=True,
    mode="min"
)

model = MemeBLIP(cfg)
model.register_hooks()
model.apply(initialize_weights)
model.to(cfg.device)

trainer = pl.Trainer(
    max_epochs=cfg.max_epochs,
    accelerator="gpu",
    precision=16,
    gradient_clip_val=1.0,
    gradient_clip_algorithm='norm',
    devices=len(cfg.gpus),
    logger=pl.loggers.TensorBoardLogger("logs/"),
    callbacks=[early_stop_callback, checkpoint_callback]
)

# Cell 13: Training and Validation
trainer.fit(model, train_loader, val_loader)
validation_metrics = trainer.validate(model, val_loader, verbose=True)
print("Validation Metrics:", validation_metrics)
print("Validation Accuracy:", trainer.callback_metrics["val_acc"])
print("Validation AUROC:", trainer.callback_metrics["val_auroc"])
print("Validation F1 Score:", trainer.callback_metrics["val_f1"])


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting optuna-integration
  Downloading optuna_integration-4.2.1-py3-none-any.whl.metadata (12 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna_integration-4.2.1-py3-none-any.whl (97 kB)
[2K

  _C._set_default_tensor_type(t)
  data = torch.load(path)


activation: ReLU
batch_size: 64
checkpoint_file: /content/drive/MyDrive/CLIPMM/checkpoints/model.ckpt
checkpoint_path: /content/drive/MyDrive/CLIPMM/checkpoints
class_names: ['Benign Meme', 'Harmful Meme']
clip_variant: ViT-L/14
dataset_name: Pride
device: cuda
drop_probs: [0.4, 0.2, 0.3]
dropout_rate: 0.5
fast_process: True
gpus: [0]
hidden_dim: 1024
hidden_dim1: 1024
image_size: 224
img_folder: /content/drive/MyDrive/CLIPMM/memeData/images
label: hate
lr: 5e-05
map_dim: 1024
max_epochs: 50
name: MemeBLIP
num_classes: 2
num_layers: 3
num_mapping_layers: 1
num_pre_output_layers: 2
print_model: True
ratio: 0.7
reproduce: False
root_dir: /content/drive/MyDrive/CLIPMM
scale: 30
seed: 42
test_file: /content/drive/MyDrive/CLIPMM/memeData/annotations/test.jsonl
test_only: False
train_file: /content/drive/MyDrive/CLIPMM/memeData/annotations/train.jsonl
unmapped_dim: 768
val_file: /content/drive/MyDrive/CLIPMM/memeData/annotations/val.jsonl
weight_decay: 0.0001


  data = torch.load(path)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Initialize head with text features


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/lightning_fabric/connector.py:572: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUD

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

  self._maybe_warn_non_full_backward_hook(args, result, grad_fn)


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00000889, std=0.00001776
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00000926, std=0.00001530
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00000946, std=0.00001769
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00001154, std=0.00001902
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001060, std=0.00001882
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00002882, std=0.00004634
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00000914, std=0.00001710
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00001083, std=0.00001748
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00001375, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000073, std=0.00000102
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000082, std=0.00000102
[Batch 20] image_adapter 梯度 fc.0.weight: m

  grad_std = param.grad.std().item()


[Batch 40] image_projection 梯度 input_projection.0.weight: mean_abs=0.00000930, std=0.00001836
[Batch 40] image_projection 梯度 input_projection.0.bias: mean_abs=0.00000947, std=0.00001575
[Batch 40] image_projection 梯度 input_projection.1.weight: mean_abs=0.00000995, std=0.00001762
[Batch 40] image_projection 梯度 input_projection.1.bias: mean_abs=0.00001177, std=0.00001954
[Batch 40] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001106, std=0.00001940
[Batch 40] text_projection 梯度 input_projection.0.bias: mean_abs=0.00002956, std=0.00004695
[Batch 40] text_projection 梯度 input_projection.1.weight: mean_abs=0.00000921, std=0.00001728
[Batch 40] text_projection 梯度 input_projection.1.bias: mean_abs=0.00001125, std=0.00001790
[Batch 40] image_adapter 梯度 scale: mean_abs=0.00039578, std=nan
[Batch 40] image_adapter 梯度 norm1.weight: mean_abs=0.00000076, std=0.00000105
[Batch 40] image_adapter 梯度 norm1.bias: mean_abs=0.00000095, std=0.00000120
[Batch 40] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved. New best score: 0.692


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00000906, std=0.00001733
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00000907, std=0.00001458
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00000917, std=0.00001626
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00001128, std=0.00001814
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001158, std=0.00002023
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00003179, std=0.00005006
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00000947, std=0.00001702
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00001206, std=0.00001901
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00025223, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000071, std=0.00000101
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000081, std=0.00000102
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.003 >= min_delta = 0.0. New best score: 0.689


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00001044, std=0.00002055
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00001069, std=0.00001706
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00001148, std=0.00002037
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00001340, std=0.00002139
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001324, std=0.00002414
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00003639, std=0.00006036
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001100, std=0.00002055
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00001380, std=0.00002300
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00017481, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000090, std=0.00000127
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000105, std=0.00000133
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.017 >= min_delta = 0.0. New best score: 0.672


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00001667, std=0.00004006
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00001661, std=0.00003199
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00001852, std=0.00004296
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00002076, std=0.00004013
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001944, std=0.00003609
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005170, std=0.00008780
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001741, std=0.00004238
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002026, std=0.00003457
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00240997, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000175, std=0.00000248
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000176, std=0.00000226
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.052 >= min_delta = 0.0. New best score: 0.620


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00001998, std=0.00005566
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00002068, std=0.00004499
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00002371, std=0.00005775
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00002546, std=0.00005582
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001930, std=0.00003874
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005346, std=0.00009736
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001998, std=0.00005168
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002239, std=0.00004072
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00390910, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000339, std=0.00000587
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000298, std=0.00000375
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.025 >= min_delta = 0.0. New best score: 0.596


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00002497, std=0.00006470
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00002474, std=0.00005236
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00002821, std=0.00006307
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00003061, std=0.00006484
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001838, std=0.00003915
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00005061, std=0.00009746
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001919, std=0.00004771
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002306, std=0.00004468
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00979817, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00000626, std=0.00001167
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000696, std=0.00000880
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.003 >= min_delta = 0.0. New best score: 0.593


[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00003580, std=0.00009193
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00003433, std=0.00007310
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00003962, std=0.00009062
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00004199, std=0.00008914
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002302, std=0.00004783
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006272, std=0.00011532
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002430, std=0.00006319
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003051, std=0.00005699
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01898689, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00001159, std=0.00002127
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000704, std=0.00000888
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00002921, std=0.00008450
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00003678, std=0.00008016
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00003881, std=0.00008495
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00004490, std=0.00009751
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00001535, std=0.00003223
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00004347, std=0.00007790
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00001697, std=0.00004086
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00002312, std=0.00004237
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00936200, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00001119, std=0.00002002
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00002058, std=0.00002615
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00004202, std=0.00010423
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00004017, std=0.00008653
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00004643, std=0.00010377
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00004788, std=0.00010278
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002255, std=0.00004862
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006019, std=0.00011635
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002342, std=0.00005992
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003278, std=0.00006305
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00530466, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00001793, std=0.00003329
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000686, std=0.00000871
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00004378, std=0.00010435
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00004148, std=0.00008507
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00004385, std=0.00008528
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00004935, std=0.00010070
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002271, std=0.00004813
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006211, std=0.00011724
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002349, std=0.00005369
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003361, std=0.00006386
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01923594, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00001780, std=0.00002914
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00001424, std=0.00001767
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00005546, std=0.00012263
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00005431, std=0.00009731
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00005652, std=0.00010573
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00006212, std=0.00011042
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002509, std=0.00005317
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006953, std=0.00013305
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002562, std=0.00005841
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003742, std=0.00007174
[Batch 20] image_adapter 梯度 scale: mean_abs=0.03279749, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00002454, std=0.00004205
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00000852, std=0.00001064
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00004828, std=0.00010128
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00004787, std=0.00008212
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00004927, std=0.00008887
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00005462, std=0.00009349
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002354, std=0.00004901
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006620, std=0.00011945
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002325, std=0.00005010
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003475, std=0.00006303
[Batch 20] image_adapter 梯度 scale: mean_abs=0.00278873, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00002318, std=0.00003903
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00002629, std=0.00003260
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00005609, std=0.00011597
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00005781, std=0.00009709
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00005630, std=0.00009983
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00006581, std=0.00011098
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002506, std=0.00005301
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006896, std=0.00012779
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002620, std=0.00005883
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003653, std=0.00006830
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01327762, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00002750, std=0.00004476
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00003234, std=0.00003990
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00004613, std=0.00010077
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00005430, std=0.00009292
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00005056, std=0.00008867
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00006370, std=0.00010954
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002146, std=0.00004894
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00006158, std=0.00012511
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002201, std=0.00005845
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00003240, std=0.00006677
[Batch 20] image_adapter 梯度 scale: mean_abs=0.02000820, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00002476, std=0.00004027
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00004749, std=0.00005758
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00007131, std=0.00014284
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00007529, std=0.00012396
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00007135, std=0.00011812
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00008656, std=0.00014224
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00002839, std=0.00005514
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00007698, std=0.00013094
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00002825, std=0.00006025
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00004030, std=0.00006917
[Batch 20] image_adapter 梯度 scale: mean_abs=0.01012217, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00003758, std=0.00005848
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00003721, std=0.00004534
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

[Batch 20] image_projection 梯度 input_projection.0.weight: mean_abs=0.00007945, std=0.00016427
[Batch 20] image_projection 梯度 input_projection.0.bias: mean_abs=0.00008016, std=0.00013756
[Batch 20] image_projection 梯度 input_projection.1.weight: mean_abs=0.00006691, std=0.00011405
[Batch 20] image_projection 梯度 input_projection.1.bias: mean_abs=0.00008988, std=0.00015358
[Batch 20] text_projection 梯度 input_projection.0.weight: mean_abs=0.00003312, std=0.00006850
[Batch 20] text_projection 梯度 input_projection.0.bias: mean_abs=0.00009133, std=0.00016910
[Batch 20] text_projection 梯度 input_projection.1.weight: mean_abs=0.00003202, std=0.00007605
[Batch 20] text_projection 梯度 input_projection.1.bias: mean_abs=0.00004563, std=0.00008458
[Batch 20] image_adapter 梯度 scale: mean_abs=0.02154081, std=nan
[Batch 20] image_adapter 梯度 norm1.weight: mean_abs=0.00004023, std=0.00005859
[Batch 20] image_adapter 梯度 norm1.bias: mean_abs=0.00003401, std=0.00004144
[Batch 20] image_adapter 梯度 fc.0.weight: m

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_loss did not improve in the last 10 records. Best score: 0.593. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

Validation Metrics: [{'val_loss': 0.6088424921035767, 'val_acc': 0.7236841917037964, 'val_auroc': 0.7879348397254944, 'val_f1': 0.7236841917037964}]
Validation Accuracy: tensor(0.7237, device='cpu')
Validation AUROC: tensor(0.7879, device='cpu')
Validation F1 Score: tensor(0.7237, device='cpu')
