In [1]:
import torch
import random
import numpy as np

import torch.backends.cudnn as cudnn

import lavis.tasks as tasks
from lavis.common.config import Config
from lavis.common.dist_utils import get_rank, init_distributed_mode
from lavis.common.logger import setup_logger
from lavis.common.optims import (
    LinearWarmupCosineLRScheduler,
    LinearWarmupStepLRScheduler,
)
from lavis.common.utils import now

# imports modules for registration
from lavis.datasets.builders import *
from data.builders import *
from model import *
from lavis.models import *
from lavis.processors import *
from lavis.runners.runner_base import RunnerBase
from lavis.tasks import *

2024-07-02 14:41:00.769747: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
def set_random_seed(seed, deterministic=False):
    """Seed every random number generator used in this notebook.

    The same seed is handed to Python's ``random`` module, NumPy's global
    generator, and PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed_all``).

    Args:
        seed (int): Value passed to each seeding function.
        deterministic (bool): When True, also set
            ``torch.backends.cudnn.deterministic = True`` and
            ``torch.backends.cudnn.benchmark = False``. Default: False.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Nothing else to do unless the caller asked for deterministic cuDNN.
    if not deterministic:
        return

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# ALBEF

In [100]:
# ALBEF VQA loss on a random two-image batch. Unlike the later cells, this one
# does not call .eval() on the model.
set_random_seed(24, True)

# torch.rand runs without a device argument and the result is then moved to the GPU.
image_rand = torch.rand(2, 3, 384, 384).to("cuda:0")
model = load_model("albef_vqa", model_type="vqav2")
model = model.to("cuda:0")

# Five answers in total; n_answers = [4, 1] presumably assigns the first four to
# question 1 and the last to question 2 -- confirm against the model's forward().
samples = dict(
    image=image_rand,
    text_input=["What is this?", "What is that?"],
    answer=["cat"] * 2 + ["dog"] * 3,
    weight=torch.tensor([0.25] * 4 + [1.0], device="cuda:0"),
    n_answers=torch.tensor([4, 1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output = model(samples)
print("output.loss: ", output.loss)

INFO - 2024-07-01 12:31:49,267 - base_model - Missing keys []
INFO - 2024-07-01 12:31:49,268 - base_model - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt


output.loss:  tensor(7.4633, device='cuda:0', grad_fn=<DivBackward0>)


In [68]:
# ALBEF (eval mode): compare the loss of a "compact" batch (one question may carry
# several answers) with an "expanded" batch in which each question is repeated
# once per answer and the weights are rescaled by 3/2 (3 expanded rows / 2 questions).
set_random_seed(24, True)

image_rand = torch.rand(2, 3, 384, 384).to("cuda:0")
model = load_model("albef_vqa", model_type="vqav2").to("cuda:0")
model = model.eval()

# Compact batch: 2 questions, 3 answers (2 + 1).
samples = dict(
    image=image_rand,
    text_input=["What is this?", "What is that?"],
    answer=["cat", "dog", "dog"],
    weight=torch.tensor([0.5, 0.5, 1.0], device="cuda:0"),
    n_answers=torch.tensor([2, 1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output = model(samples)
print("output.loss: ", output.loss)

# Expanded batch: image 0 / question 0 duplicated, exactly one answer per row.
samples_new = dict(
    image=image_rand[[0, 0, 1]],
    text_input=["What is this?"] * 2 + ["What is that?"],
    answer=["cat", "dog", "dog"],
    weight=torch.tensor([w * 3 / 2 for w in (0.5, 0.5, 1.0)], device="cuda:0"),
    n_answers=torch.tensor([1] * 3, device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output_new = model(samples_new)
print("output_new.loss: ", output_new.loss)

INFO - 2024-07-01 13:39:07,171 - base_model - Missing keys []
INFO - 2024-07-01 13:39:07,172 - base_model - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt


output.loss:  tensor(7.4977, device='cuda:0', grad_fn=<DivBackward0>)
output_new.loss:  tensor(7.4977, device='cuda:0', grad_fn=<DivBackward0>)


In [14]:
# Tokenize the three questions of the expanded batch with the model's tokenizer
# (padding to the longest entry) and move the encoding to the GPU.
tokenized_text = model.tokenizer(
    ["What is this?"] * 2 + ["What is that?"],
    return_tensors="pt",
    max_length=1000,
    truncation=True,
    padding="longest",
).to("cuda:0")

tokenized_text

{'input_ids': tensor([[ 101, 2054, 2003, 2023, 1029,  102],
        [ 101, 2054, 2003, 2023, 1029,  102],
        [ 101, 2054, 2003, 2008, 1029,  102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [24]:
# Same tokenizer settings as for the three-question encoding, applied to a single
# question so it can be paired with a single row of image embeddings.
tokenized_text_one = model.tokenizer(
    ["What is this?"],
    return_tensors="pt",
    max_length=1000,
    truncation=True,
    padding="longest",
).to("cuda:0")

tokenized_text_one

{'input_ids': tensor([[ 101, 2054, 2003, 2023, 1029,  102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [35]:
# Image embeddings for row 0 of `output_new`; indexing with the list [0] keeps the
# leading batch dimension.
output_new.intermediate_output.image_embeds[[0]]

tensor([[[-0.1084,  0.1659,  0.2482,  ..., -0.4964, -0.0917,  0.2431],
         [ 0.4797,  0.4930,  0.3895,  ..., -0.7556, -0.4877,  0.5836],
         [ 1.0337,  0.6387,  0.9262,  ..., -0.4521, -0.3721, -0.4231],
         ...,
         [ 0.5401,  0.1490,  0.0636,  ..., -1.2835,  0.4325, -0.9287],
         [ 0.0058,  0.1974, -0.5898,  ..., -0.8813,  0.2463,  0.0333],
         [-0.0853,  0.0534, -0.6248,  ..., -1.1418,  0.4026,  0.0579]]],
       device='cuda:0', grad_fn=<IndexBackward0>)

In [34]:
# Image embeddings for every row of `output_new`.
output_new.intermediate_output.image_embeds

tensor([[[-0.1084,  0.1659,  0.2482,  ..., -0.4964, -0.0917,  0.2431],
         [ 0.4797,  0.4930,  0.3895,  ..., -0.7556, -0.4877,  0.5836],
         [ 1.0337,  0.6387,  0.9262,  ..., -0.4521, -0.3721, -0.4231],
         ...,
         [ 0.5401,  0.1490,  0.0636,  ..., -1.2835,  0.4325, -0.9287],
         [ 0.0058,  0.1974, -0.5898,  ..., -0.8813,  0.2463,  0.0333],
         [-0.0853,  0.0534, -0.6248,  ..., -1.1418,  0.4026,  0.0579]],

        [[-0.1084,  0.1659,  0.2482,  ..., -0.4964, -0.0917,  0.2431],
         [ 0.4797,  0.4930,  0.3895,  ..., -0.7556, -0.4877,  0.5836],
         [ 1.0337,  0.6387,  0.9262,  ..., -0.4521, -0.3721, -0.4231],
         ...,
         [ 0.5401,  0.1490,  0.0636,  ..., -1.2835,  0.4325, -0.9287],
         [ 0.0058,  0.1974, -0.5898,  ..., -0.8813,  0.2463,  0.0333],
         [-0.0853,  0.0534, -0.6248,  ..., -1.1418,  0.4026,  0.0579]],

        [[-0.0857,  0.0592,  0.2299,  ..., -0.5149, -0.0368,  0.2147],
         [ 0.5761,  0.4005,  0.2458,  ..., -0

In [20]:
# Shape of the image embeddings stored on `output_new`.
output_new.intermediate_output.image_embeds.shape

torch.Size([3, 577, 768])

In [42]:
# Run the text encoder directly on the full tokenized batch together with all rows
# of image embeddings from `output_new`.
set_random_seed(24, True)
model.text_encoder.forward_automask(
    visual_embeds=output_new.intermediate_output.image_embeds,
    tokenized_text=tokenized_text,
)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0211,  0.1497,  0.3861,  ..., -0.3361,  0.0171, -0.4375],
         [-0.0165,  0.2702, -0.2013,  ...,  0.1000,  0.1935, -0.0024],
         [ 0.1865,  0.1759,  0.0120,  ...,  0.1826,  0.3152,  0.2304],
         [ 0.1856,  0.1138,  0.0652,  ..., -0.0163,  0.2820,  0.1953],
         [-0.3360,  0.0219, -0.2263,  ...,  0.0550,  0.2021, -0.0053],
         [-0.2757,  0.0398, -0.2054,  ...,  0.0777,  0.2185,  0.0145]],

        [[-0.0211,  0.1497,  0.3861,  ..., -0.3361,  0.0171, -0.4375],
         [-0.0165,  0.2702, -0.2013,  ...,  0.1000,  0.1935, -0.0024],
         [ 0.1865,  0.1759,  0.0120,  ...,  0.1826,  0.3152,  0.2304],
         [ 0.1856,  0.1138,  0.0652,  ..., -0.0163,  0.2820,  0.1953],
         [-0.3360,  0.0219, -0.2263,  ...,  0.0550,  0.2021, -0.0053],
         [-0.2757,  0.0398, -0.2054,  ...,  0.0777,  0.2185,  0.0145]],

        [[ 0.0860,  0.2904,  0.4131,  ..., -0.3347,  0.0586, -0.5755],
         [

In [43]:
# Run the text encoder on a single question paired with row 1 of the image
# embeddings from `output_new`, for comparison with the batched call.
set_random_seed(24, True)
model.text_encoder.forward_automask(
    visual_embeds=output_new.intermediate_output.image_embeds[[1]],
    tokenized_text=tokenized_text_one,
)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0211,  0.1497,  0.3861,  ..., -0.3361,  0.0171, -0.4375],
         [-0.0165,  0.2702, -0.2013,  ...,  0.1000,  0.1935, -0.0024],
         [ 0.1865,  0.1759,  0.0120,  ...,  0.1826,  0.3152,  0.2304],
         [ 0.1856,  0.1138,  0.0652,  ..., -0.0163,  0.2820,  0.1953],
         [-0.3360,  0.0219, -0.2263,  ...,  0.0550,  0.2021, -0.0053],
         [-0.2757,  0.0398, -0.2054,  ...,  0.0777,  0.2185,  0.0145]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>), pooler_output=None, hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)

In [81]:
# List the entries exposed on the model output's `intermediate_output`.
output.intermediate_output.keys()

odict_keys(['image_embeds', 'image_embeds_m', 'encoder_output', 'encoder_output_m', 'decoder_output', 'decoder_labels'])

In [51]:
# Encoder last hidden state recorded on `output` (the compact batch).
output.intermediate_output.encoder_output.last_hidden_state

tensor([[[-0.0211,  0.1497,  0.3861,  ..., -0.3361,  0.0171, -0.4375],
         [-0.0165,  0.2702, -0.2013,  ...,  0.1000,  0.1935, -0.0024],
         [ 0.1865,  0.1759,  0.0120,  ...,  0.1826,  0.3152,  0.2304],
         [ 0.1856,  0.1138,  0.0652,  ..., -0.0163,  0.2820,  0.1953],
         [-0.3360,  0.0219, -0.2263,  ...,  0.0550,  0.2021, -0.0053],
         [-0.2757,  0.0398, -0.2054,  ...,  0.0777,  0.2185,  0.0145]],

        [[ 0.0860,  0.2904,  0.4131,  ..., -0.3347,  0.0586, -0.5755],
         [ 0.0639,  0.1792, -0.1922,  ..., -0.1420,  0.1322, -0.0664],
         [ 0.2262,  0.1518, -0.1012,  ..., -0.0654,  0.2100,  0.1151],
         [ 0.1013,  0.0876, -0.0941,  ..., -0.1563,  0.2485,  0.1559],
         [-0.3125,  0.0323, -0.2332,  ...,  0.0400,  0.2050, -0.0297],
         [-0.2555,  0.0622, -0.2154,  ...,  0.0544,  0.2151,  0.0068]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)

In [52]:
# Encoder last hidden state recorded on `output_new` (the expanded batch).
output_new.intermediate_output.encoder_output.last_hidden_state

tensor([[[-0.0211,  0.1497,  0.3861,  ..., -0.3361,  0.0171, -0.4375],
         [-0.0165,  0.2702, -0.2013,  ...,  0.1000,  0.1935, -0.0024],
         [ 0.1865,  0.1759,  0.0120,  ...,  0.1826,  0.3152,  0.2304],
         [ 0.1856,  0.1138,  0.0652,  ..., -0.0163,  0.2820,  0.1953],
         [-0.3360,  0.0219, -0.2263,  ...,  0.0550,  0.2021, -0.0053],
         [-0.2757,  0.0398, -0.2054,  ...,  0.0777,  0.2185,  0.0145]],

        [[-0.0211,  0.1497,  0.3861,  ..., -0.3361,  0.0171, -0.4375],
         [-0.0165,  0.2702, -0.2013,  ...,  0.1000,  0.1935, -0.0024],
         [ 0.1865,  0.1759,  0.0120,  ...,  0.1826,  0.3152,  0.2304],
         [ 0.1856,  0.1138,  0.0652,  ..., -0.0163,  0.2820,  0.1953],
         [-0.3360,  0.0219, -0.2263,  ...,  0.0550,  0.2021, -0.0053],
         [-0.2757,  0.0398, -0.2054,  ...,  0.0777,  0.2185,  0.0145]],

        [[ 0.0860,  0.2904,  0.4131,  ..., -0.3347,  0.0586, -0.5755],
         [ 0.0639,  0.1792, -0.1922,  ..., -0.1420,  0.1322, -0.0664],
  

In [54]:
# Decoder logits recorded on `output` (the compact batch).
output.intermediate_output.decoder_output.logits

tensor([[[-5.3261, -5.1228, -5.1128,  ..., -5.0842, -5.0993, -5.0917],
         [-7.4742, -7.7043, -7.6620,  ..., -7.6386, -7.7235, -7.6797],
         [-2.5261, -2.5918, -2.5996,  ..., -2.5963, -2.5992, -2.5989]],

        [[-5.3261, -5.1228, -5.1128,  ..., -5.0842, -5.0993, -5.0917],
         [-7.3359, -7.6161, -7.5978,  ..., -7.5692, -7.6225, -7.6111],
         [-2.5266, -2.5933, -2.6008,  ..., -2.5978, -2.6008, -2.6003]],

        [[-5.4162, -5.2113, -5.2096,  ..., -5.1702, -5.1884, -5.1868],
         [-7.3218, -7.6313, -7.6345,  ..., -7.6032, -7.6584, -7.6454],
         [-2.5435, -2.6079, -2.6154,  ..., -2.6123, -2.6143, -2.6142]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [55]:
# Decoder loss tensor recorded on `output`, i.e. before the answer weights are applied
# (looks like one entry per answer -- TODO confirm).
output.intermediate_output.decoder_output.loss

tensor([8.3667, 8.0175, 6.8032], device='cuda:0', grad_fn=<AddBackward0>)

In [64]:
# Manual recomputation of the compact-batch loss for comparison with `output.loss`:
# weight the decoder losses, reduce with Tensor.sum(), and divide by 2 (the compact
# batch holds 2 questions).
(output.intermediate_output.decoder_output.loss * torch.tensor([0.5, 0.5, 1.0]).to("cuda:0")).sum() / 2

tensor(7.4977, device='cuda:0', grad_fn=<DivBackward0>)

In [58]:
# Loss reported by the model for `output` (the compact batch).
output.loss

tensor(7.4977, device='cuda:0', grad_fn=<DivBackward0>)

In [56]:
# Decoder logits recorded on `output_new` (the expanded batch).
output_new.intermediate_output.decoder_output.logits

tensor([[[-5.3261, -5.1228, -5.1128,  ..., -5.0842, -5.0993, -5.0917],
         [-7.4742, -7.7043, -7.6620,  ..., -7.6386, -7.7235, -7.6797],
         [-2.5261, -2.5918, -2.5996,  ..., -2.5963, -2.5992, -2.5989]],

        [[-5.3261, -5.1228, -5.1128,  ..., -5.0842, -5.0993, -5.0917],
         [-7.3359, -7.6161, -7.5978,  ..., -7.5692, -7.6226, -7.6111],
         [-2.5266, -2.5933, -2.6008,  ..., -2.5978, -2.6008, -2.6003]],

        [[-5.4162, -5.2113, -5.2096,  ..., -5.1702, -5.1884, -5.1868],
         [-7.3218, -7.6314, -7.6345,  ..., -7.6032, -7.6584, -7.6454],
         [-2.5435, -2.6079, -2.6154,  ..., -2.6123, -2.6143, -2.6142]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [57]:
# Decoder loss tensor recorded on `output_new`, before any weighting is applied.
output_new.intermediate_output.decoder_output.loss

tensor([8.3667, 8.0175, 6.8032], device='cuda:0', grad_fn=<AddBackward0>)

In [65]:
# Manual recomputation for the expanded batch, to compare with `output_new.loss`:
# read the decoder losses from `output_new` (not `output`), apply the un-rescaled
# weights, reduce with Tensor.sum(), and divide by 3 (the expanded batch has 3 rows).
(output_new.intermediate_output.decoder_output.loss * torch.tensor([0.5, 0.5, 1.0]).to("cuda:0")).sum() / 3

tensor(4.9984, device='cuda:0', grad_fn=<DivBackward0>)

In [59]:
# Loss reported by the model for `output_new` (the expanded batch).
output_new.loss

tensor(4.9984, device='cuda:0', grad_fn=<DivBackward0>)

In [69]:
# ALBEF (eval mode), single question with two answers: compact batch versus an
# expanded batch that repeats the image/question once per answer with weights
# rescaled by 2 (2 expanded rows / 1 question).
set_random_seed(24, True)

image_rand = torch.rand(1, 3, 384, 384).to("cuda:0")
model = load_model("albef_vqa", model_type="vqav2").to("cuda:0")
model = model.eval()

# Compact batch: 1 question, 2 answers.
samples = dict(
    image=image_rand,
    text_input=["What is this?"],
    answer=["cat", "dog"],
    weight=torch.tensor([0.5, 0.5], device="cuda:0"),
    n_answers=torch.tensor([2], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output = model(samples)
print("output.loss: ", output.loss)

# Expanded batch: the same image/question twice, one answer per row.
samples_new = dict(
    image=image_rand[[0, 0]],
    text_input=["What is this?"] * 2,
    answer=["cat", "dog"],
    weight=torch.tensor([w * 2 for w in (0.5, 0.5)], device="cuda:0"),
    n_answers=torch.tensor([1] * 2, device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output_new = model(samples_new)
print("output_new.loss: ", output_new.loss)

INFO - 2024-07-01 13:47:17,642 - base_model - Missing keys []
INFO - 2024-07-01 13:47:17,643 - base_model - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt


output.loss:  tensor(8.1921, device='cuda:0', grad_fn=<DivBackward0>)
output_new.loss:  tensor(8.1921, device='cuda:0', grad_fn=<DivBackward0>)


In [71]:
# ALBEF (eval mode), 4 + 1 answers: compact batch versus an expanded batch of five
# rows with weights rescaled by 5/2 (5 expanded rows / 2 questions).
set_random_seed(24, True)

image_rand = torch.rand(2, 3, 384, 384).to("cuda:0")
model = load_model("albef_vqa", model_type="vqav2").to("cuda:0")
model = model.eval()

# Compact batch: 2 questions, 5 answers (4 + 1).
samples = dict(
    image=image_rand,
    text_input=["What is this?", "What is that?"],
    answer=["cat"] * 4 + ["dog"],
    weight=torch.tensor([0.25] * 4 + [1.0], device="cuda:0"),
    n_answers=torch.tensor([4, 1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output = model(samples)
print("output.loss: ", output.loss)

# Expanded batch: image 0 / question 0 repeated four times, one answer per row.
samples_new = dict(
    image=image_rand[[0, 0, 0, 0, 1]],
    text_input=["What is this?"] * 4 + ["What is that?"],
    answer=["cat"] * 4 + ["dog"],
    weight=torch.tensor(
        [w * 5 / 2 for w in (0.25, 0.25, 0.25, 0.25, 1.0)], device="cuda:0"
    ),
    n_answers=torch.tensor([1] * 5, device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output_new = model(samples_new)
print("output_new.loss: ", output_new.loss)

INFO - 2024-07-01 13:48:01,359 - base_model - Missing keys []
INFO - 2024-07-01 13:48:01,360 - base_model - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt


output.loss:  tensor(7.5849, device='cuda:0', grad_fn=<DivBackward0>)
output_new.loss:  tensor(7.5849, device='cuda:0', grad_fn=<DivBackward0>)


In [66]:
# Count element-wise mismatches between the re-stacked images and the original batch
# (0 means torch.stack reproduces `image_rand` exactly). A single tensor reduction
# replaces the four nested Python-level sum() calls.
(torch.stack([image_rand[0], image_rand[1]]) != image_rand).sum()

tensor(0, device='cuda:0')

# BLIP-2 (Flan-T5)

In [6]:
# BLIP-2 Flan-T5 VQA (eval mode): compact batch versus expanded batch, with the
# expanded weights rescaled by 3/2 (3 expanded rows / 2 questions).
set_random_seed(24, True)

image_rand = torch.rand(2, 3, 224, 224).to("cuda:0")
model = load_model("blip2_t5_vqa", model_type="pretrain_flant5xl").to("cuda:0")
model = model.eval()

# Compact batch: 2 questions, 3 answers (2 + 1).
samples = dict(
    image=image_rand,
    text_input=["What is this?", "What is that?"],
    answer=["cat", "dog", "dog"],
    weight=torch.tensor([0.5, 0.5, 1.0], device="cuda:0"),
    n_answers=torch.tensor([2, 1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output = model(samples)
print("output.loss: ", output["loss"])

# Expanded batch: image 0 / question 0 duplicated, one answer per row.
samples_new = dict(
    image=image_rand[[0, 0, 1]],
    text_input=["What is this?"] * 2 + ["What is that?"],
    answer=["cat", "dog", "dog"],
    weight=torch.tensor([w * 3 / 2 for w in (0.5, 0.5, 1.0)], device="cuda:0"),
    n_answers=torch.tensor([1] * 3, device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output_new = model(samples_new)
print("output_new.loss: ", output_new["loss"])

INFO - 2024-07-01 14:08:45,683 - blip2_t5_vqa - freeze vision encoder


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO - 2024-07-01 14:08:54,545 - blip2 - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth


output.loss:  tensor(6.3660, device='cuda:0', grad_fn=<NllLossBackward0>)
output_new.loss:  tensor(6.3660, device='cuda:0', grad_fn=<NllLossBackward0>)


In [3]:
# BLIP-2 Flan-T5 VQA wrapper on one image / one question / one answer.
set_random_seed(24, True)

# Two images are drawn, but only the first goes into the batch below.
image_rand = torch.rand(2, 3, 224, 224).to("cuda:0")

samples = dict(
    image=image_rand[[0]],
    text_input=["What is this?"],
    answer=["cat"],
    weight=torch.tensor([1], device="cuda:0"),
    n_answers=torch.tensor([1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)

model = load_model("blip2_t5_vqa", model_type="pretrain_flant5xl").to("cuda:0")
model = model.eval()
output = model(samples)
print("output.loss: ", output["loss"])

INFO - 2024-07-01 15:55:52,813 - blip2_t5_vqa - freeze vision encoder


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO - 2024-07-01 15:56:03,790 - blip2 - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth


output.loss:  tensor(5.9060, device='cuda:0', grad_fn=<NllLossBackward0>)


In [4]:
# Reference run with the plain "blip2_t5" model on the same single sample; here the
# answer is supplied under the "text_output" key. `image_rand` is not redrawn in
# this cell -- it must already exist from an earlier cell.
set_random_seed(24, True)

samples = dict(
    image=image_rand[[0]],
    text_input=["What is this?"],
    text_output=["cat"],
    weight=torch.tensor([1], device="cuda:0"),
    n_answers=torch.tensor([1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)

model_blip2_t5 = load_model("blip2_t5", model_type="pretrain_flant5xl").to("cuda:0")
model_blip2_t5 = model_blip2_t5.eval()
output_blip2_t5 = model_blip2_t5(samples)
print("output_blip2_t5.loss: ", output_blip2_t5["loss"])

INFO - 2024-07-01 15:56:30,011 - blip2_t5 - freeze vision encoder


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO - 2024-07-01 15:56:39,585 - blip2 - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth


output_blip2_t5.loss:  tensor(5.9062, device='cuda:0', grad_fn=<NllLossBackward0>)


# BLIP-2 (OPT)

In [3]:
# BLIP-2 OPT VQA (eval mode): compact batch versus expanded batch, with the
# expanded weights rescaled by 3/2 (3 expanded rows / 2 questions).
set_random_seed(24, True)

image_rand = torch.rand(2, 3, 224, 224).to("cuda:0")
model = load_model("blip2_opt_vqa", model_type="pretrain_opt2.7b").to("cuda:0")
model = model.eval()

# Compact batch: 2 questions, 3 answers (2 + 1).
samples = dict(
    image=image_rand,
    text_input=["What is this?", "What is that?"],
    answer=["cat", "dog", "dog"],
    weight=torch.tensor([0.5, 0.5, 1.0], device="cuda:0"),
    n_answers=torch.tensor([2, 1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output = model(samples)
print("output.loss: ", output["loss"])

# Expanded batch: image 0 / question 0 duplicated, one answer per row.
samples_new = dict(
    image=image_rand[[0, 0, 1]],
    text_input=["What is this?"] * 2 + ["What is that?"],
    answer=["cat", "dog", "dog"],
    weight=torch.tensor([w * 3 / 2 for w in (0.5, 0.5, 1.0)], device="cuda:0"),
    n_answers=torch.tensor([1] * 3, device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)
output_new = model(samples_new)
print("output_new.loss: ", output_new["loss"])

INFO - 2024-07-01 16:23:05,968 - blip2_opt_vqa - freeze vision encoder
INFO - 2024-07-01 16:23:10,665 - blip2 - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth


output.loss:  tensor(12.1484, device='cuda:0', grad_fn=<NllLossBackward0>)
output_new.loss:  tensor(12.1471, device='cuda:0', grad_fn=<NllLossBackward0>)


In [4]:
# BLIP-2 OPT VQA wrapper on one image / one question / one answer.
set_random_seed(24, True)

# Two images are drawn, but only the first goes into the batch below.
image_rand = torch.rand(2, 3, 224, 224).to("cuda:0")

samples = dict(
    image=image_rand[[0]],
    text_input=["What is this?"],
    answer=["cat"],
    weight=torch.tensor([1], device="cuda:0"),
    n_answers=torch.tensor([1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)

model = load_model("blip2_opt_vqa", model_type="pretrain_opt2.7b").to("cuda:0")
model = model.eval()
output = model(samples)
print("output.loss: ", output["loss"])

INFO - 2024-07-01 16:03:44,197 - blip2_opt_vqa - freeze vision encoder
INFO - 2024-07-01 16:03:48,855 - blip2 - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth


output.loss:  tensor(9.8770, device='cuda:0', grad_fn=<NllLossBackward0>)


In [4]:
# Reference run with the plain "blip2_opt" model on the same kind of single sample;
# here the answer is supplied under the "text_output" key.
set_random_seed(24, True)

image_rand = torch.rand(2, 3, 224, 224).to("cuda:0")

samples = dict(
    image=image_rand[[0]],
    text_input=["What is this?"],
    text_output=["cat"],
    weight=torch.tensor([1], device="cuda:0"),
    n_answers=torch.tensor([1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)

model_blip2_opt = load_model("blip2_opt", model_type="pretrain_opt2.7b").to("cuda:0")
model_blip2_opt = model_blip2_opt.eval()
output_blip2_opt = model_blip2_opt(samples)
print("output_blip2_opt.loss: ", output_blip2_opt["loss"])

INFO - 2024-07-01 16:06:54,926 - blip2_opt - freeze vision encoder
INFO - 2024-07-01 16:06:59,216 - blip2 - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth


output_blip2_opt.loss:  tensor(3.3452, device='cuda:0', grad_fn=<NllLossBackward0>)


# PaliGemma

In [3]:
# PaliGemma VQA wrapper (eval mode, GPU) on one image / one question / one answer.
set_random_seed(24, True)

# Two images are drawn, but only the first goes into the batch below.
image_rand = torch.rand(2, 3, 224, 224).to("cuda:0")

samples = dict(
    image=image_rand[[0]],
    text_input=["What is this?"],
    answer=["cat"],
    weight=torch.tensor([1], device="cuda:0"),
    n_answers=torch.tensor([1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)

model = load_model("paligemma_vqa", model_type="paligemma-3b-ft-vqav2-448").to("cuda:0")
model = model.eval()
output = model(samples)
print("output.loss: ", output["loss"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

questions ['What is this?']
answers ['cat']
weight tensor([1], device='cuda:0')
n_answers tensor([1], device='cuda:0')
output.loss:  tensor(3.6730, device='cuda:0', grad_fn=<NllLossBackward0>)


In [4]:
# Object stored under the "outputs" key of the model's return value.
output["outputs"]

PaliGemmaCausalLMOutputWithPast(loss=tensor(3.6730, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[[-1.5483, 10.9163, -8.9859,  ..., -2.1964, -2.2345, -2.2782],
         [-7.8914, 12.2948, -8.2089,  ..., -5.5079, -5.5658, -5.6053],
         [-9.0410, 11.1091,  1.7776,  ..., -6.8556, -6.9023, -6.9447],
         ...,
         [ 4.0500, 13.8408,  3.0179,  ...,  3.9606,  3.8709,  3.8631],
         [-8.9864, 16.6010, -6.0495,  ..., -6.8220, -6.8987, -6.9506],
         [-7.3626, 16.0273, -6.2692,  ..., -6.3612, -6.4248, -6.4581]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 2.7882e+00,  4.7940e+00,  5.4377e+00,  ..., -2.5283e+00,
           -1.5273e+00,  2.1634e+00],
          [ 2.5082e+00,  1.1873e-01, -1.6473e-03,  ..., -3.5429e+00,
           -2.4204e+00,  7.6921e-01],
          [ 9.7635e-01, -1.6019e+00, -1.5272e+00,  ..., -3.6681e+00,
           -2.1308e+00, -3.5332e-01],
          ...,
          [-4.9001e+00, -6.8408e-01,  2.297

In [14]:
# Run the model's processor on question, image and answer (passed as `suffix`),
# requesting PyTorch tensors, and display the result.
all_proc = model.processor(
    text=samples["text_input"],
    images=samples["image"],
    suffix=samples["answer"],
    return_tensors="pt",
)
all_proc

{'input_ids': tensor([[257152, 257152, 257152,  ...,    108,   4991,      1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 1, 1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]]), 'pixel_values': tensor([[[[-0.9939, -0.9947, -0.9964,  ..., -0.9976, -0.9985, -0.9990],
          [-0.9933, -0.9941, -0.9958,  ..., -0.9974, -0.9987, -0.9993],
          [-0.9922, -0.9928, -0.9943,  ..., -0.9970, -0.9990, -0.9999],
          ...,
          [-0.9994, -0.9993, -0.9993,  ..., -0.9972, -0.9955, -0.9947],
          [-0.9981, -0.9982, -0.9984,  ..., -0.9961, -0.9944, -0.9936],
          [-0.9974, -0.9976, -0.9980,  ..., -0.9956, -0.9940, -0.9931]],

         [[-0.9952, -0.9962, -0.9985,  ..., -0.9966, -0.9945, -0.9935],
          [-0.9957, -0.9965, -0.9983,  ..., -0.9964, -0.9952, -0.9947],
          [-0.9968, -0.9972, -0.9980,  ..., -0.9958, -0.9968, -0.9972],
          ...,
          [-0.9987, -0.9977, -0.9945,  ..., -0.9949, -0.9965, -0.9972],
          [-0.9945, -0.9954, -0.9970,  ..., 

In [15]:
# Same processor call on a batch made of the first image, question and answer
# repeated twice.
model.processor(
    text=samples["text_input"] * 2,
    images=image_rand[[0, 0]],
    suffix=samples["answer"] * 2,
    return_tensors="pt",
)

{'input_ids': tensor([[257152, 257152, 257152,  ...,    108,   4991,      1],
        [257152, 257152, 257152,  ...,    108,   4991,      1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 1, 1],
        [0, 0, 0,  ..., 0, 1, 1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'pixel_values': tensor([[[[-0.9939, -0.9947, -0.9964,  ..., -0.9976, -0.9985, -0.9990],
          [-0.9933, -0.9941, -0.9958,  ..., -0.9974, -0.9987, -0.9993],
          [-0.9922, -0.9928, -0.9943,  ..., -0.9970, -0.9990, -0.9999],
          ...,
          [-0.9994, -0.9993, -0.9993,  ..., -0.9972, -0.9955, -0.9947],
          [-0.9981, -0.9982, -0.9984,  ..., -0.9961, -0.9944, -0.9936],
          [-0.9974, -0.9976, -0.9980,  ..., -0.9956, -0.9940, -0.9931]],

         [[-0.9952, -0.9962, -0.9985,  ..., -0.9966, -0.9945, -0.9935],
          [-0.9957, -0.9965, -0.9983,  ..., -0.9964, -0.9952, -0.9947],
          [-0.9968, -0.9972, -0.9980,  ..., -0.9958, -0.9968, -0.9972],

In [13]:
# Processor call without an answer suffix: only question and image are encoded.
model.processor(
    images=samples["image"],
    text=samples["text_input"],
    return_tensors="pt",
)

{'input_ids': tensor([[257152, 257152, 257152,  ...,    736, 235336,    108]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]]), 'pixel_values': tensor([[[[-0.9939, -0.9947, -0.9964,  ..., -0.9976, -0.9985, -0.9990],
          [-0.9933, -0.9941, -0.9958,  ..., -0.9974, -0.9987, -0.9993],
          [-0.9922, -0.9928, -0.9943,  ..., -0.9970, -0.9990, -0.9999],
          ...,
          [-0.9994, -0.9993, -0.9993,  ..., -0.9972, -0.9955, -0.9947],
          [-0.9981, -0.9982, -0.9984,  ..., -0.9961, -0.9944, -0.9936],
          [-0.9974, -0.9976, -0.9980,  ..., -0.9956, -0.9940, -0.9931]],

         [[-0.9952, -0.9962, -0.9985,  ..., -0.9966, -0.9945, -0.9935],
          [-0.9957, -0.9965, -0.9983,  ..., -0.9964, -0.9952, -0.9947],
          [-0.9968, -0.9972, -0.9980,  ..., -0.9958, -0.9968, -0.9972],
          ...,
          [-0.9987, -0.9977, -0.9945,  ..., -0.9949, -0.9965, -0.9972],
          [-0.9945, -0.9954, -0.9970,  ..., -0.9948, -0.9958, -0.9963],
          [-0.9925, -0.994

In [8]:
# PaliGemma VQA wrapper on the CPU (nothing is moved to a GPU in this cell):
# compact batch with 2 questions and 3 answers (2 + 1).
set_random_seed(24, True)

image_rand = torch.rand(2, 3, 64, 64)

samples = dict(
    image=image_rand,
    text_input=["What is this?", "What is that?"],
    answer=["cat", "dog", "dog"],
    weight=torch.tensor([0.5, 0.5, 1.0]),
    n_answers=torch.tensor([2, 1]),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)

model = load_model("paligemma_vqa", model_type="paligemma-3b-ft-vqav2-448")
model = model.eval()
output = model(samples)
print("output.loss: ", output["loss"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

output.loss:  tensor(3.7783, grad_fn=<NllLossBackward0>)


In [3]:
# PaliGemma VQA wrapper on the GPU: compact batch with 2 questions and 3 answers.
# The recorded run of this cell ended in a CUDA out-of-memory error. Only the loss
# value is inspected (no backward pass), so the forward pass runs under
# torch.no_grad() to avoid keeping activations alive for autograd.
# NOTE(review): the printed loss will therefore carry no grad_fn.
set_random_seed(24, True)

# Inputs are created on the CPU here; only the model is moved to the GPU.
image_rand = torch.rand(2, 3, 64, 64)

samples = {
    "image": image_rand,
    "text_input": ["What is this?", "What is that?"],
    "answer": ["cat", "dog", "dog"],
    "weight": torch.tensor([0.5, 0.5, 1.0]),
    "n_answers": torch.tensor([2, 1]),
    "epoch": 0, "iters": 0, "num_iters_per_epoch": 1000,
}

model = load_model("paligemma_vqa", model_type="paligemma-3b-ft-vqav2-448").to("cuda:0").eval()
with torch.no_grad():
    output = model(samples)
print("output.loss: ", output["loss"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.97 GiB. GPU 0 has a total capacity of 44.38 GiB of which 2.47 GiB is free. Including non-PyTorch memory, this process has 41.91 GiB memory in use. Of the allocated memory 39.80 GiB is allocated by PyTorch, and 532.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [7]:
# PaliGemma VQA wrapper on the CPU: 2 questions with exactly one answer each and
# unit weights.
set_random_seed(24, True)

image_rand = torch.rand(2, 3, 64, 64)

samples = dict(
    image=image_rand,
    text_input=["What is this?", "What is that?"],
    answer=["cat", "dog"],
    weight=torch.tensor([1.0] * 2),
    n_answers=torch.tensor([1] * 2),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)

model = load_model("paligemma_vqa", model_type="paligemma-3b-ft-vqav2-448")
model = model.eval()
output = model(samples)
print("output.loss: ", output["loss"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

output.loss:  tensor(3.6914, grad_fn=<NllLossBackward0>)


In [3]:
# PaliGemma VQA wrapper with the model on the GPU: 2 questions with exactly one
# answer each and unit weights. The input tensors are created on the CPU.
set_random_seed(24, True)

image_rand = torch.rand(2, 3, 64, 64)

samples = dict(
    image=image_rand,
    text_input=["What is this?", "What is that?"],
    answer=["cat", "dog"],
    weight=torch.tensor([1.0] * 2),
    n_answers=torch.tensor([1] * 2),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)

model = load_model("paligemma_vqa", model_type="paligemma-3b-ft-vqav2-448").to("cuda:0")
model = model.eval()
output = model(samples)
print("output.loss: ", output["loss"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

output.loss:  tensor(3.6914, device='cuda:0', grad_fn=<NllLossBackward0>)
output.loss:  tensor(3.6914, device='cuda:0', grad_fn=<NllLossBackward0>)
output.loss:  tensor(3.6914, device='cuda:0', grad_fn=<NllLossBackward0>)
output.loss:  tensor(3.6914, device='cuda:0', grad_fn=<NllLossBackward0>)
output.loss:  tensor(3.6914, device='cuda:0', grad_fn=<NllLossBackward0>)
output.loss:  tensor(3.6914, device='cuda:0', grad_fn=<NllLossBackward0>)


In [3]:
# PaliGemma VQA wrapper (eval mode, GPU) on one image / one question / one answer.
set_random_seed(24, True)

# Two images are drawn, but only the first goes into the batch below.
image_rand = torch.rand(2, 3, 224, 224).to("cuda:0")

samples = dict(
    image=image_rand[[0]],
    text_input=["What is this?"],
    answer=["cat"],
    weight=torch.tensor([1], device="cuda:0"),
    n_answers=torch.tensor([1], device="cuda:0"),
    epoch=0,
    iters=0,
    num_iters_per_epoch=1000,
)

model = load_model("paligemma_vqa", model_type="paligemma-3b-ft-vqav2-448").to("cuda:0")
model = model.eval()
output = model(samples)
print("output.loss: ", output["loss"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

output.loss:  tensor(3.6730, device='cuda:0', grad_fn=<NllLossBackward0>)


In [5]:
# Shape of the image batch currently stored in `samples`.
# NOTE(review): the recorded output (2, 3, 64, 64) matches the two-image 64x64
# batch, not the single 224x224 image built in the cell directly above, so this
# was evidently run against an earlier `samples` — re-run the cells in order to confirm.
samples["image"].shape

torch.Size([2, 3, 64, 64])

In [7]:
# Question strings of the batch currently stored in `samples`.
# NOTE(review): the recorded output lists two questions, i.e. the two-sample
# batch rather than the one-sample batch defined in the nearest cell above.
samples["text_input"]

['What is this?', 'What is that?']

In [10]:
# Run the model's processor on the questions and images only (no answers)
# and move the resulting batch to the first GPU.
img_txt = model.processor(
    text=samples["text_input"],
    images=samples["image"],
).to("cuda:0")

# Display the returned keys followed by the shape of each returned tensor.
(
    img_txt.keys(),
    *(img_txt[field].shape for field in ('input_ids', 'attention_mask', 'pixel_values')),
)

(dict_keys(['input_ids', 'attention_mask', 'pixel_values']),
 torch.Size([2, 1030]),
 torch.Size([2, 1030]),
 torch.Size([2, 3, 448, 448]))

In [11]:
# Same processor call as for `img_txt`, but with the ground-truth answers passed
# as `suffix`; the recorded output shows that this adds `token_type_ids` and
# `labels` to the returned batch.
# The answers are read from `samples` rather than hard-coded so that the number
# of suffixes always matches the number of questions/images in the batch.
img_txt_ans = model.processor(
    text=samples["text_input"],
    images=samples["image"],
    suffix=samples["answer"],
).to("cuda:0")

# Display the returned keys followed by the shapes of the main tensors.
img_txt_ans.keys(), img_txt_ans['input_ids'].shape, img_txt_ans['attention_mask'].shape, img_txt_ans['pixel_values'].shape

(dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'labels']),
 torch.Size([2, 1032]),
 torch.Size([2, 1032]),
 torch.Size([2, 3, 448, 448]))

In [13]:
# Show the token-type ids next to the labels produced when answers are supplied.
tuple(img_txt_ans[field] for field in ('token_type_ids', 'labels'))

(tensor([[0, 0, 0,  ..., 0, 1, 1],
         [0, 0, 0,  ..., 0, 1, 1]], device='cuda:0'),
 tensor([[ -100,  -100,  -100,  ...,  -100,  4991,     1],
         [ -100,  -100,  -100,  ...,  -100, 12240,     1]], device='cuda:0'))

In [12]:
# Compare the token ids without answers (first) and with answers (second).
tuple(batch['input_ids'] for batch in (img_txt, img_txt_ans))

(tensor([[257152, 257152, 257152,  ...,    736, 235336,    108],
         [257152, 257152, 257152,  ...,    674, 235336,    108]],
        device='cuda:0'),
 tensor([[257152, 257152, 257152,  ...,    108,   4991,      1],
         [257152, 257152, 257152,  ...,    108,  12240,      1]],
        device='cuda:0'))

In [14]:
# Compare the attention masks without answers (first) and with answers (second).
tuple(batch['attention_mask'] for batch in (img_txt, img_txt_ans))

(tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'),
 tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'))

In [18]:
# Count the positions at which the pixel tensors of the two processor outputs
# differ; 0 means the images were preprocessed identically with and without
# answers. A single tensor reduction replaces the nested builtin sum() calls,
# which walked the tensor in Python one axis at a time.
(img_txt['pixel_values'] != img_txt_ans['pixel_values']).sum()

tensor(0, device='cuda:0')

In [4]:
# IPython shell escape: run `nvidia-smi` to print the GPU status table
# (memory in use, utilisation, running processes).
# NOTE(review): the recorded output also carries huggingface/tokenizers fork
# warnings; the warning text itself recommends setting TOKENIZERS_PARALLELISM
# explicitly — presumably before the tokenizer is first used; confirm.
!nvidia-smi

Tue Jul  2 14:39:29 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06   Driver Version: 525.125.06   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A40          On   | 00000000:01:00.0 Off |                    0 |
|  0%   30C    P0    72W / 300W |  39228MiB / 46068MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
from collections import Counter


def most_frequent(items):
    """Return the element that occurs most often in ``items``.

    Counting is done in a single pass. When several elements share the
    highest count, the one that appears first in ``items`` is returned, so
    the result does not depend on set/hash ordering.

    Args:
        items: Iterable of hashable values (e.g. answer strings).

    Returns:
        The most frequent element.

    Raises:
        ValueError: If ``items`` is empty.
    """
    counts = Counter(items)
    if not counts:
        # Same exception type the builtin max() raises on an empty input.
        raise ValueError("most_frequent() arg is an empty sequence")
    return counts.most_common(1)[0][0]


a = ["cat", "cat", "dog", "dog", "dog"]
most_frequent(a)