In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration, AutoProcessor, Blip2ForImageTextRetrieval
from dataset import COCODataset
from awq.quantizer import Blip2ForConditionalGenerationAWQQuantizer
from inference_pipeline import InferencePipeline

import time

In [3]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
device

device(type='cuda')

## AWQ Blip-2 Caption Generation

In [4]:
# SETUP
model_name = "Salesforce/blip2-opt-2.7b"
model = Blip2ForConditionalGeneration.from_pretrained(model_name)
model.to(device)

processor = Blip2Processor.from_pretrained(model_name)

# NOTE: set paths as appropriate
# Will sample n_samples from dataset to create calibration set
coco_dataset = COCODataset(ann_file='/nfshomes/vla/project_dirs/low-bit-vision/datasets/cocow/annotations/captions_val2017.json',
                           img_dir='/nfshomes/vla/project_dirs/low-bit-vision/datasets/cocow/images/val2017')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [6]:
pipeline = InferencePipeline(model, device, processor)
results = pipeline.run_inference(coco_dataset, task = 'image_captioning', max_samples = 1)
results

  0%|          | 0/1 [00:00<?, ?it/s]Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


{'predictions': [{'image_id': 397133,
   'caption': 'a woman in a kitchen with a man in a kitchen'}],
 'references': [['A man is in a kitchen making pizzas.',
   'Man in apron standing on front of oven with pans and bakeware',
   'A baker is working in the kitchen rolling dough.',
   'A person standing by a stove in a kitchen.',
   'A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.']]}

In [7]:
# sample config, load from JSON or smth
# model_part: bit_width

config = {}
config['vit_layers'] = {
    'self_attn': 4,
    'self_attn_output':4,
    'fc1': 4,
    'fc2': 4,
}

# config['qformer_layers'] = {
#     'self_attn': 4,
#     'self_attn_output':4,
#     'intermediate_query':4,
#     'output_query': 4,
#     'cross_attn': 4,
#     'cross_attn_output': 4
# }

config['llm_layers'] = {
    'self_attn': 4,
    'self_attn_output':4,
    'fc1':4,
    'fc2':4
}

In [8]:
# Apply AWQ
quantizer = Blip2ForConditionalGenerationAWQQuantizer(model, device, processor, coco_dataset, config)

start = time.time()
quantizer.quantize()
print(f'Quantization time: {time.time() - start:.2f} seconds')

Quantizing vit_layers: 100%|██████████| 39/39 [05:08<00:00,  7.90s/it]
Quantizing llm_layers: 100%|██████████| 32/32 [05:30<00:00, 10.34s/it]

Quantization time: 647.47 seconds





In [9]:
# NOTE:make sure to move model back to device, quantizing moves layers around to save memory 
model.to(device)
pipeline = InferencePipeline(model, device, processor)
results = pipeline.run_inference(coco_dataset, task = 'image_captioning', max_samples = 1)
results

100%|██████████| 1/1 [00:00<00:00,  2.41it/s]


{'predictions': [{'image_id': 397133,
   'caption': 'a woman standing in a kitchen with pots and pans on the counter'}],
 'references': [['A man is in a kitchen making pizzas.',
   'Man in apron standing on front of oven with pans and bakeware',
   'A baker is working in the kitchen rolling dough.',
   'A person standing by a stove in a kitchen.',
   'A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.']]}

In [13]:
# NOTE: below ran quantizing all parts to 4-bit, i.e config['qformer_layers'] also provided
# NOTE: full AWQ quantization at 4 bits still seems to degrade captions to garbage


config = {}
config['vit_layers'] = {
    'self_attn': 4,
    'self_attn_output':4,
    'fc1': 4,
    'fc2': 4,
}

config['qformer_layers'] = {
    'self_attn': 4,
    'self_attn_output':4,
    'intermediate_query':4,
    'output_query': 4,
    'cross_attn': 4,
    'cross_attn_output': 4
}

config['llm_layers'] = {
    'self_attn': 4,
    'self_attn_output':4,
    'fc1':4,
    'fc2':4
}

# Apply AWQ
quantizer = Blip2ForConditionalGenerationAWQQuantizer(model, device, processor, coco_dataset, config)

start = time.time()
quantizer.quantize()
print(f'Quantization time: {time.time() - start:.2f} seconds')

Quantizing vit_layers: 100%|██████████| 39/39 [05:02<00:00,  7.77s/it]
Quantizing qformer_layers: 100%|██████████| 12/12 [00:19<00:00,  1.58s/it]
Quantizing llm_layers: 100%|██████████| 32/32 [05:28<00:00, 10.26s/it]

Quantization time: 658.00 seconds





In [15]:
# NOTE:make sure to move model back to device, quantizing moves layers around to save memory 
model.to(device)
pipeline = InferencePipeline(model, device, processor)
results = pipeline.run_inference(coco_dataset, task = 'image_captioning', max_samples = 1)
results

100%|██████████| 1/1 [00:00<00:00,  3.87it/s]


{'predictions': [{'image_id': 397133, 'caption': 'K, and thes.'}],
 'references': [['A man is in a kitchen making pizzas.',
   'Man in apron standing on front of oven with pans and bakeware',
   'A baker is working in the kitchen rolling dough.',
   'A person standing by a stove in a kitchen.',
   'A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.']]}

## AWQ Blip-2 Info-Retrieval

In [4]:
model_name = "Salesforce/blip2-itm-vit-g"
model = Blip2ForImageTextRetrieval.from_pretrained(model_name, torch_dtype=torch.float16)
model.to(device)
processor = AutoProcessor.from_pretrained(model_name)

In [5]:
model

Blip2ForImageTextRetrieval(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((140

In [6]:
config = {}
config['vit_layers'] = {
    'self_attn': 4,
    'self_attn_output':4,
    'fc1': 4,
    'fc2': 4,
}

config['qformer_layers'] = {
    'self_attn': 4,
    'self_attn_output':4,
    'intermediate_txt': 4,
    'output_txt': 4,
    'intermediate_query':4,
    'output_query': 4,
    'cross_attn': 4,
    'cross_attn_output': 4,
    # 'vision_proj':4,
    # 'txt_proj':4,
    # 'itm_head': 4,
}

In [7]:
from dataset import Flickr30kEvalDataset
import torchvision.transforms as transforms
# import torchvision.transforms.InterpolationMode as InterpolationMode


ann_file = '/nfshomes/vla/project_dirs/low-bit-vision/datasets/flickr30k/annotations/test.json'
img_dir = '/nfshomes/vla/project_dirs/low-bit-vision/datasets/flickr30k/images/flickr30k-images'


img_transform = transforms.Compose(
    [
        transforms.Resize(
            (224, 224), interpolation=transforms.InterpolationMode.BICUBIC
        ),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ]
)

img_transform


Compose(
    Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True)
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [8]:
flickr_dataset = Flickr30kEvalDataset(ann_file, img_dir, img_transform=None)

In [9]:
w = model.qformer.encoder.layer[10].output.dense.weight

In [24]:
with torch.no_grad():
    w = w.to(torch.float64)

In [16]:
w

Parameter containing:
tensor([[-7.1526e-07,  7.7486e-07, -5.9605e-08,  ..., -1.9073e-06,
         -1.7881e-07,  8.9407e-07],
        [ 4.5300e-06,  2.3246e-06,  3.3975e-06,  ..., -7.7486e-07,
          1.1921e-07,  2.8610e-06],
        [ 4.7684e-07,  4.1127e-06, -3.9339e-06,  ...,  2.2650e-06,
          3.2783e-06,  3.9339e-06],
        ...,
        [ 6.5565e-07,  1.1921e-06,  5.4240e-06,  ...,  2.0862e-06,
         -1.4305e-06, -2.3842e-07],
        [ 1.8477e-06, -1.7285e-06, -4.7684e-07,  ..., -6.4969e-06,
         -4.5300e-06, -5.3644e-07],
        [ 2.3842e-06,  1.5497e-06, -2.2650e-06,  ..., -3.3975e-06,
          2.1458e-06,  2.0862e-06]], device='cuda:0', dtype=torch.float16,
       requires_grad=True)

In [17]:
w.shape

torch.Size([768, 3072])

In [18]:
assert torch.isnan(w).sum() == 0

In [19]:
assert torch.isinf(w).sum() == 0

AssertionError: 

In [28]:
w_bits = 4

In [12]:
max_val = w.amax(dim=1, keepdim=True)
assert torch.isnan(max_val).sum() == 0
assert torch.isinf(max_val).sum() == 0
max_val

tensor([[4.1699e-01],
        [1.7590e-01],
        [2.1228e-01],
        [3.5062e+01],
        [2.0962e-03],
        [1.7266e+01],
        [6.1812e+01],
        [1.3750e+02],
        [4.8950e-01],
        [5.3101e-02],
        [1.6438e+02],
        [2.3812e+02],
        [4.5654e-01],
        [1.4162e+02],
        [1.7925e+02],
        [2.1050e+02],
        [3.4657e-03],
        [1.3685e-03],
        [1.9250e-01],
        [3.6963e-01],
        [4.5074e-02],
        [3.2623e-02],
        [1.9934e-01],
        [3.6438e+01],
        [2.3315e-01],
        [8.0234e+00],
        [3.3836e-03],
        [2.2078e+01],
        [5.6738e-01],
        [8.4562e+01],
        [8.9438e+01],
        [2.2575e+02],
        [1.1086e-05],
        [3.3179e-01],
        [4.2844e+01],
        [5.7500e+01],
        [8.4521e-01],
        [1.1493e-01],
        [1.4550e+02],
        [8.1484e+00],
        [1.1981e-04],
        [9.8944e-06],
        [2.4255e-01],
        [7.6580e-04],
        [1.0012e+02],
        [1

In [35]:
min_val = w.amin(dim=1, keepdim=True)
assert torch.isnan(min_val).sum() == 0
min_val


tensor([[-2.6925e+02],
        [-1.1388e+02],
        [-9.7688e+01],
        [-1.5503e-01],
        [-8.5688e+01],
        [-5.3070e-02],
        [-5.9296e-02],
        [-1.2976e-01],
        [-4.3150e+02],
        [-4.3550e+02],
        [-7.1478e-04],
        [-6.6345e-02],
        [-2.2938e+02],
        [-3.0727e-03],
        [-5.8556e-04],
        [-1.8225e-01],
        [-8.7125e+01],
        [-1.9650e+02],
        [-1.9225e+02],
        [-6.4550e+02],
        [-9.2562e+01],
        [-1.1169e+02],
        [-3.3062e+01],
        [-8.8074e-02],
        [-1.1106e+02],
        [-6.6614e-04],
        [-1.0244e+02],
        [-8.0347e-04],
        [-1.9888e+02],
        [-7.8918e-02],
        [-7.2632e-02],
        [-6.4148e-02],
        [-5.7469e+01],
        [-1.3888e+02],
        [-1.1568e-03],
        [-5.5313e-04],
        [-6.5375e+01],
        [-2.4625e+02],
        [-8.0872e-04],
        [-1.4524e-03],
        [-3.8625e+02],
        [-1.8012e+02],
        [-9.2062e+01],
        [-2

In [46]:
torch.isinf(torch.tensor([float('-inf')]))

tensor([True])

In [31]:
max_int = 2**w_bits - 1
min_int = 0
scales = (max_val - min_val).clamp(min=1e-5) / max_int

assert torch.isnan(scales).sum() == 0

In [41]:
test = min_val / scales
test

tensor([[-1.4996e+01],
        [-1.4996e+01],
        [-1.4994e+01],
        [-3.9710e-02],
        [-1.5000e+01],
        [-2.7638e-02],
        [-8.6356e-03],
        [-8.4994e-03],
        [-1.4997e+01],
        [-1.5000e+01],
        [-3.9166e-05],
        [-2.5095e-03],
        [-1.4995e+01],
        [-1.9540e-04],
        [-2.9425e-05],
        [-7.7956e-03],
        [-1.5000e+01],
        [-1.5000e+01],
        [-1.4997e+01],
        [-1.4998e+01],
        [-1.4999e+01],
        [-1.4999e+01],
        [-1.4984e+01],
        [-2.1749e-02],
        [-1.4994e+01],
        [-7.4835e-04],
        [-1.5000e+01],
        [-3.2794e-04],
        [-1.4992e+01],
        [-8.4058e-03],
        [-7.3084e-03],
        [-2.5604e-03],
        [-1.5000e+01],
        [-1.4993e+01],
        [-2.4332e-04],
        [-8.6652e-05],
        [-1.4965e+01],
        [-1.4999e+01],
        [-5.0101e-05],
        [-1.6053e-03],
        [-1.5000e+01],
        [-1.5000e+01],
        [-1.4993e+01],
        [-1

In [39]:
torch.round(min_val / scales)

tensor([[-15.],
        [-15.],
        [-15.],
        [ -0.],
        [-15.],
        [ -0.],
        [ -0.],
        [ -0.],
        [-15.],
        [-15.],
        [ -0.],
        [ -0.],
        [-15.],
        [ -0.],
        [ -0.],
        [ -0.],
        [-15.],
        [-15.],
        [-15.],
        [-15.],
        [-15.],
        [-15.],
        [-15.],
        [ -0.],
        [-15.],
        [ -0.],
        [-15.],
        [ -0.],
        [-15.],
        [ -0.],
        [ -0.],
        [ -0.],
        [-15.],
        [-15.],
        [ -0.],
        [ -0.],
        [-15.],
        [-15.],
        [ -0.],
        [ -0.],
        [-15.],
        [-15.],
        [-15.],
        [-15.],
        [ -0.],
        [ -0.],
        [ -0.],
        [ -0.],
        [ -0.],
        [ -0.],
        [-15.],
        [ -0.],
        [ -0.],
        [-15.],
        [-15.],
        [ -0.],
        [-15.],
        [ -0.],
        [-15.],
        [ -0.],
        [-15.],
        [ -0.],
        

In [36]:
zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int)

assert torch.isnan(zeros).sum() == 0


AssertionError: 

In [16]:
q_w = (
    torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - zeros
) * scales

In [17]:
madge = q_w.unique()
madge

tensor([-3.1934, -2.0527, -1.5742,  ...,  0.4292,  0.4929,  0.5723],
       device='cuda:0', dtype=torch.float16, grad_fn=<Unique2Backward0>)

In [18]:
assert torch.isnan(q_w).sum() == 0

In [17]:
torch.tensor([1,2,3]).data.to(torch.float32)

tensor([1., 2., 3.])

In [10]:
from awq.quantizer import Blip2ForImageTextRetrievalAWQQuantizer
quantizer = Blip2ForImageTextRetrievalAWQQuantizer(model, device, processor, flickr_dataset, config)

In [11]:
quantizer.quantize()

Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Quantizing vit_layers:   0%|          | 0/39 [00:00<?, ?it/s]
Quantizing qformer_layers: 100%|██████████| 12/12 [00:32<00:00,  2.70s/it]


In [12]:
model

Blip2ForImageTextRetrieval(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((140

In [29]:
model.to(device)
flickr_dataset.img_transform = img_transform

pipeline = InferencePipeline(model, device, processor)
results = pipeline.run_inference(flickr_dataset, task = 'image_text_retrieval')
results



Getting text embeddings


100%|██████████| 1250/1250 [00:11<00:00, 104.18it/s]


Getting image embeddings


100%|██████████| 1000/1000 [00:53<00:00, 18.58it/s]


Calculating i2t score matrix


100%|██████████| 1000/1000 [01:14<00:00, 13.38it/s]


Calculating t2i score matrix


100%|██████████| 5000/5000 [09:17<00:00,  8.97it/s]


{'scores_i2t': array([[-100., -100.,   nan, ..., -100., -100., -100.],
        [-100., -100.,   nan, ..., -100., -100., -100.],
        [-100., -100.,   nan, ..., -100., -100., -100.],
        ...,
        [-100., -100.,   nan, ..., -100., -100., -100.],
        [-100., -100.,   nan, ..., -100., -100., -100.],
        [-100., -100.,   nan, ..., -100., -100., -100.]], dtype=float32),
 'scores_t2i': array([[-100.        , -100.        , -100.        , ..., -100.        ,
         -100.        , -100.        ],
        [-100.        , -100.        , -100.        , ..., -100.        ,
         -100.        , -100.        ],
        [          nan,           nan,           nan, ..., -100.        ,
         -100.        , -100.        ],
        ...,
        [-100.        , -100.        , -100.        , ..., -100.        ,
         -100.        , -100.        ],
        [-100.        , -100.        , -100.        , ...,    0.98516846,
            1.3724365 , -100.        ],
        [-100.   

In [30]:
from scoring_pipeline import ScoringPipeline

scoring_pipeline = ScoringPipeline()

Adding current path to python system paths


In [31]:
scoring_pipeline._compute_retrieval_scores(results)

{'txt_r1': 0.1,
 'txt_r5': 0.5,
 'txt_r10': 0.8,
 'txt_r_mean': 0.4666666666666666,
 'img_r1': 0.02,
 'img_r5': 0.12,
 'img_r10': 0.36,
 'img_r_mean': 0.16666666666666666,
 'r_mean': 0.31666666666666665,
 'agg_metrics': 0.4666666666666666}