In [1]:
import os

# Point both proxy environment variables at the same proxy endpoint.
os.environ.update(
    {
        "http_proxy": "http://192.168.1.13:20171",
        "https_proxy": "http://192.168.1.13:20171",
    }
)


In [2]:
from PIL import Image
from transformers import BlipProcessor, BlipModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
import numpy as np
import random

def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds PyTorch (CPU and CUDA), NumPy's global generator and Python's
    ``random`` module, then configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed shared by all generators.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True  # for reproducibility
    # Auto-tuning may select a different convolution algorithm from run to run,
    # so it has to be switched off as well for repeatable results.
    torch.backends.cudnn.benchmark = False

# Set the seed to fix the projection layer weights
set_seed(42)

In [4]:
# Load the processor first, then the model, both from the same checkpoint.
processor, model = (
    loader.from_pretrained("Salesforce/blip-vqa-base")
    for loader in (BlipProcessor, BlipModel)
)

Some weights of BlipModel were not initialized from the model checkpoint at Salesforce/blip-vqa-base and are newly initialized: ['logit_scale', 'text_model.embeddings.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.self.key.bias', 'text_model.encoder.layer.0.attention.self.key.weight', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_model.encoder.layer.0.attention.self.value.bias', 'text_model.encoder.layer.0.attention.self.value.weight', 'text_model.encoder.layer.0.crossattention.output.LayerNorm.bias', 'text_model.encoder.layer

In [5]:
# Reference run: preprocess the demo image with the Hugging Face processor and
# extract image features with the PyTorch model.
raw_image = Image.open("demo.png")
inputs = processor(raw_image, return_tensors="pt")
# Inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    features = model.get_image_features(inputs['pixel_values'])
print(f"inputs['pixel_values']: {inputs['pixel_values'].shape}")
print(features.shape)
print(features)

inputs['pixel_values']: torch.Size([1, 3, 384, 384])
torch.Size([1, 512])
tensor([[ 5.4370e-01, -1.0282e-01, -4.2045e-01, -7.7390e-03, -7.3555e-02,
         -7.7148e-02, -4.5607e-02,  3.9742e-01,  1.7652e-01, -2.0652e-01,
          2.7597e-01, -4.1239e-02,  3.3221e-01,  1.5150e-01, -6.3844e-03,
         -1.3276e-01, -1.0011e-01,  1.5942e-01, -6.2723e-02, -3.5384e-01,
         -1.2362e-01,  6.3965e-01,  4.1510e-01, -9.9639e-01,  2.9846e-01,
          2.3616e-01, -5.8012e-01,  8.0132e-01, -3.5859e-01, -1.0994e-01,
         -3.1600e-01, -6.7240e-01, -6.9920e-01, -7.9811e-02,  1.4866e-02,
         -1.4781e-02,  1.6229e-01,  3.8690e-01,  1.6235e-02, -2.8352e-01,
         -3.0121e-01, -2.0102e-01,  1.5626e-01, -1.7362e-01, -2.3560e-01,
         -1.4569e-01,  3.1506e-01,  6.9732e-01,  3.4207e-02,  9.4353e-02,
          5.4746e-01,  1.3270e-01, -1.1987e-01, -2.6232e-01, -2.3362e-01,
         -5.1732e-01,  3.0591e-01,  5.6168e-01, -5.5495e-01,  1.9668e-02,
         -1.5082e-01,  4.3294e-01,  3.

In [6]:
import torch
from pathlib import Path
import openvino as ov

VISION_MODEL_OV = Path("../blip_vqa_base/blip_vision_model.xml")
vision_model = model.vision_model.eval()

# Sanity-check the PyTorch vision model and keep its outputs for later reuse.
with torch.no_grad():
    vision_outputs = vision_model(inputs["pixel_values"])

# Convert to OpenVINO IR only when no IR file is present on disk yet.
if VISION_MODEL_OV.exists():
    print(f"Vision model will be loaded from {VISION_MODEL_OV}")
else:
    # Trace the PyTorch module into an ov.Model using the real pixel tensor.
    with torch.no_grad():
        ov_vision_model = ov.convert_model(vision_model, example_input=inputs["pixel_values"])
    # Persist the IR so later runs take the branch above.
    ov.save_model(ov_vision_model, VISION_MODEL_OV)
    print(f"Vision model successfuly converted and saved to {VISION_MODEL_OV}")

Vision model will be loaded from ..\blip_vqa_base\blip_vision_model.xml


In [7]:
import torch
from pathlib import Path
import openvino as ov

PROJECTION_MODEL_OV = Path("../blip_vqa_base/blip_vision_proj_model.xml")
visual_projection_model = model.visual_projection
visual_projection_model.eval()

# Everything here is inference only, so run it without autograd; this also keeps
# the example input handed to the converter free of an autograd graph.
with torch.no_grad():
    vision_outputs = model.vision_model(pixel_values=inputs["pixel_values"], return_dict=None)
    pooled_output = vision_outputs[1]  # pooled_output

    # Check that the projection layer works on the pooled embedding.
    # NOTE(review): this rebinds `vision_outputs` to the projection result,
    # replacing the vision model outputs assigned just above - confirm that
    # nothing later expects the vision model outputs under this name.
    vision_outputs = visual_projection_model(pooled_output)

# if openvino model does not exist, convert it to IR
if not PROJECTION_MODEL_OV.exists():
    # export pytorch model to ov.Model
    with torch.no_grad():
        ov_vision_projection_model = ov.convert_model(visual_projection_model, example_input=pooled_output)
    # save model on disk for next usages
    ov.save_model(ov_vision_projection_model, PROJECTION_MODEL_OV)
    print(f"Projection model successfuly converted and saved to {PROJECTION_MODEL_OV}")
else:
    print(f"Projection model will be loaded from {PROJECTION_MODEL_OV}")

Projection model will be loaded from ..\blip_vqa_base\blip_vision_proj_model.xml


In [8]:
core = ov.Core()
# Compile the vision encoder first, then the projection layer, both for CPU.
ov_vision_model, ov_vision_projection_model = (
    core.compile_model(ir_path, 'CPU')
    for ir_path in (VISION_MODEL_OV, PROJECTION_MODEL_OV)
)

In [9]:
# Reference tensor produced by the Hugging Face processor: show its shape and
# the first image of the batch.
pixel_values = inputs["pixel_values"]
print(pixel_values.shape, pixel_values[0], sep="\n")

torch.Size([1, 3, 384, 384])
tensor([[[ 0.8647,  0.9230,  0.9376,  ...,  1.7552,  1.7552,  1.7552],
         [ 0.9084,  0.9376,  0.9522,  ...,  1.7552,  1.7552,  1.7552],
         [ 0.9376,  0.9376,  0.9668,  ...,  1.7552,  1.7552,  1.7552],
         ...,
         [-0.7850, -0.7850, -0.7266,  ..., -0.3178, -0.2740, -0.3616],
         [-0.7558, -0.7558, -0.7412,  ..., -0.3178, -0.3616, -0.4346],
         [-0.7558, -0.7704, -0.7850,  ..., -0.3616, -0.4346, -0.4784]],

        [[ 1.2194,  1.2495,  1.2795,  ...,  1.8948,  1.8948,  1.8948],
         [ 1.2344,  1.2645,  1.2945,  ...,  1.8948,  1.8948,  1.8948],
         [ 1.2495,  1.2795,  1.3095,  ...,  1.8948,  1.8948,  1.8948],
         ...,
         [-0.5965, -0.5965, -0.5515,  ..., -0.4014, -0.3264, -0.4164],
         [-0.5665, -0.5665, -0.5515,  ..., -0.3864, -0.4164, -0.4914],
         [-0.5665, -0.5815, -0.5965,  ..., -0.4164, -0.4764, -0.5365]],

        [[ 1.2927,  1.3211,  1.3496,  ...,  1.9753,  1.9753,  1.9753],
         [ 1.321

In [10]:
def blip_preprocess(image: Image.Image, save_resized_to="demo_resized.png") -> np.ndarray:
    """Preprocess a PIL image for the BLIP vision model using PIL and NumPy only.

    Steps: convert to RGB, bicubic-resize to 384x384, rescale to [0, 1],
    normalize with the OpenAI CLIP mean/std, and move channels first.

    Args:
        image: PIL image to preprocess.
        save_resized_to: Path the resized RGB image is written to, or ``None``
            to skip writing. The default keeps the historical behaviour of
            writing ``demo_resized.png`` into the working directory.

    Returns:
        ``float32`` array of shape ``(3, 384, 384)``.
    """
    # convert to rgb
    image = image.convert("RGB")
    # resize to (384, 384)
    image = image.resize((384, 384), resample=Image.Resampling.BICUBIC)
    if save_resized_to is not None:
        image.save(save_resized_to)
    # Use the in-memory pixels directly; with the default lossless PNG target,
    # reading the saved file back would yield the same values.
    pixels = np.array(image)
    # rescale (1 / 255)
    pixels = pixels / 255.0
    pixels = pixels.astype(np.float32)
    # normalize
    OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
    OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
    mean = np.array(OPENAI_CLIP_MEAN).astype(pixels.dtype)
    std = np.array(OPENAI_CLIP_STD).astype(pixels.dtype)

    pixels = (pixels - mean) / std
    # convert to channel first (num_channels, height, width)
    return pixels.transpose((2, 0, 1))

# Preprocess the demo image with the hand-written pipeline.
custom_preprocess = blip_preprocess(raw_image)

# Add a leading batch axis and show the first row of the first channel.
res = custom_preprocess[np.newaxis, ...]
print(res[0, 0, 0])

[0.8646511  0.9230448  0.93764323 0.95224166 0.99603695 1.0252337
 1.0398322  1.0690291  1.0836275  1.1128243  1.1274228  1.1566195
 1.171218   1.171218   1.2004149  1.2150133  1.2296118  1.2588086
 1.273407   1.273407   1.3026038  1.3026038  1.3463992  1.3463992
 1.3755959  1.3901944  1.4047928  1.4047928  1.4485881  1.4631865
 1.477785   1.4923834  1.5069818  1.5069818  1.5069818  1.5215802
 1.5215802  1.5361787  1.5507771  1.5507771  1.5507771  1.5653756
 1.5653756  1.5799739  1.5799739  1.5945723  1.6091708  1.5945723
 1.6091708  1.6091708  1.6237692  1.6383677  1.6383677  1.652966
 1.6383677  1.652966   1.652966   1.6675645  1.6675645  1.6821629
 1.6675645  1.6821629  1.6821629  1.6967614  1.7113597  1.7113597
 1.6967614  1.7113597  1.7259582  1.7259582  1.7259582  1.7405566
 1.7405566  1.7405566  1.7405566  1.7551551  1.7551551  1.7551551
 1.7551551  1.7697535  1.7697535  1.7697535  1.7697535  1.7697535
 1.7697535  1.7697535  1.7697535  1.7697535  1.7697535  1.7697535
 1.7843518 

In [11]:
print(custom_preprocess.shape)
# Compare against the Hugging Face processor output. Use the largest ABSOLUTE
# deviation: a plain max of the signed difference would report 0.0 even when
# some values are lower than the reference.
preprocess_diff = custom_preprocess - pixel_values.numpy()
preprocess_diff = np.max(np.abs(preprocess_diff))
print(preprocess_diff)

(3, 384, 384)
0.0


In [12]:
# Add the batch axis only while the array is still (channels, height, width),
# so re-running this cell does not keep stacking extra leading axes.
if custom_preprocess.ndim == 3:
    custom_preprocess = np.expand_dims(custom_preprocess, axis=0)
print(custom_preprocess.shape)

(1, 3, 384, 384)


In [13]:
# Stage 1: run the compiled vision encoder and read its output at index 1,
# which is used as the pooled embedding.
vision_model_out = ov_vision_model.output(1)
pooled_embed = ov_vision_model(custom_preprocess)[vision_model_out]
print(pooled_embed.shape, pooled_embed[0][:10], sep="\n")

# Stage 2: feed the pooled embedding through the compiled projection layer.
image_features = ov_vision_projection_model(pooled_embed)[ov_vision_projection_model.output(0)]
print(image_features.shape, image_features, sep="\n")

(1, 768)
[-0.3295024  -0.26385215 -0.0980585  -0.61664826 -0.25132835 -0.81028205
  0.8470111   0.04173357  0.5050232   0.30850348]
(1, 512)
[[ 5.43425918e-01 -1.02331668e-01 -4.19514924e-01 -7.79112149e-03
  -7.34343827e-02 -7.73895681e-02 -4.57431786e-02  3.97739679e-01
   1.76487401e-01 -2.06602097e-01  2.76051819e-01 -4.14276086e-02
   3.32250893e-01  1.51612788e-01 -6.69413898e-03 -1.32611036e-01
  -1.00314453e-01  1.59211293e-01 -6.27769902e-02 -3.53940874e-01
  -1.24040239e-01  6.40376210e-01  4.15159315e-01 -9.96035218e-01
   2.98053145e-01  2.36027554e-01 -5.80286205e-01  8.01462412e-01
  -3.58247548e-01 -1.09790623e-01 -3.15915465e-01 -6.72160566e-01
  -6.99690402e-01 -7.97545314e-02  1.46668889e-02 -1.49127794e-02
   1.61959171e-01  3.87137145e-01  1.58338659e-02 -2.83590466e-01
  -3.01553845e-01 -2.01093599e-01  1.56159341e-01 -1.74015045e-01
  -2.35688895e-01 -1.45444289e-01  3.14966381e-01  6.97640359e-01
   3.42270210e-02  9.42795873e-02  5.47776997e-01  1.32348001e-01
 