# CLIP interrogator + BLIP-2

## CLIP interrogator

In [None]:
# Install the third-party packages this notebook imports:
#   - open_clip_torch: provides the `open_clip` module (unpinned here)
#   - clip-interrogator: pinned to 0.6.0
#   - sentence-transformers: `-U` upgrades to the newest available release
# NOTE(review): only clip-interrogator is pinned, so a fresh run may resolve
# different versions of the other two packages than the recorded output shows.
!pip install open_clip_torch
!pip install clip-interrogator==0.6.0
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting open_clip_torch
  Downloading open_clip_torch-2.16.0-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Collecting timm
  Downloading timm-0.6.13-py3-none-any.whl (549 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m549.1/549.1 KB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 KB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinu

In [None]:
# Download the competition dataset with the Kaggle CLI.
from google.colab import files
# Opens the Colab upload widget; the commands below expect a file named
# kaggle.json to be among the uploaded files.
files.upload()
# Copy kaggle.json into ~/.kaggle/ and restrict it to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Fetch the archive for the "stable-diffusion-image-to-prompts" competition.
!kaggle competitions download -c stable-diffusion-image-to-prompts

Saving kaggle.json to kaggle.json
Downloading stable-diffusion-image-to-prompts.zip to /content
  0% 0.00/3.04M [00:00<?, ?B/s]
100% 3.04M/3.04M [00:00<00:00, 181MB/s]


In [None]:
# Extract the downloaded archive into /content/; -o overwrites existing files
# without prompting, so the cell can be re-run.
!unzip -o '/content/stable-diffusion-image-to-prompts.zip' -d '/content/'

Archive:  /content/stable-diffusion-image-to-prompts.zip
  inflating: /content/images/20057f34d.png  
  inflating: /content/images/227ef0887.png  
  inflating: /content/images/92e911621.png  
  inflating: /content/images/a4e1c55a9.png  
  inflating: /content/images/c98f79f71.png  
  inflating: /content/images/d8edf2e40.png  
  inflating: /content/images/f27825b2c.png  
  inflating: /content/prompts.csv    
  inflating: /content/sample_submission.csv  


In [None]:
# import packages
import torch
from PIL import Image
import open_clip
import inspect
import importlib
from clip_interrogator import clip_interrogator
from clip_interrogator import Config, Interrogator
from pathlib import Path
from sentence_transformers import SentenceTransformer, models

# Model identifiers, named once so the CLIP model and its tokenizer cannot drift apart.
CLIP_ARCHITECTURE = 'ViT-g-14'
CLIP_PRETRAINED_TAG = 'laion2b_s34b_b88k'
SENTENCE_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# create_model_and_transforms returns three values; only the first (the model)
# and the third (bound to `preprocess`) are kept.
model, _, preprocess = open_clip.create_model_and_transforms(
    CLIP_ARCHITECTURE,
    pretrained = CLIP_PRETRAINED_TAG,
)
tokenizer = open_clip.get_tokenizer(CLIP_ARCHITECTURE)

# Sentence encoder used later to turn prompt text into embedding vectors.
st_model = SentenceTransformer(SENTENCE_MODEL_NAME)

Downloading (…)ip_pytorch_model.bin:   0%|          | 0.00/5.47G [00:00<?, ?B/s]

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
import pandas as pd
import numpy as np
import os

# Load the sample submission and list the competition images.
sample_submission = pd.read_csv('/content/sample_submission.csv', index_col = 'imgId_eId')
images = os.listdir('/content/images')
# Image id = file name up to the first '.'.
image_ids = [file_name.split('.')[0] for file_name in images]
EMBEDDING_LENGTH = 384
eIds = list(range(EMBEDDING_LENGTH))
# One key per (image, embedding dimension) pair, image-major:
# "<id0>_0" ... "<id0>_383", "<id1>_0" ... "<id1>_383", ...
imgId_eId = [
    f'{image_id}_{e_id}'
    for image_id in image_ids
    for e_id in eIds
]
def make_batches(l, batch_size=16):
    """Lazily split a sliceable sequence into consecutive chunks.

    Args:
        l: Sequence supporting ``len()`` and slicing (list, str, ...).
        batch_size: Maximum number of items per chunk.

    Yields:
        Slices of ``l`` in order; every chunk holds ``batch_size`` items except
        possibly the last, which holds the remainder. Nothing is yielded for an
        empty sequence.
    """
    chunk_starts = range(0, len(l), batch_size)
    yield from (l[start:start + batch_size] for start in chunk_starts)
        

In [None]:
# Build the interrogator for the same CLIP checkpoint used above.
ci = Interrogator(Config(clip_model_name = 'ViT-g-14/laion2b_s34b_b88k'))

# For each label table, convert its per-label numpy embeddings to tensors and
# stack them into a single tensor with one row per label.
mediums_features_array, movements_features_array, flavors_features_array = (
    torch.stack(list(map(torch.from_numpy, label_table.embeds)))
    for label_table in (ci.mediums, ci.movements, ci.flavors)
)

Loading caption model blip-large...


Downloading (…)lve/main/config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Loading CLIP model ViT-g-14/laion2b_s34b_b88k...


Preprocessing artists: 100%|██████████| 5/5 [00:38<00:00,  7.68s/it]
Preprocessing flavors: 100%|██████████| 49/49 [06:15<00:00,  7.65s/it]
Preprocessing mediums: 100%|██████████| 1/1 [00:00<00:00,  3.03it/s]
Preprocessing movements: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
Preprocessing trendings: 100%|██████████| 1/1 [00:00<00:00,  3.90it/s]
Preprocessing negative: 100%|██████████| 1/1 [00:00<00:00,  6.89it/s]


Loaded CLIP model and data in 457.77 seconds.


In [None]:
BATCH_SIZE = 32
clip_text = []
cos = torch.nn.CosineSimilarity(dim=1)
# `device` is not used by this cell's CLIP forward pass; it is defined here
# because the BLIP-2 captioning cell further down reads it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def top_labels(image_feature, label_features, labels, k):
  """Return the k labels whose embeddings are most cosine-similar to one image.

  Args:
    image_feature: 1-D feature vector of a single image.
    label_features: 2-D tensor with one embedding row per label.
    labels: Sequence of label strings, index-aligned with `label_features`.
    k: Number of labels to return, best match first.
  """
  best_indices = cos(image_feature, label_features).topk(k).indices
  return [labels[idx] for idx in best_indices.tolist()]


for batch in make_batches(images, BATCH_SIZE):
  images_batch = []
  for image in batch:
    # The context manager closes the file handle as soon as the pixels have been
    # converted, instead of leaving every opened file to the garbage collector.
    with Image.open('/content/images/'+image) as img:
      images_batch.append(preprocess(img.convert('RGB')).unsqueeze(0))
  images_batch = torch.cat(images_batch, 0)

  with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(images_batch)
    # L2-normalise each image feature vector.
    image_features /= image_features.norm(dim = -1, keepdim = True)

  for image_feature in image_features:
    medium = top_labels(image_feature, mediums_features_array, ci.mediums.labels, 1)[0]
    movement = top_labels(image_feature, movements_features_array, ci.movements.labels, 1)[0]
    flaves = ', '.join(top_labels(image_feature, flavors_features_array, ci.flavors.labels, 3))
    # One style string per image: best medium, best movement, three best flavors.
    prompt = f'{medium}, {movement}, {flaves}'
    clip_text.append(prompt)
for i in clip_text:
  print(i)

a screenprint, lowbrow, robot!, rabbit robot, robot
a detailed painting, magic realism, oil canvas of lucifer, epic surrealism 8k oil painting, thomas blackshear and moebius
digital art, digital art, the mighty donut, at the counter, donut
digital art, conceptual art, american astronaut in the forest, astronaut walking, lonely astronaut
a woodcut, art nouveau, whorl, carved wood, swirl
a storybook illustration, digital art, nachosaurus, “a dinosaur market, pastry lizard
a digital painting, context art, planet arrakis, crater, looking down at a massive crater


In [None]:
# Install Hugging Face transformers; the next cell imports Blip2Processor and
# Blip2ForConditionalGeneration from it.
# NOTE(review): the version is not pinned — confirm the resolved release provides the BLIP-2 classes.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## BLIP-2

In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Processor and model are loaded from the same checkpoint id.
BLIP2_CHECKPOINT = 'Salesforce/blip2-opt-2.7b'

processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)
# NOTE: this rebinds `model`, which held the open_clip CLIP model until now;
# re-running the CLIP cell after this one would call the BLIP-2 model instead.
model = Blip2ForConditionalGeneration.from_pretrained(BLIP2_CHECKPOINT)

Downloading (…)rocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.50G [00:00<?, ?B/s]

In [None]:
model.to(device)
BATCH_SIZE = 16
# Captions generated per image. The embedding cell below averages prompts in
# groups of 5, so keep the two values in sync.
NUM_CAPTIONS = 5
# clip_text is indexed by global image position, so it must cover every image.
assert len(clip_text) == len(images), 'clip_text must hold one entry per image'

cap_list = []
for ix, batch in enumerate(make_batches(images, BATCH_SIZE)):
  images_batch = []
  for image in batch:
    # Close each file once its pixels are loaded; convert() returns an
    # independent in-memory image that stays valid after the file is closed.
    with Image.open('/content/images/'+image) as img:
      images_batch.append(img.convert('RGB'))
  pixel_values = processor(images = images_batch, return_tensors = 'pt').pixel_values.to(device)
  out = model.generate(pixel_values = pixel_values, max_length = 20,
                       num_return_sequences = NUM_CAPTIONS,
                       num_beams = NUM_CAPTIONS, min_length = 5)
  # Decoded captions are image-major: NUM_CAPTIONS consecutive entries per image.
  prompts = processor.batch_decode(out, skip_special_tokens = True)
  for i in range(len(images_batch)):
    # Global image position = batch offset + position inside the batch.
    style_text = clip_text[BATCH_SIZE * ix + i]
    for j in range(NUM_CAPTIONS):
      # Strip stray whitespace/newlines from the decoded caption and join with an
      # explicit ", " so the caption's last word does not fuse with the first
      # word of the CLIP-derived style text.
      caption = prompts[i * NUM_CAPTIONS + j].strip()
      prompt = f'{caption}, {style_text}'
      cap_list.append(prompt)
for i in cap_list:
  print(i)

In [None]:
# Convert text to embeddings
# Encode every candidate prompt and flatten the result to one long vector.
prompt_embeddings_flat = st_model.encode(cap_list).flatten()
# Regroup as (image, 5 candidates, 384 dims) and average the 5 candidates of
# each image, then flatten again to one value per submission row.
per_image_embeddings = np.reshape(prompt_embeddings_flat, (-1, 5, 384)).mean(axis=1)
submission_custom = per_image_embeddings.flatten()
print(len(submission_custom))
submission = np.array(submission_custom)
print(len(submission))
# The two lengths above must equal the number of submission keys.
print(len(imgId_eId))
submission = pd.DataFrame({'imgId_eId': imgId_eId, 'val': submission})
submission.head()

In [None]:
# Local evaluation: score the submission against the ground-truth prompts.
# The image ids / keys are rebuilt from the image folder so this cell does not
# depend on the listing made earlier in the notebook.
images = os.listdir('/content/images')
imgIds = [i.split('.')[0] for i in images]
EMBEDDING_LENGTH = 384
eIds = list(range(EMBEDDING_LENGTH))

imgId_eId = [
    '_'.join(map(str, i)) for i in zip(
        np.repeat(imgIds, EMBEDDING_LENGTH),
        np.tile(range(EMBEDDING_LENGTH), len(imgIds)))]

# The submission must contain exactly the expected keys (order is handled below).
assert sorted(imgId_eId) == sorted(submission.imgId_eId)

ground_truth = pd.read_csv('/content/prompts.csv')
ground_truth = pd.merge(pd.DataFrame(imgIds, columns = ['imgId']), ground_truth,
                        on = 'imgId', how = 'left')
# how='left' yields a NaN prompt for any image without a ground-truth row;
# fail loudly here instead of passing NaN to the sentence encoder.
missing_prompts = ground_truth.loc[ground_truth.prompt.isna(), 'imgId'].tolist()
assert not missing_prompts, f'no ground-truth prompt for images: {missing_prompts}'

ground_truth_embeddings = st_model.encode(ground_truth.prompt.tolist()).flatten()
gte = pd.DataFrame(
    index = imgId_eId,
    data = ground_truth_embeddings,
    columns = ['val']
).rename_axis('imgId_eId')

from scipy import spatial
# Ground truth as one row of EMBEDDING_LENGTH values per image.
vec1 = gte['val'].to_numpy().reshape(-1, EMBEDDING_LENGTH)
# Align predictions to the ground-truth key order by label rather than by row
# position, so the score does not depend on two directory listings agreeing.
vec2 = (
    submission.set_index('imgId_eId')['val']
    .reindex(gte.index)
    .to_numpy()
    .reshape(-1, EMBEDDING_LENGTH)
)
# Score = mean of the per-image cosine similarities. A single cosine over the
# flattened concatenation of all embeddings weights images by vector norm and
# is a different quantity.
per_image_cos_sim = [
    1 - spatial.distance.cosine(truth_vec, pred_vec)
    for truth_vec, pred_vec in zip(vec1, vec2)
]
cos_sim = float(np.mean(per_image_cos_sim))
print(cos_sim)

In [None]:
# Write the submission table to disk; index=False keeps the DataFrame's row
# index out of the file, leaving only the frame's own columns.
SUBMISSION_FILE = "submission.csv"
submission.to_csv(SUBMISSION_FILE, index=False)