<a href="https://www.kaggle.com/code/edgarolivares/model-comparison-openclip-vs-vit-gpt2?scriptVersionId=120920155" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# This notebook seeks for the best pretrained model.

The main idea is to create a notebook which compares results from different models. This can be helpful to pick the best initial model and for final results comparrison.

In this notebook, following models are compared:

1. **OpenClip** --> Based on https://www.kaggle.com/code/leonidkulyk/lb-0-42118-laion-s-coca-vit-openclip

2. **vit-gpt2-coco-en** --> https://huggingface.co/ydshieh/vit-gpt2-coco-en


In [1]:
wheels_path = "/kaggle/input/open-clip-wheels/open_clip_wheels"
open_clip_whl_path = f"{wheels_path}/open_clip_torch-2.14.0-py3-none-any.whl"
!pip install --no-index --find-links $wheels_path $open_clip_whl_path -q

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m

In [2]:
import os
import sys
from PIL import Image
from pathlib import Path

import numpy as np
import pandas as pd
import torch

# MODELS
import open_clip
from transformers import ViTFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel

In [3]:
class CFG:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    seed = 42
    embedding_length = 384
    test_len = 7
    images_path = "../input/stable-diffusion-image-to-prompts/images/"
    sentence_model_path = "/kaggle/input/sentence-transformers-222/all-MiniLM-L6-v2"
    comp_path = Path('/kaggle/input/stable-diffusion-image-to-prompts/')  
    model_name = "coca_ViT-L-14"
    clip_model_checkpoint_path = "/kaggle/input/open-clip-models/mscoco_finetuned_CoCa-ViT-L-14-laion2B-s13B-b90k.bin"
    vit_gpt2_path = "ydshieh/vit-gpt2-coco-en"

images = os.listdir(CFG.comp_path / 'images')
    
df_ground_truth = pd.read_csv(CFG.comp_path / 'sample_submission.csv', index_col='imgId_eId')
df_ground_truth.head()

Unnamed: 0_level_0,val
imgId_eId,Unnamed: 1_level_1
20057f34d_0,0.018848
20057f34d_1,0.03019
20057f34d_2,0.072792
20057f34d_3,-0.000673
20057f34d_4,0.016774


# Load Sentence Transformer for embeddings

In [4]:
sys.path.append('../input/sentence-transformers-222/sentence-transformers')
from sentence_transformers import SentenceTransformer, models

st_model = SentenceTransformer(CFG.sentence_model_path)

prompts = pd.read_csv(CFG.comp_path / 'prompts.csv', index_col='imgId')

prompt_embeddings_ground_truth = st_model.encode(prompts['prompt']).flatten()
prompt_embeddings_ground_truth

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

array([0.01884847, 0.03018977, 0.07279219, ..., 0.03056298, 0.01404713,
       0.02376362], dtype=float32)

# Clip Model

Load model

In [5]:
clip_model = open_clip.create_model(CFG.model_name)
open_clip.load_checkpoint(clip_model, CFG.clip_model_checkpoint_path)

transform = open_clip.image_transform(
    clip_model.visual.image_size,
    is_train = False,
    mean = getattr(clip_model.visual, 'image_mean', None),
    std = getattr(clip_model.visual, 'image_std', None),
)

clip_model.to(CFG.device)
print(CFG.device)

cuda


Predict

In [6]:
prompts_open_clip = []

for image_name in images:
    img = Image.open(CFG.images_path + image_name).convert("RGB")
    img = transform(img).unsqueeze(0)

    with torch.no_grad(), torch.cuda.amp.autocast():
        generated = clip_model.generate(img.to(CFG.device))
    
    prompts_open_clip.append(
        open_clip.decode(
            generated[0]
        ).split("<end_of_text>")[0].replace("<start_of_text>", "").rstrip(" .,")
    )

#EMBEDDING
prompt_embeddings_open_clip = st_model.encode(prompts_open_clip).flatten()

df_clip = pd.DataFrame(
    index=df_ground_truth.index,
    data=prompt_embeddings_open_clip,
    columns=['val']
).rename_axis('imgId_eId')


df_clip.head()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0_level_0,val
imgId_eId,Unnamed: 1_level_1
20057f34d_0,0.029225
20057f34d_1,0.053407
20057f34d_2,-0.084613
20057f34d_3,0.017963
20057f34d_4,-0.075529


# ViT-GPT2 Model

Load model

In [7]:
feature_extractor = ViTFeatureExtractor.from_pretrained(CFG.vit_gpt2_path)
tokenizer = AutoTokenizer.from_pretrained(CFG.vit_gpt2_path)
model = VisionEncoderDecoderModel.from_pretrained(CFG.vit_gpt2_path)
model.eval()

def predict(image):

    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

    with torch.no_grad():
        output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]

    return preds

Downloading:   0%|          | 0.00/211 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/236 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/120 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.24k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/937M [00:00<?, ?B/s]

Predict

In [8]:
prompts_vit_gpt2 = []
for image_name in images:
    image = Image.open(CFG.images_path + image_name).convert("RGB")
    preds = predict(image)
    prompts_vit_gpt2.append(preds[0])

#EMBEDDING
prompt_embeddings_vit_gpt2 = st_model.encode(prompts_vit_gpt2).flatten()

df_vit_gpt2 = pd.DataFrame(
    index=df_ground_truth.index,
    data=prompt_embeddings_vit_gpt2,
    columns=['val']
).rename_axis('imgId_eId')

df_vit_gpt2.head()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0_level_0,val
imgId_eId,Unnamed: 1_level_1
20057f34d_0,0.020267
20057f34d_1,0.099165
20057f34d_2,-0.013597
20057f34d_3,-0.045828
20057f34d_4,-0.005268


# Let's compare results with cosine similiraity, shall we?

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from statistics import mean  

def cos_sim_test(df_ground_truth, df_test):
    cosine_similarities = []
    for i in range(CFG.test_len):
        current = cosine_similarity(
            df_ground_truth.iloc[i * CFG.embedding_length : (i + 1) * CFG.embedding_length]["val"].values.reshape(1, -1), 
            df_test.iloc[i * CFG.embedding_length : (i + 1) * CFG.embedding_length]["val"].values.reshape(1, -1)
        )

        cosine_similarities.append(float(current.squeeze()))
        
    return  cosine_similarities, mean(cosine_similarities)

In [10]:
sim_clip, mean_clip = cos_sim_test(df_ground_truth, df_clip)
sim_vit_gpt2, mean_vit_gpt2 = cos_sim_test(df_ground_truth, df_vit_gpt2)

# Print means for easy comparison purposes
mean_clip, mean_vit_gpt2

(0.09848827273902525, 0.1691441588588042)

In [11]:
print("ground truth: {}\n clip sentences:{}\n vit gpt2 sentences:{}"
      .format(prompts['prompt'], prompts_open_clip, prompts_vit_gpt2))

ground truth: imgId
20057f34d    hyper realistic photo of very friendly and dys...
227ef0887    ramen carved out of fractal rose ebony, in the...
92e911621    ultrasaurus holding a black bean taco in the w...
a4e1c55a9    a thundering retro robot crane inks on parchme...
c98f79f71    portrait painting of a shimmering greek hero, ...
d8edf2e40    an astronaut standing on a engaging white rose...
f27825b2c    Kaggle employee Phil at a donut shop ordering ...
Name: prompt, dtype: object
 clip sentences:['a man standing in front of a counter with a cake', 'an illustration of a dinosaur eating a piece of cheese', 'a drawing of a robot holding a remote control', 'an aerial view of a hole in the ground', 'a person in a spacesuit walking down a road', "a round wooden bowl with a design on it 's side", 'a painting of a man with a bird on his head']
 vit gpt2 sentences:['a man standing in front of a donut shop', 'a statue of a bear sitting on top of a pile of rocks', 'a drawing of a cat sitting 