In [1]:
import torch
from PIL import Image
import open_clip
import numpy as np
import pandas as pd
from sklearn.metrics import top_k_accuracy_score
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device: prefer CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Torch device: ', device)

Torch device:  cuda


In [3]:
# Architecture name; it selects both the checkpoint directory and the tokenizer.
base_model = 'RN50'

# Path of the fine-tuned checkpoint passed as the `pretrained` weights.
checkpoint_file = f'./logs/{base_model}_RS_FineTuned_50epochs/checkpoints/epoch_50.pt'

# The middle element of the returned triple is not used in this notebook.
model, _, preprocess = open_clip.create_model_and_transforms(
    base_model,
    pretrained=checkpoint_file,
)

# The model is in train mode by default, which impacts some models with
# BatchNorm or stochastic depth active; switch to eval mode for inference.
model.eval()

tokenizer = open_clip.get_tokenizer(base_model)

  checkpoint = torch.load(checkpoint_path, map_location=map_location)


In [4]:
# Seed passed as `random_state` when one caption per image is sampled from the
# test set, so the same captions are selected on every run.
randomseed = 420

In [5]:
# Load the tab-separated test split.
captions_table = pd.read_csv('../datasets/NWPU-Captions/test.csv', sep='\t')

# Keep only the first row for every distinct caption text.
test_data = captions_table.drop_duplicates(subset=['caption'])

# Draw exactly one caption per image file (seeded, so the draw is repeatable)
# and renumber the rows from 0.
rows_by_image = test_data.groupby('filepath', as_index=False)
test_data_onedesc = (
    rows_by_image
    .sample(1, random_state=randomseed)
    .reset_index(drop=True)
)

In [6]:
# Load every test image together with its sampled caption, in row order.
#
# `Image.open` is lazy: it keeps the underlying file open until the pixel data
# is read. Opening thousands of images without closing them leaks file
# descriptors, so each file is opened in a `with` block and an in-memory copy
# (which forces the pixel data to be read) is stored instead of the
# file-backed image.
images = []
captions = []
for filepath, caption in zip(test_data_onedesc['filepath'], test_data_onedesc['caption']):
    with Image.open(filepath) as img:
        images.append(img.copy())
    captions.append(caption)

In [7]:
# Run the model's preprocessing transform over every loaded image, keeping order.
proc_images = [preprocess(picture) for picture in images]

In [8]:
# Stack the per-image outputs of `preprocess` into one float32 batch tensor.
# Stacking directly in torch avoids the tensor -> NumPy -> tensor round trip
# (and the extra copy) of `torch.tensor(np.stack(...))`. `torch.as_tensor` is a
# no-op for tensors and also accepts array-like items, so either kind of
# preprocessing output is handled.
image_input = torch.stack([torch.as_tensor(item) for item in proc_images]).to(torch.float)

# Tokenize all captions in one call; row i corresponds to image i.
text_tokens = tokenizer(captions)

In [9]:
# Encode images and captions into embedding vectors.
#
# The notebook selects `device` up front but previously never used it, so the
# whole test set was pushed through the model on the CPU in a single forward
# pass. Here the model is moved to `device` and the inputs are encoded in
# mini-batches, which bounds peak memory. Each batch of features is moved back
# to the CPU, so `image_features` / `text_features` are CPU float tensors.
ENCODE_BATCH_SIZE = 256  # number of items per forward pass

# NOTE: `Module.to` moves the model's parameters in place; any later use of
# `model` must feed it inputs on the same `device`.
model.to(device)

image_feature_chunks = []
text_feature_chunks = []
with torch.no_grad():
    for start in range(0, len(image_input), ENCODE_BATCH_SIZE):
        image_batch = image_input[start:start + ENCODE_BATCH_SIZE].to(device)
        image_feature_chunks.append(model.encode_image(image_batch).float().cpu())
    for start in range(0, len(text_tokens), ENCODE_BATCH_SIZE):
        token_batch = text_tokens[start:start + ENCODE_BATCH_SIZE].to(device)
        text_feature_chunks.append(model.encode_text(token_batch).float().cpu())

# Concatenate along the batch dimension, preserving the original row order.
image_features = torch.cat(image_feature_chunks)
text_features = torch.cat(text_feature_chunks)

In [10]:
# Scale every embedding to unit L2 norm (in place) so that a dot product
# between two embeddings equals their cosine similarity.
image_features.div_(image_features.norm(dim=-1, keepdim=True))
text_features.div_(text_features.norm(dim=-1, keepdim=True))

# similarity[t, i] = cosine similarity between caption t and image i.
text_matrix = text_features.cpu().numpy()
image_matrix = image_features.cpu().numpy()
similarity = text_matrix @ image_matrix.T

In [11]:
# Ground truth: row i of the sampled test set pairs image i with caption i,
# so the correct caption index for image i is simply i.
y_desc_true = np.arange(len(test_data_onedesc['caption'].values))

# Cut-offs at which retrieval accuracy is reported.
K_VALUES = (1, 3, 5, 10, 20, 30)

# `similarity` has captions on its rows and images on its columns; transposing
# once gives one row of caption scores per image. A hit at k means the image's
# own caption is among its k highest-scoring captions.
caption_scores_per_image = similarity.T

top_k_stats = [
    {'k': k, 'score': top_k_accuracy_score(y_desc_true, caption_scores_per_image, k=k)}
    for k in K_VALUES
]

In [12]:
# Show the collected top-k accuracy records as this cell's output.
top_k_stats

[{'k': 1, 'score': np.float64(0.05140021269053527)},
 {'k': 3, 'score': np.float64(0.1446295639844027)},
 {'k': 5, 'score': np.float64(0.2119815668202765)},
 {'k': 10, 'score': np.float64(0.3491669620701879)},
 {'k': 20, 'score': np.float64(0.554767812832329)},
 {'k': 30, 'score': np.float64(0.7029422190712513)}]

In [13]:
# Mean cosine similarity of the matching pairs: for every row i of
# `similarity.T`, take the score at the ground-truth index y_desc_true[i] and
# average the picked values. Vectorised indexing replaces the Python loop;
# NumPy's summation order may change the result in the last float32 digits.
row_positions = np.arange(len(similarity.T))
avg_correct_cos = similarity.T[row_positions, y_desc_true].mean()
avg_correct_cos

np.float32(0.73704326)

2821

(2821, 2821)