In [1]:
import pandas as pd

import torch
import clip
from sklearn.metrics import accuracy_score, f1_score

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 CLIP checkpoint onto the selected device; `preprocess` is
# the second value returned by clip.load.
model, preprocess = clip.load("ViT-B/32", device=device)

In [2]:
# Precomputed image embeddings. `map_location` plus `.to(device)` follow the
# device selected in the first cell instead of hard-coding CUDA, so the cell
# also runs on CPU-only machines.
# NOTE(review): the stored features must have the same dtype as the output of
# model.encode_text for the later matmul to work — confirm when running on CPU.
data = torch.load('ViT-B-32-image-features.pth', map_location=device)
image_features = data.to(device)

df = pd.read_excel('dataset/razm.xlsx')
# Shift the multi-class columns down by one so their values can be compared
# with argmax indices over the prompts; a raw 0 becomes -1, which the
# evaluation cells drop as invalid.
# NOTE(review): presumably these columns are 1-based in the spreadsheet — confirm.
for column in ('Время суток', 'Время года', 'Местность'):
    df[column] -= 1

In [3]:
# Prompt vocabulary for zero-shot tagging. Keys are the dataframe columns that
# hold the ground-truth labels; values describe the English prompts.
#   * dict value -> multi-class group: one prompt per class. The position of a
#     prompt in the dict is the class index that predictions are compared
#     against, so the insertion order must not be changed.
#     NOTE(review): assumes this order matches the label encoding in the
#     spreadsheet — confirm.
#   * str value  -> binary tag: the prompts become ['No <value.lower()>', value],
#     i.e. index 0 = absent, index 1 = present.
tags = {
    'Время суток': {
        'День': 'Day',
        'Ночь': 'Night',
        'Рассвет/закат': 'Sunrise or sunset',
    },
    'Время года': {
        'Зима': 'Winter',
        'Весна': 'Spring',
        'Лето': 'Summer',
        'Осень': 'Autumn',
    },
    'Местность': {
        'Лес': 'Forest',
        'Город': 'City'
    },
    # Binary tags: a single positive prompt per column.
    'Авиа': 'Helicopter',
    'Автомобили': 'Car',
    'БПЛА': 'Sparkly drone',
    'Водолаз': 'Creepy diver',
    'Кинолог': 'Doggy',
    'Кони': 'Gentle horse',
    'Объятия': 'Embarrassed hugging',
    'Шерп': 'All terrain vehicle with big wheels'
}

# Encode the prompts of every tag group once and cache the L2-normalised text
# embeddings in `groups`, keyed by dataframe column name.
groups = {}
# The embeddings are only used for inference, so disable autograd: otherwise
# each cached tensor keeps its whole computation graph alive.
with torch.no_grad():
    for group, group_labels in tags.items():
        if isinstance(group_labels, dict):
            # Multi-class group: one prompt per class, in dict order.
            text = list(group_labels.values())
        else:
            # Binary tag: index 0 is the negated prompt, index 1 the positive one.
            text = [f'No {group_labels.lower()}', group_labels]

        tokens = clip.tokenize(text).to(device)
        text_features = model.encode_text(tokens)
        # Unit-normalise each prompt embedding.
        text_features /= text_features.norm(dim=-1, keepdim=True)
        groups[group] = text_features

In [9]:
# Zero-shot evaluation: for each tag group, predict the prompt with the highest
# scaled similarity for every image and score it against the dataframe labels.
with torch.no_grad():
    for group in groups:
        text_features = groups[group]
        labels = df[group].values

        scores = 100.0 * image_features @ text_features.T
        pred = scores.softmax(dim=-1).argmax(dim=1).cpu().numpy()

        # Rows labelled -1 are treated as invalid and left out of the metrics.
        mask = labels != -1
        labels, pred = labels[mask], pred[mask]

        acc = accuracy_score(labels, pred)
        f1 = f1_score(labels, pred, average='macro')
        print(f'{group}, acc: {acc:.4}, f1: {f1:.4}')

Время суток, acc: 0.9556, f1: 0.9054
Время года, acc: 0.5779, f1: 0.4885
Местность, acc: 0.7761, f1: 0.7548
Авиа, acc: 0.6491, f1: 0.6138
Автомобили, acc: 0.5699, f1: 0.4502
БПЛА, acc: 0.8071, f1: 0.4897
Водолаз, acc: 0.951, f1: 0.8963
Кинолог, acc: 0.9476, f1: 0.8036
Кони, acc: 0.9511, f1: 0.8508
Объятия, acc: 0.9859, f1: 0.6653
Шерп, acc: 0.8361, f1: 0.7351


In [8]:
from adj import adjectives

# Prompt search for the drone tag: try every adjective (plus the empty string
# as a baseline) in front of "drone" and rank the prompts by binary F1.
group = 'БПЛА'

# Build the validity mask once from the unfiltered column. Filtering `labels`
# inside the loop would make later iterations compute a shorter mask and apply
# it to full-length predictions, which fails as soon as a -1 label exists.
mask = (df[group].values != -1)
labels = df[group].values[mask]

data = []
with torch.no_grad():
    for adj in adjectives + ['']:
        # Index 0 = negated prompt, index 1 = positive prompt.
        text = [f'No {adj} drone', f'{adj} drone']

        tokens = clip.tokenize(text).to(device)
        text_features = model.encode_text(tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        pred = (100.0 * image_features @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().numpy()

        # Remove invalid
        pred = pred[mask]

        f1 = f1_score(labels, pred)
        acc = accuracy_score(labels, pred)
        data.append((acc, f1, adj))

# Best F1 first; each entry is (accuracy, f1, adjective).
sorted(data, key=lambda x: x[1], reverse=True)

[(0.8257387462027065, 0.5684978345110554, 'Blue'),
 (0.6989781828224247, 0.526224282816575, 'Panicked'),
 (0.7686642732210255, 0.5119440668090891, 'Smoggy'),
 (0.6260701463684065, 0.5015950920245399, 'Obnoxious'),
 (0.5951394642363987, 0.47404927050944745, 'Witty'),
 (0.6781736168645862, 0.46413243408951566, 'Agreeable'),
 (0.5660498941360582, 0.4613802559414992, 'Cute'),
 (0.5330939887692167, 0.45649378482640374, 'Careful'),
 (0.5353033232072172, 0.4509462693060692, 'Charming'),
 (0.5240725398140477, 0.44988295381996163, 'Smiling'),
 (0.5272024302678818, 0.44762314476231446, 'Cheerful'),
 (0.5378808800515511, 0.44010707115770686, 'Shiny'),
 (0.5353033232072172, 0.4382372579568217, 'Helpful'),
 (0.49166896805670623, 0.43468468468468474, 'Funny'),
 (0.52029826014913, 0.43340219636838095, 'Defiant'),
 (0.6662984442603332, 0.433150899139953, 'Tired'),
 (0.5180889257111295, 0.4288052373158756, 'Condemned'),
 (0.4980208045659578, 0.42714570858283435, 'Clean'),
 (0.4698517904814508, 0.424272

In [11]:
from adj import adjectives

# Prompt search for the hugging tag: try every adjective (plus the empty string
# as a baseline) in front of "hugging" and rank the prompts by binary F1.
group = 'Объятия'

# Build the validity mask once from the unfiltered column. Filtering `labels`
# inside the loop would make later iterations compute a shorter mask and apply
# it to full-length predictions, which fails as soon as a -1 label exists.
mask = (df[group].values != -1)
labels = df[group].values[mask]

data = []
with torch.no_grad():
    for adj in adjectives + ['']:
        # Index 0 = negated prompt, index 1 = positive prompt.
        text = [f'No {adj} hugging', f'{adj} hugging']

        tokens = clip.tokenize(text).to(device)
        text_features = model.encode_text(tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        pred = (100.0 * image_features @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().numpy()

        # Remove invalid
        pred = pred[mask]

        f1 = f1_score(labels, pred)
        acc = accuracy_score(labels, pred)
        data.append((acc, f1, adj))

# Best F1 first; each entry is (accuracy, f1, adjective).
sorted(data, key=lambda x: x[1], reverse=True)

[(0.9844426033324127, 0.5212464589235127, 'Pink'),
 (0.9847187701371628, 0.48125, 'Prickling'),
 (0.982417380097579, 0.41590214067278286, 'Agreeable'),
 (0.9733038755408266, 0.3438914027149321, 'Lively'),
 (0.977262266408911, 0.3413333333333333, 'Hurt'),
 (0.9859154929577465, 0.3376623376623376, 'Embarrassed'),
 (0.967964650648992, 0.31225296442687744, 'Tender'),
 (0.9656632606094081, 0.31053604436229204, 'Mysterious'),
 (0.966399705422075, 0.2940038684719536, 'Kind'),
 (0.952223142778238, 0.27412587412587414, 'Upset'),
 (0.9491853079259873, 0.2698412698412698, 'Helpless'),
 (0.9511184755592378, 0.2675862068965517, 'Red'),
 (0.9830617693086624, 0.25203252032520324, 'Sleepy'),
 (0.9620730921476572, 0.24817518248175185, 'Tired'),
 (0.9630857037650741, 0.2448210922787194, 'Adorable'),
 (0.9226732946699807, 0.2363636363636364, 'Puzzled'),
 (0.9307741876093161, 0.23577235772357724, 'Charming'),
 (0.9450428058547362, 0.23363286264441593, 'Careful'),
 (0.905458897173893, 0.22607385079125844, 

In [10]:
# Error inspection for a single tag group: list the misclassified images.
# Change `g` to another key of `groups` (e.g. 'Время года', 'Кинолог') to
# inspect a different tag.
g = 'БПЛА'

text_features = groups[g]
labels = df[g].values
# Inference only, so keep autograd out of the scoring.
with torch.no_grad():
    pred = (100.0 * image_features @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().numpy()

# Remove invalid
mask = (labels != -1)
labels = labels[mask]
pred = pred[mask]

# Print explicitly: a bare expression in the middle of a cell is not displayed.
print('accuracy', accuracy_score(labels, pred))

wrong_mask = (pred != labels)

print('pred', pred[wrong_mask])
print('label', labels[wrong_mask])

# File names of the misclassified rows, aligned with the two arrays above.
print(df['Названия'].values[mask][wrong_mask])

pred [1 1 1 ... 1 1 1]
label [0 0 0 ... 0 0 0]
['Закат_00006.jpg' 'Вертолет_01563.jpg' 'Вертолет_00843.jpg' ...
 'IMG_3374_00830.jpg' 'Вертолет_01387.jpg' 'Вертолет_00158.jpg']
