In [1]:
import pandas as pd

import torch
import clip
from sklearn.metrics import accuracy_score, f1_score

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP ViT-B/32 checkpoint onto that device. clip.load returns the model
# together with a second value, kept here as `preprocess`.
model, preprocess = clip.load("ViT-B/32", device=device)

In [2]:
# Pre-computed image embeddings (presumably one row per row of the table below --
# TODO confirm the ordering matches).
# map_location / .to(device) follow the `device` chosen when the model was loaded
# instead of hard-coding CUDA, so the cell also runs on a CPU-only host.
# NOTE(review): torch.load unpickles the file; only load feature files you trust.
# NOTE(review): if the features were saved in half precision they may need a cast
# to match the model's dtype when running on CPU -- confirm.
data = torch.load('ViT-B-32-image-features.pth', map_location=device)
image_features = data.to(device)

# Annotation table: one column per tag group plus the file-name column 'Названия'.
df = pd.read_excel('dataset/razm.xlsx')

# Shift the three multi-class columns down by one so class ids become 0-based.
# A stored 0 turns into -1, which the evaluation cells drop as an invalid label
# (presumably 0 marks an unlabelled row in the spreadsheet -- verify).
for column in ('Время суток', 'Время года', 'Местность'):
    df[column] -= 1

In [11]:
# Prompt vocabulary, keyed by the column name used in `df`.
#   * nested dict -> multi-class group; the insertion order of its keys is the class
#     index (assumed to match the 0-based codes stored in `df` -- TODO confirm).
#   * plain string -> binary group; the prompts become ['No <prompt>', '<prompt>'],
#     so index 1 means the tag is present.
tags = {
    'Время суток': {
        'День': 'Day',
        'Ночь': 'Night',
        'Рассвет/закат': 'Sunrise or sunset',
    },
    'Время года': {
        'Зима': 'Winter',
        'Весна': 'Spring',
        'Лето': 'Summer',
        'Осень': 'Autumn',
    },
    'Местность': {
        'Лес': 'Forest or field',
        'Город': 'City'
    },
    'Авиа': 'helicopter and not a drone',
    'Автомобили': 'Car with ordinary wheels',
    'БПЛА': 'Drone and not a helicopter',
    'Водолаз': 'Creepy diver',
    'Кинолог': 'Doggy',
    'Кони': 'Gentle horse',
    'Объятия': 'Embarrassed hugging',
    'Шерп': 'Square all terrain vehicle with big wheels'
}

# Unit-normalised CLIP text embeddings per group, one row per class prompt.
groups = {}

# The embeddings are only used for inference, so do not record an autograd graph
# for them (the original kept one alive for every cached tensor).
with torch.no_grad():
    for group, group_labels in tags.items():
        if isinstance(group_labels, dict):
            # clip.tokenize is documented for str / list of str, so pass a real list
            # rather than a dict view.
            text = list(group_labels.values())
        else:
            text = [f'No {group_labels.lower()}', group_labels]

        tokens = clip.tokenize(text).to(device)
        text_features = model.encode_text(tokens)
        # Normalise each row to unit length.
        text_features /= text_features.norm(dim=-1, keepdim=True)
        groups[group] = text_features

In [12]:
# Zero-shot evaluation: for every tag group, predict a class per image from the
# image/text embedding products and compare against the labelled column.
with torch.no_grad():
    for group in groups:
        text_features = groups[group]

        # Scaled products -> softmax over the group's prompts -> index of the best prompt.
        logits = 100.0 * image_features @ text_features.T
        probs = logits.softmax(dim=-1)
        pred = probs.argmax(dim=1).cpu().numpy()
        labels = df[group].values

        # Keep only rows whose label is not the -1 "invalid" sentinel.
        mask = labels != -1
        labels, pred = labels[mask], pred[mask]

        acc = accuracy_score(labels, pred)
        f1 = f1_score(labels, pred, average='macro')
        print(f'{group}, acc: {acc:.4}, f1: {f1:.4}')

Время суток, acc: 0.9556, f1: 0.9054
Время года, acc: 0.5779, f1: 0.4885
Местность, acc: 0.8056, f1: 0.7781
Авиа, acc: 0.8349, f1: 0.7853
Автомобили, acc: 0.8037, f1: 0.6002
БПЛА, acc: 0.9058, f1: 0.8717
Водолаз, acc: 0.951, f1: 0.8963
Кинолог, acc: 0.9476, f1: 0.8036
Кони, acc: 0.9511, f1: 0.8508
Объятия, acc: 0.9859, f1: 0.6653
Шерп, acc: 0.8964, f1: 0.7523


In [13]:
# Tag one image, selected by file name, with the best-matching prompt of every group.
mask = (df['Названия'] == 'Шерп_00821.jpg')

# Index with an explicit boolean tensor instead of handing a pandas Series to torch.
row_selector = torch.as_tensor(mask.to_numpy(), device=image_features.device)
imf = image_features[row_selector]

# .item() below needs exactly one row; fail early with a readable message otherwise
# (file name missing from the table, or listed more than once).
if imf.shape[0] != 1:
    raise ValueError(f'expected exactly one matching image, found {imf.shape[0]}')

with torch.no_grad():
    for group, text_features in groups.items():
        pred = (100.0 * imf @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().item()
        tags_group = tags[group]
        if isinstance(tags_group, dict):
            # Multi-class group: report the key sitting at the predicted index.
            print(f'{group}: {list(tags_group.keys())[pred]}')
        elif pred == 1:
            # Binary group: index 1 is the positive prompt; negatives print nothing.
            print(f'Есть {group.lower()}')

Время суток: Рассвет/закат
Время года: Осень
Местность: Лес
Есть шерп


In [12]:
from adj import adjectives

# Prompt search for the binary 'Автомобили' group: try every adjective (plus the
# empty string as a no-adjective baseline) in the prompt pair
# ['No <adj> car', '<adj> car'] and score the resulting zero-shot predictions.
group = 'Автомобили'
labels = df[group].values

# The validity mask depends only on the labels, so build it once. Filtering `labels`
# inside the loop would shrink it on the first pass, and every later pass would then
# apply a shorter mask to a full-length prediction array.
mask = (labels != -1)
labels = labels[mask]

data = []  # (accuracy, f1, adjective) per candidate prompt
with torch.no_grad():
    for adj in adjectives + ['']:
        text = [f'No {adj} car', f'{adj} car']

        tokens = clip.tokenize(text).to(device)
        text_features = model.encode_text(tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        pred = (100.0 * image_features @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().numpy()

        # Remove invalid
        pred = pred[mask]

        f1 = f1_score(labels, pred)
        acc = accuracy_score(labels, pred)
        data.append((acc, f1, adj))

# Best F1 first; as the last expression of the cell this is what gets displayed.
sorted(data, key=lambda x: x[1], reverse=True)

[(0.4601859523151984, 0.36946236559139783, 'Cooperative'),
 (0.4730737365368683, 0.3453796889295517, 'Inquiring'),
 (0.41240909509343643, 0.34093959731543627, 'Careful'),
 (0.8190186872871215, 0.3271731690622861, 'Uptight'),
 (0.6020436343551505, 0.3254797940396318, 'Busy'),
 (0.33996133664733497, 0.31779257849666986, 'Terrible'),
 (0.3775200220933444, 0.31, 'Envious'),
 (0.2619902421062322, 0.2998864727971356, 'Bad'),
 (0.2618981865046488, 0.29826711009977247, 'Wrong'),
 (0.2736813035073184, 0.29452789699570814, 'Ashamed'),
 (0.4858694651569548, 0.29115369970808475, 'Amused'),
 (0.18586025959679647, 0.28015627543545496, 'Cautious'),
 (0.7566970450151892, 0.2672581092320488, 'Weary'),
 (0.7457424284267697, 0.2630736392742796, 'Hurt'),
 (0.5624597256743072, 0.26138306138306133, 'Ugly'),
 (0.25775568443339775, 0.2525261889311208, 'Evil'),
 (0.34695756236767006, 0.24803900784396862, 'Obedient'),
 (0.7638773819386909, 0.23546944858420268, 'Lively'),
 (0.7046856301205928, 0.2306954436450839

In [11]:
from adj import adjectives

# Prompt search for the binary 'Объятия' group: try every adjective (plus the
# empty string as a no-adjective baseline) in the prompt pair
# ['No <adj> hugging', '<adj> hugging'] and score the resulting zero-shot predictions.
group = 'Объятия'
labels = df[group].values

# The validity mask depends only on the labels, so build it once. Filtering `labels`
# inside the loop would shrink it on the first pass, and every later pass would then
# apply a shorter mask to a full-length prediction array.
mask = (labels != -1)
labels = labels[mask]

data = []  # (accuracy, f1, adjective) per candidate prompt
with torch.no_grad():
    for adj in adjectives + ['']:
        text = [f'No {adj} hugging', f'{adj} hugging']

        tokens = clip.tokenize(text).to(device)
        text_features = model.encode_text(tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        pred = (100.0 * image_features @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().numpy()

        # Remove invalid
        pred = pred[mask]

        f1 = f1_score(labels, pred)
        acc = accuracy_score(labels, pred)
        data.append((acc, f1, adj))

# Best F1 first; as the last expression of the cell this is what gets displayed.
sorted(data, key=lambda x: x[1], reverse=True)

[(0.9844426033324127, 0.5212464589235127, 'Pink'),
 (0.9847187701371628, 0.48125, 'Prickling'),
 (0.982417380097579, 0.41590214067278286, 'Agreeable'),
 (0.9733038755408266, 0.3438914027149321, 'Lively'),
 (0.977262266408911, 0.3413333333333333, 'Hurt'),
 (0.9859154929577465, 0.3376623376623376, 'Embarrassed'),
 (0.967964650648992, 0.31225296442687744, 'Tender'),
 (0.9656632606094081, 0.31053604436229204, 'Mysterious'),
 (0.966399705422075, 0.2940038684719536, 'Kind'),
 (0.952223142778238, 0.27412587412587414, 'Upset'),
 (0.9491853079259873, 0.2698412698412698, 'Helpless'),
 (0.9511184755592378, 0.2675862068965517, 'Red'),
 (0.9830617693086624, 0.25203252032520324, 'Sleepy'),
 (0.9620730921476572, 0.24817518248175185, 'Tired'),
 (0.9630857037650741, 0.2448210922787194, 'Adorable'),
 (0.9226732946699807, 0.2363636363636364, 'Puzzled'),
 (0.9307741876093161, 0.23577235772357724, 'Charming'),
 (0.9450428058547362, 0.23363286264441593, 'Careful'),
 (0.905458897173893, 0.22607385079125844, 

In [10]:
# Inspect the misclassified images of a single group.
# Any key of `groups` can be used here, e.g. 'Время года' or 'Кинолог'.
g = 'БПЛА'

text_features = groups[g]
labels = df[g].values

# Inference only, so skip autograd bookkeeping as the other evaluation cells do.
with torch.no_grad():
    pred = (100.0 * image_features @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().numpy()

# Remove invalid
mask = (labels != -1)
labels = labels[mask]
pred = pred[mask]

# A bare expression in the middle of a cell is not displayed, so print the accuracy.
print('accuracy', accuracy_score(labels, pred))

wrong_mask = (pred != labels)

print('pred', pred[wrong_mask])
print('label', labels[wrong_mask])

# File names of the misclassified rows: first drop invalid rows, then keep the errors.
print(df['Названия'].values[mask][wrong_mask])

pred [1 1 1 ... 1 1 1]
label [0 0 0 ... 0 0 0]
['Закат_00006.jpg' 'Вертолет_01563.jpg' 'Вертолет_00843.jpg' ...
 'IMG_3374_00830.jpg' 'Вертолет_01387.jpg' 'Вертолет_00158.jpg']
