In [1]:
import pandas as pd

import torch
import clip
from sklearn.metrics import accuracy_score, precision_score, recall_score

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [2]:
data = torch.load('ViT-B-32-image-features.pth')
image_features = data.cuda()

df = pd.read_excel('dataset/razm.xlsx')
df['Время суток'] -= 1
df['Время года'] -= 1
df['Местность'] -= 1

In [3]:
tags = {
    'Время суток': {
        'День': 'Day',
        'Ночь': 'Night',
        'Рассвет/закат': 'Sunrise or sunset',
    },
    'Время года': {
        'Зима': 'Winter',
        'Весна': 'Spring',
        'Лето': 'Summer',
        'Осень': 'Autumn',
    },
    'Местность': {
        'Лес': 'Forest',
        'Город': 'City'
    },
    'Авиа': 'Helicopter',
    'Автомобили': 'Car',
    'БПЛА': 'Quadrocopter',
    'Водолаз': 'Creepy diver',
    'Кинолог': 'Doggy',
    'Кони': 'Gentle horse',
    'Объятия': 'Embarrassed hugging',
    'Шерп': 'All terrain vehicle with big wheels'
}

groups = {}
for group, group_labels in tags.items():
    if type(group_labels) is dict:
        text = group_labels.values()
        tokens = clip.tokenize(text).to(device)
        text_features = model.encode_text(tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)
    else:
        text = [f'No {group_labels.lower()}', group_labels]
        tokens = clip.tokenize(text).to(device)
        text_features = model.encode_text(tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)
    groups[group] = text_features

In [4]:
with torch.no_grad():
    for group, text_features in groups.items():
        labels = df[group].values
        pred = (100.0 * image_features @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().numpy()

        # Remove invalid
        mask = (labels != -1)
        labels = labels[mask]
        pred = pred[mask]

        acc = accuracy_score(labels, pred)
        precision = precision_score(labels, pred, average='micro')
        recall = recall_score(labels, pred, average='micro')
        print(f'{group}, acc: {acc:.4}, prec: {precision:.4}, recall: {recall:.4}')

Время суток, acc: 0.9556, prec: 0.9556, recall: 0.9556
Время года, acc: 0.5779, prec: 0.5779, recall: 0.5779
Местность, acc: 0.7761, prec: 0.7761, recall: 0.7761
Авиа, acc: 0.6491, prec: 0.6491, recall: 0.6491
Автомобили, acc: 0.5699, prec: 0.5699, recall: 0.5699
БПЛА, acc: 0.5794, prec: 0.5794, recall: 0.5794
Водолаз, acc: 0.951, prec: 0.951, recall: 0.951
Кинолог, acc: 0.9476, prec: 0.9476, recall: 0.9476
Кони, acc: 0.9511, prec: 0.9511, recall: 0.9511
Объятия, acc: 0.9859, prec: 0.9859, recall: 0.9859
Шерп, acc: 0.8361, prec: 0.8361, recall: 0.8361


In [5]:
from adj import adjectives

group = 'Водолаз'
labels = df[group].values

data = []
with torch.no_grad():
    for adj in adjectives:
        text = [f'No {adj} diver', f'{adj} diver']

        tokens = clip.tokenize(text).to(device)
        text_features = model.encode_text(tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        pred = (100.0 * image_features @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().numpy()

        # Remove invalid
        mask = (labels != -1)
        labels = labels[mask]
        pred = pred[mask]

        precision = precision_score(labels, pred)
        recall = recall_score(labels, pred)
        acc = accuracy_score(labels, pred)
        data.append((acc, precision, recall, adj))

sorted(data, key=lambda x: x[0], reverse=True)

[(0.9641903709840743, 0.8652037617554859, 0.888030888030888, 'Naughty'),
 (0.9510264199576545, 0.8603667136812412, 0.7850707850707851, 'Creepy'),
 (0.9324311884378165, 0.7755376344086021, 0.7425997425997426, 'Mysterious'),
 (0.9319709104298997, 0.7916964924838941, 0.7117117117117117, 'Grumpy'),
 (0.9262634631317316, 0.6681554265297007, 0.9626769626769627, 'Poor'),
 (0.9072079536039768, 0.6530269058295964, 0.7496782496782497, 'Obnoxious'),
 (0.9022369511184756, 0.705685618729097, 0.5431145431145431, 'Tired'),
 (0.9010402282978919, 0.6391632771644393, 0.7078507078507078, 'Enchanted'),
 (0.8964374482187241, 0.6125984251968504, 0.750965250965251, 'Itchy'),
 (0.8931234465617233, 0.6039132734003173, 0.7348777348777349, 'Lively'),
 (0.8903617785142226, 0.593029215786776, 0.7445302445302445, 'Unusual'),
 (0.8892571112952223, 0.6514236410698878, 0.48584298584298585, 'Defiant'),
 (0.8867716100524717, 0.5819009100101112, 0.7406692406692407, 'Precious'),
 (0.8854828316303047, 0.5580524344569289, 0

In [31]:
group = 'Кони'
labels = df[group].values

text = [f'No horse', f'horse']

tokens = clip.tokenize(text).to(device)
text_features = model.encode_text(tokens)
text_features /= text_features.norm(dim=-1, keepdim=True)

pred = (100.0 * image_features @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().numpy()

# Remove invalid
mask = (labels != -1)
labels = labels[mask]
pred = pred[mask]

acc = accuracy_score(labels, pred)
print(group, acc)

Кони 0.6387738193869097


In [10]:
# g = 'Время года'
g = 'БПЛА'
# g = 'Кинолог'

text_features = groups[g]
labels = df[g].values
pred = (100.0 * image_features @ text_features.T).softmax(dim=-1).argmax(dim=1).cpu().numpy()
pred

# Remove invalid
mask = (labels != -1)
labels = labels[mask]
pred = pred[mask]

accuracy_score(labels, pred)

wrong_mask = (pred != labels)

print('pred', pred[wrong_mask])
print('label', labels[wrong_mask])

print(df['Названия'].values[mask][wrong_mask])

pred [1 1 1 ... 1 1 1]
label [0 0 0 ... 0 0 0]
['Закат_00006.jpg' 'Вертолет_01563.jpg' 'Вертолет_00843.jpg' ...
 'IMG_3374_00830.jpg' 'Вертолет_01387.jpg' 'Вертолет_00158.jpg']
