# Understanding Visual Concepts

This notebook contains code for understanding and predicting visual attributes using various pre-trained vision models. It focuses on the Visual Attributes in the Wild (VAW) dataset and includes functionalities for extracting attributes, setting up models, computing embeddings, and preparing data for training and testing.

## Setup

### Install Required Libraries

In [1]:
! /opt/homebrew/Cellar/python@3.9/3.9.18_2/bin/python3.9 -m pip install torch torchvision transformers Pillow numpy

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip[0m


In [2]:
import json
import os

import numpy as np
import torch
import torch.nn as nn
from PIL import Image, ImageDraw
from torchvision.models.feature_extraction import create_feature_extractor
from transformers import ViTImageProcessor, ViTModel

  from .autonotebook import tqdm as notebook_tqdm


## Functions for extracting info from json files for tasks on attributes

In [3]:
def get_attributes(vaw_att, train_json_all):
    """Build the flat attribute list used to index attribute columns.

    Args:
        vaw_att: mapping from attribute-type name to a list of attribute names.
        train_json_all: iterable of annotation dicts, each with a
            'positive_attributes' list.

    Returns:
        (all_attrl, total_attr): ``all_attrl`` is every attribute from
        ``vaw_att`` (in mapping order) followed by each positive attribute from
        the annotations that is not listed in ``vaw_att``; ``total_attr`` is
        the number of attributes coming from ``vaw_att``.

    Side effects:
        Prints ``total_attr`` and ``len(all_attrl)``.
    """
    all_attr = [attr for group in vaw_att.values() for attr in group]
    total_attr = len(all_attr)

    # Set membership replaces a linear scan of the list for every annotation
    # attribute (assumes attribute names are hashable, e.g. strings).
    known_attrs = set(all_attr)

    # NOTE(review): an unlisted attribute is appended once per occurrence, so
    # the tail of the list contains repeats. This is kept as-is because
    # de-duplicating would shift the column index of later attributes and
    # invalidate any previously saved attribute matrices.
    missing_attrs = [
        pa
        for jd in train_json_all
        for pa in jd['positive_attributes']
        if pa not in known_attrs
    ]

    all_attrl = all_attr + missing_attrs
    print(total_attr, len(all_attrl))
    return all_attrl, total_attr


def get_ids_pa(json_all, attr_list=None):
    """Collect unique image ids and a per-image positive-attribute count matrix.

    Args:
        json_all: sequence of annotation dicts with 'image_id' and
            'positive_attributes' keys.
        attr_list: attribute names defining the matrix columns. Defaults to the
            notebook-level ``all_attrl`` so existing one-argument calls keep
            working.

    Returns:
        (all_ids, all_pa): ``all_ids`` lists image ids in order of first
        appearance; ``all_pa`` is a ``(len(all_ids), len(attr_list))`` float
        array where entry ``[i, j]`` counts the annotations of image ``i`` that
        list attribute ``j`` as positive.

    Raises:
        ValueError: if an annotation names an attribute not in ``attr_list``.
    """
    if attr_list is None:
        attr_list = all_attrl  # notebook-level list built by get_attributes

    # Column of the FIRST occurrence of each attribute, mirroring list.index.
    attr_col = {}
    for col, attr in enumerate(attr_list):
        attr_col.setdefault(attr, col)

    # Row per image id, assigned in order of first appearance.
    id_row = {}
    for jd in json_all:
        id_row.setdefault(jd['image_id'], len(id_row))
    all_ids = list(id_row)

    all_pa = np.zeros((len(all_ids), len(attr_list)))
    for jd in json_all:
        attrs = jd['positive_attributes']
        try:
            inds = [attr_col[pa] for pa in attrs]
        except KeyError as err:
            # Keep the exception type that list.index raised.
            raise ValueError(f"{err.args[0]!r} is not in list") from None
        # Fancy-index += adds 1 per distinct column, even if an attribute is
        # repeated within a single annotation.
        all_pa[id_row[jd['image_id']], inds] += 1

    return all_ids, all_pa

def get_ids_pna(json_all, attr_list=None):
    """Collect unique image ids plus positive and negative attribute matrices.

    Args:
        json_all: sequence of annotation dicts with 'image_id',
            'positive_attributes' and 'negative_attributes' keys.
        attr_list: attribute names defining the matrix columns. Defaults to the
            notebook-level ``all_attrl`` so existing one-argument calls keep
            working.

    Returns:
        (all_ids, all_pa, all_na): ``all_ids`` lists image ids in order of
        first appearance; ``all_pa`` / ``all_na`` are
        ``(len(all_ids), len(attr_list))`` float arrays counting, per image,
        the annotations that list each attribute as positive / negative.

    Raises:
        ValueError: if an annotation names an attribute not in ``attr_list``.
    """
    if attr_list is None:
        attr_list = all_attrl  # notebook-level list built by get_attributes

    # Column of the FIRST occurrence of each attribute, mirroring list.index.
    attr_col = {}
    for col, attr in enumerate(attr_list):
        attr_col.setdefault(attr, col)

    # Row per image id, assigned in order of first appearance.
    id_row = {}
    for jd in json_all:
        id_row.setdefault(jd['image_id'], len(id_row))
    all_ids = list(id_row)

    def _count(key):
        # Build one count matrix from the attribute lists stored under `key`.
        counts = np.zeros((len(all_ids), len(attr_list)))
        for jd in json_all:
            attrs = jd[key]
            try:
                inds = [attr_col[name] for name in attrs]
            except KeyError as err:
                # Keep the exception type that list.index raised.
                raise ValueError(f"{err.args[0]!r} is not in list") from None
            counts[id_row[jd['image_id']], inds] += 1
        return counts

    all_pa = _count('positive_attributes')
    all_na = _count('negative_attributes')

    return all_ids, all_pa, all_na

## Load json files – extract ids

In [4]:
# Load the annotation files (the training set is split across two parts)
train_json_file1 = "data/train_part1.json"
with open(train_json_file1, "r") as train_file1:
    train_json1 = json.load(train_file1)

train_json_file2 = "data/train_part2.json"
with open(train_json_file2, "r") as train_file2:
    train_json2 = json.load(train_file2)

train_json_all = train_json1 + train_json2

val_file_json = "data/val.json"
with open(val_file_json, "r") as val_file:
    val_json = json.load(val_file)

# Mapping from attribute type to the attribute names of that type
attribute_file_json="data/attribute_types.json"
with open(attribute_file_json, 'r') as attribute_file:
    vaw_att = json.load(attribute_file)

# fname="data/attribute_index.json"
# with open(fname, 'r') as f:
#    vaw_atti = json.load(f)

# Flat attribute list; its order defines the columns of the matrices below
all_attrl, total_attr = get_attributes(vaw_att, train_json_all)

# Image ids and positive-attribute count matrices for train and validation
all_ids, all_pa = get_ids_pa(train_json_all)

all_ids_val, all_pa_val = get_ids_pa(val_json)

# Save the results to files --- TO RUN -- only for first time
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError on the first write.
os.makedirs("output", exist_ok=True)

with open("output/all_ids.json", "w") as fp:
      json.dump(all_ids, fp)
np.save('output/all_pa',all_pa)

with open("output/all_ids_val.json", "w") as fp:
      json.dump(all_ids_val, fp)
np.save('output/all_pa_val',all_pa_val)

652 3010


## Setup / import model with hooks

In [5]:
device = "cpu"

model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")
model.to(device)
# The model is only used to extract embeddings, so switch it to evaluation
# mode; modules are created in training mode by default.
model.eval()
# NOTE(review): the image processor is loaded from the DINO (v1) ViT-B/16
# checkpoint while the model is DINOv2 ViT-B/14 — confirm this preprocessing
# (resize / normalisation) is what is intended for this model.
feature_extractor = ViTImageProcessor.from_pretrained("facebook/dino-vitb16")
impath = "data/VG_100K/"

def get_activation(name):
    """Return a forward hook that records a module's output under ``name``.

    The hook writes the detached output into the notebook-level ``activation``
    dict, which is looked up when the hook fires and must exist by then.
    """
    def _record_output(module, inputs, output):
        # Detach so the stored tensor does not keep the autograd graph alive.
        activation[name] = output.detach()

    return _record_output

# On every forward pass, store the output of `model.norm` in the global
# `activation` dict under the key "norm" (see get_activation).
model.norm.register_forward_hook(get_activation("norm"))

Using cache found in /Users/smarru/.cache/torch/hub/facebookresearch_dinov2_main


<torch.utils.hooks.RemovableHandle at 0x15b74ad90>

## Get embeddings

In [None]:
save_counter = 500
save_path = "output/"


def _save_embedding_chunk(count, emb, lactivation, ids, save_path):
    """Write one chunk of embeddings, `norm` activations and image ids to disk.

    ``count`` is the number of entries of ``all_ids`` visited so far; it is
    used as the suffix of the three output files.
    """
    emb_all = torch.stack(emb)
    lactivation_all = torch.stack(lactivation)
    print("Saving at ... " + str(count) + " in " + save_path)
    torch.save(emb_all, save_path + "emb_" + str(count) + ".pt")
    torch.save(lactivation_all, save_path + "lactivation_" + str(count) + ".pt")
    with open(save_path + "ids_" + str(count) + ".json", "w") as fp:
        json.dump(ids, fp)


# save_path is a filename prefix; make sure its directory exists before writing.
os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)

emb = []
lactivation = []
ids = []

# Inference only: without no_grad every stored embedding would keep its
# autograd graph alive and memory would grow with each image.
with torch.no_grad():
    for k, ui in enumerate(all_ids):
        # Checkpoint every `save_counter` ids. Skip the write if every image in
        # the chunk was filtered out (torch.stack fails on an empty list).
        if k > 0 and k % save_counter == 0 and emb:
            _save_embedding_chunk(k, emb, lactivation, ids, save_path)
            emb = []
            lactivation = []
            ids = []

        # Fresh dict per image; the forward hook on model.norm fills it.
        activation = {}

        # Context manager closes the image file once it has been processed.
        with Image.open(impath + ui + ".jpg") as im:
            # Images in mode "L" (single-channel grayscale) are skipped.
            if im.mode != "L":
                inputs = feature_extractor(images=im, return_tensors="pt")
                emb.append(model(inputs["pixel_values"]))
                lactivation.append(activation["norm"])
                ids.append(ui)

# Flush the trailing chunk: the loop only saves at multiples of save_counter,
# so the images after the last checkpoint would otherwise never be written.
if emb:
    _save_embedding_chunk(len(all_ids), emb, lactivation, ids, save_path)

## Call get embeddings

In [None]:
impath = "data/VG_100K/"

# NOTE(review): neither `compute_save_embeddings` nor `dinov2_vitb14` is
# defined in the visible part of this notebook (the loaded model is bound to
# `model`, and the embedding loop above is top-level code, not a function).
# Confirm where these names come from before running this cell.
compute_save_embeddings(
    all_ids,
    impath,
    dinov2_vitb14,
    feature_extractor,
    save_counter=10,
    save_path="output/test_",
)

## Generate train/test dataset

In [None]:
def get_im_ids(vaw_att, all_attrl, all_ids, all_pa, vk):
    """List the images carrying each attribute of the attribute group ``vk``.

    For every attribute in ``vaw_att[vk]`` (label = its position in that list),
    the rows of ``all_pa`` with a non-zero entry in the attribute's column are
    selected. An image appears once per attribute it carries.

    Returns:
        Three parallel flat lists: image ids (taken from ``all_ids``), labels
        (float values), and row indices into ``all_pa``.
    """
    # Resolve every column up front so an unknown attribute fails immediately.
    columns = [all_attrl.index(attr_name) for attr_name in vaw_att[vk]]

    fsel_image_ids = []
    fsel_image_lab = []
    fsel_image_idsi = []

    for label, column in enumerate(columns):
        rows = np.nonzero(all_pa[:, column])[0]
        fsel_image_ids.extend(all_ids[row] for row in rows)
        fsel_image_lab.extend(label * np.ones((len(rows))))
        fsel_image_idsi.extend(rows)

    return fsel_image_ids, fsel_image_lab, fsel_image_idsi


def get_train_data(emb_all, fsel_image_ids, fall_emb_ids, fsel_image_lab):
    """Pair stored embeddings with one-hot labels for the selected images.

    Args:
        emb_all: 3-D tensor of embeddings, indexed along its first axis by
            position in ``fall_emb_ids``.
        fsel_image_ids: image ids selected for an attribute group.
        fall_emb_ids: image ids aligned with the first axis of ``emb_all``.
        fsel_image_lab: label per entry of ``fsel_image_ids``; must be
            non-empty, since the one-hot width is ``max(label) + 1``.

    Returns:
        (train_emb, train_labs): embeddings with singleton non-batch axes
        removed, and the matching one-hot label rows. Selected ids that have no
        stored embedding are skipped. At least one id must match, otherwise
        stacking the empty list fails.
    """
    # First-occurrence position of every stored id (same result as list.index,
    # without rescanning the list for each selected image).
    emb_row = {}
    for row, emb_id in enumerate(fall_emb_ids):
        emb_row.setdefault(emb_id, row)

    train_emb = []
    train_labs = []
    lab_len = int(np.max(np.array(fsel_image_lab))) + 1
    for fii, lab in zip(fsel_image_ids, fsel_image_lab):
        row = emb_row.get(fii)
        if row is None:
            continue
        train_emb.append(emb_all[row, :, :])
        Y = torch.zeros((lab_len))
        Y[int(lab)] = 1
        train_labs.append(Y)

    stacked = torch.stack(train_emb)
    # Drop singleton axes but always keep the batch axis: a bare .squeeze()
    # would also remove it when exactly one sample is selected, leaving the
    # embeddings misaligned with the 2-D label tensor.
    kept_dims = [dim for dim in stacked.shape[1:] if dim != 1]
    train_emb = stacked.reshape(stacked.shape[0], *kept_dims)
    train_labs = torch.stack(train_labs)

    return train_emb, train_labs

def get_attributes(vaw_att, train_json_all):
    """Build the flat attribute list used to index attribute columns.

    NOTE(review): a function with this name is also defined earlier in the
    notebook; this later definition replaces it once the cell runs. Consider
    keeping a single definition.

    Args:
        vaw_att: mapping from attribute-type name to a list of attribute names.
        train_json_all: iterable of annotation dicts, each with a
            'positive_attributes' list.

    Returns:
        (all_attrl, total_attr): ``all_attrl`` is every attribute from
        ``vaw_att`` (in mapping order) followed by each positive attribute from
        the annotations that is not listed in ``vaw_att``; ``total_attr`` is
        the number of attributes coming from ``vaw_att``.

    Side effects:
        Prints ``total_attr`` and ``len(all_attrl)``.
    """
    all_attr = [attr for group in vaw_att.values() for attr in group]
    total_attr = len(all_attr)

    # Set membership replaces a linear scan of the list for every annotation
    # attribute (assumes attribute names are hashable, e.g. strings).
    known_attrs = set(all_attr)

    # An unlisted attribute is appended once per occurrence (repeats are kept),
    # so the resulting column layout is unchanged.
    missing_attrs = [
        pa
        for jd in train_json_all
        for pa in jd['positive_attributes']
        if pa not in known_attrs
    ]

    all_attrl = all_attr + missing_attrs
    print(total_attr, len(all_attrl))
    return all_attrl, total_attr



def get_attributes_negative(vaw_att, train_json_all):
    """Build the flat attribute list, extended with unlisted negative attributes.

    Args:
        vaw_att: mapping from attribute-type name to a list of attribute names.
        train_json_all: iterable of annotation dicts, each with a
            'negative_attributes' list.

    Returns:
        (all_attrl, total_attr): ``all_attrl`` is every attribute from
        ``vaw_att`` (in mapping order) followed by each negative attribute from
        the annotations that is not listed in ``vaw_att``; ``total_attr`` is
        the number of attributes coming from ``vaw_att``.

    Side effects:
        Prints ``total_attr`` and ``len(all_attrl)``.
    """
    all_attr = [attr for group in vaw_att.values() for attr in group]
    total_attr = len(all_attr)

    # Set membership replaces a linear scan of the list for every annotation
    # attribute (assumes attribute names are hashable, e.g. strings).
    known_attrs = set(all_attr)

    # An unlisted attribute is appended once per occurrence (repeats are kept),
    # so the resulting column layout is unchanged.
    missing_attrs = [
        na
        for jd in train_json_all
        for na in jd['negative_attributes']
        if na not in known_attrs
    ]

    all_attrl = all_attr + missing_attrs
    print(total_attr, len(all_attrl))
    return all_attrl, total_attr


## Combine embeddings to single variable