# Image Embeddings with HuggingFace

Python Imports

In [1]:
import torch
from transformers import AutoImageProcessor, ViTMAEModel
from PIL import Image
from torchvision.datasets.folder import ImageFolder
from datasets import load_dataset, Dataset
import requests
import glob
import os
import numpy as np

Import Images

In [2]:
image_dataset = load_dataset("imagefolder", data_dir="data/")

# Remove train/test segmentation
image_dataset = image_dataset['train']

Found cached dataset imagefolder (/Users/christophernorman/.cache/huggingface/datasets/imagefolder/default-02cc048519d4f028/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


  0%|          | 0/1 [00:00<?, ?it/s]

Load the image processor and model

In [3]:
image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
model = ViTMAEModel.from_pretrained("facebook/vit-mae-base")

Some weights of the model checkpoint at facebook/vit-mae-base were not used when initializing ViTMAEModel: ['decoder.decoder_layers.1.attention.attention.query.weight', 'decoder.decoder_layers.0.attention.attention.query.weight', 'decoder.decoder_layers.1.attention.attention.query.bias', 'decoder.decoder_pos_embed', 'decoder.decoder_layers.1.attention.attention.key.bias', 'decoder.decoder_layers.4.attention.attention.key.weight', 'decoder.decoder_layers.3.intermediate.dense.bias', 'decoder.decoder_layers.5.layernorm_after.bias', 'decoder.decoder_layers.1.layernorm_before.bias', 'decoder.decoder_layers.4.attention.attention.query.weight', 'decoder.decoder_layers.0.attention.attention.key.weight', 'decoder.decoder_layers.3.output.dense.bias', 'decoder.decoder_layers.3.output.dense.weight', 'decoder.decoder_layers.0.layernorm_before.bias', 'decoder.decoder_layers.4.intermediate.dense.bias', 'decoder.decoder_layers.0.attention.output.dense.weight', 'decoder.decoder_layers.0.layernorm_befor

Preprocess images

In [4]:
# Convert images to RGB if they are not already
image_dataset = image_dataset.map(lambda img: {"image": img['image'].convert("RGB")})

# Process images using HuggingFace processor
image_dataset = image_dataset.map(lambda img: {"preprocessed_image": image_processor(images=img['image'], return_tensors="pt")})

# Set dataset format to PyTorch
image_dataset = image_dataset.with_format("pt")

Loading cached processed dataset at /Users/christophernorman/.cache/huggingface/datasets/imagefolder/default-02cc048519d4f028/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f/cache-ee8ae1de39ca03ec.arrow
Loading cached processed dataset at /Users/christophernorman/.cache/huggingface/datasets/imagefolder/default-02cc048519d4f028/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f/cache-399c3bc651318e73.arrow


Pass the images through the model and extract embeddings

In [5]:
image_dataset = image_dataset.map(lambda img: {"embedding": np.squeeze(model(**img["preprocessed_image"]).last_hidden_state[:, 0])})
image_dataset

Loading cached processed dataset at /Users/christophernorman/.cache/huggingface/datasets/imagefolder/default-02cc048519d4f028/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f/cache-1f30272df30f3065.arrow


Dataset({
    features: ['image', 'name', 'preprocessed_image', 'embedding'],
    num_rows: 9
})

In [6]:
image_dataset['embedding']

tensor([[-0.1671,  0.0832,  0.2001,  ..., -0.0491, -0.0986,  0.1635],
        [-0.2413, -0.0054,  0.1378,  ..., -0.1053, -0.2717,  0.2162],
        [-0.1123,  0.1328,  0.1023,  ..., -0.1467, -0.3044,  0.3309],
        ...,
        [-0.1927,  0.1057,  0.1150,  ..., -0.0388, -0.1760,  0.1588],
        [-0.1013, -0.0240,  0.1162,  ..., -0.0916, -0.2549,  0.0660],
        [-0.1493, -0.0404,  0.0695,  ..., -0.2460, -0.1252,  0.0750]])

## Visualisation

Export data to TSV file and metadata for visualisation

In [7]:
def save_embeddings_to_tsv(image_dataset: Dataset):
    """Saves embeddings as tab seperated values to be used at https://projector.tensorflow.org/ .

    Args:
        image_dataset (datasets.Dataset): Dataset containing an embedding and a name column.
    """
    # Save embedding values seperately
    with open("embeddings.tsv", "w") as file:
        for row in image_dataset:
            for i in row['embedding'].numpy():
                file.write(f"{i}\t")
            file.write("\n")

    # Save metadata
    with open("metadata.tsv", "w") as file:
        for row in image_dataset:
            file.write(f"{row['name']}\n")

In [8]:
save_embeddings_to_tsv(image_dataset)

Upload the saved csv files into the Embedding Projector here: https://projector.tensorflow.org/