# Image Embeddings with HuggingFace

Python Imports

In [86]:
import torch
from transformers import AutoImageProcessor, ViTMAEModel, PreTrainedModel
from PIL import Image
from torchvision.datasets.folder import ImageFolder
from datasets import load_dataset, Dataset
import requests
import glob
import os
import numpy as np

Import Images

In [87]:
image_dataset = load_dataset("imagefolder", data_dir="data/")

# Remove train/test segmentation
image_dataset = image_dataset['train']

Found cached dataset imagefolder (/Users/christophernorman/.cache/huggingface/datasets/imagefolder/default-073897f9ede2efb8/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


  0%|          | 0/1 [00:00<?, ?it/s]

Load the image processor and model

In [88]:
image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
model = ViTMAEModel.from_pretrained("facebook/vit-mae-base")

Some weights of the model checkpoint at facebook/vit-mae-base were not used when initializing ViTMAEModel: ['decoder.decoder_layers.6.layernorm_after.bias', 'decoder.decoder_layers.0.attention.attention.value.bias', 'decoder.decoder_layers.1.layernorm_before.weight', 'decoder.decoder_layers.7.layernorm_after.bias', 'decoder.decoder_layers.1.layernorm_before.bias', 'decoder.decoder_layers.6.intermediate.dense.bias', 'decoder.decoder_layers.5.attention.attention.key.bias', 'decoder.decoder_layers.5.attention.output.dense.bias', 'decoder.decoder_layers.6.output.dense.weight', 'decoder.decoder_layers.2.attention.attention.key.bias', 'decoder.decoder_layers.5.layernorm_after.weight', 'decoder.decoder_layers.0.attention.attention.key.bias', 'decoder.decoder_layers.1.attention.attention.query.weight', 'decoder.decoder_layers.5.attention.attention.key.weight', 'decoder.decoder_layers.5.output.dense.bias', 'decoder.decoder_layers.6.attention.attention.value.weight', 'decoder.decoder_layers.7.ou

Preprocess images

In [89]:
def preprocess_image(img: Image, image_processor: AutoImageProcessor) -> torch.tensor:
    """Preprocess image using a HuggingFace auto image processor.

    Args:
        img (Image): Pillow image
        image_processor (AutoImageProcessor): HuggingFace image processor

    Returns:
        torch.tensor: Preprocessed image as a Torch tensor
    """
    # Convert image to RGB if it is not already.
    img = img.convert("RGB")
    
    return image_processor(images = img, return_tensors = "pt")

In [90]:
# Process images using HuggingFace processor
image_dataset = image_dataset.map(lambda x: {"preprocessed_image": preprocess_image(x['image'], image_processor=image_processor)})

# Set dataset format to PyTorch
image_dataset = image_dataset.with_format("pt")

Loading cached processed dataset at /Users/christophernorman/.cache/huggingface/datasets/imagefolder/default-073897f9ede2efb8/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f/cache-7c14fc7d2ae45fdd.arrow


Pass the images through the model and extract embeddings

In [91]:
def create_embedding(model, preprocessed_image: torch.Tensor) -> torch.Tensor:
    """Passes a preprocessed image through a pretrained embedding model.

    Args:
        model (PreTrainedModel): Pretrained HuggingFace PyTorch embedding model.
        preprocessed_image (torch.Tensor): Preprocessed image as a PyTorch Tensor

    Returns:
        torch.Tensor: Embedding vector shape (1, 768) as a Tensor
    """
    embedding = model(**preprocessed_image).last_hidden_state[:, 0]

    return np.squeeze(embedding)

In [92]:
image_dataset = image_dataset.map(lambda img: {"embedding": create_embedding(model, img["preprocessed_image"])})
image_dataset

Loading cached processed dataset at /Users/christophernorman/.cache/huggingface/datasets/imagefolder/default-073897f9ede2efb8/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f/cache-b22d65f36126e755.arrow


Dataset({
    features: ['image', 'name', 'preprocessed_image', 'embedding'],
    num_rows: 9
})

Print structure of an embedding

In [93]:
image_dataset[0]['embedding']

tensor([-1.7841e-01,  7.8200e-02, -2.8591e-01,  1.2317e-01, -9.1169e-01,
        -9.8419e-01,  1.6852e-01,  1.8547e-01,  2.1094e-01, -3.4289e-01,
        -4.1250e-02,  8.2595e-02,  3.2813e-01,  6.8463e-03,  2.3943e-03,
         1.2862e-02,  8.1512e-02, -2.3367e-01, -1.6421e-01,  4.2300e+00,
         1.4997e-01,  9.9038e-02, -3.6799e-01, -8.9632e-03, -7.6403e-02,
        -2.1601e-01, -4.9117e-02,  3.2531e-02, -1.7313e-01, -1.9451e-01,
         1.1379e-01, -2.0484e-02,  1.9072e-01,  1.7569e-01, -1.2619e-01,
         3.4783e-02,  5.0942e-04,  6.6755e-02, -2.2290e-01, -2.6659e-01,
         1.0214e-01, -6.0784e-02, -1.8836e-01, -3.4465e-01, -2.2797e-01,
        -5.6797e-01,  1.1735e-01, -1.0172e-03, -4.0231e-01,  8.1782e-02,
        -7.9657e-02, -3.0384e-02,  1.6060e-01, -1.1649e-01, -3.5328e-01,
         6.8403e-02, -1.8808e-01,  1.1057e-01,  1.2938e-01, -8.5726e-02,
         2.0943e-01, -2.3959e-01,  3.0581e-02,  3.4867e-01, -2.1898e-01,
         8.4451e-02, -1.6469e-01, -1.4552e-01, -7.3

In [94]:
image_dataset[0]['embedding'].shape

torch.Size([768])

## Visualisation

Export data to TSV file and metadata for visualisation

In [95]:
def save_embeddings_to_tsv(image_dataset: Dataset):
    """Saves embeddings as tab seperated values to be used at https://projector.tensorflow.org/ .

    Args:
        image_dataset (datasets.Dataset): Dataset containing an embedding and a name column.
    """
    # Save embedding values seperately
    with open("embeddings.tsv", "w") as file:
        for row in image_dataset:
            for i in row['embedding'].numpy():
                file.write(f"{i}\t")
            file.write("\n")

    # Save metadata
    with open("metadata.tsv", "w") as file:
        for row in image_dataset:
            file.write(f"{row['name']}\n")

In [96]:
save_embeddings_to_tsv(image_dataset)

Upload the saved csv files into the Embedding Projector here: https://projector.tensorflow.org/