# Image Embeddings with HuggingFace

Python Imports

In [63]:
import torch
from transformers import AutoImageProcessor, ViTMAEModel, PreTrainedModel
from PIL import Image
from torchvision.datasets.folder import ImageFolder
from datasets import load_dataset, Dataset
import requests
import glob
import os
import numpy as np

Import Images

In [64]:
image_dataset = load_dataset("imagefolder", data_dir="data/")

# Remove train/test segmentation
image_dataset = image_dataset['train']

Found cached dataset imagefolder (/Users/christophernorman/.cache/huggingface/datasets/imagefolder/default-073897f9ede2efb8/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


  0%|          | 0/1 [00:00<?, ?it/s]

Load the image processor and model

In [65]:
image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
model = ViTMAEModel.from_pretrained("facebook/vit-mae-base")

Some weights of the model checkpoint at facebook/vit-mae-base were not used when initializing ViTMAEModel: ['decoder.decoder_layers.3.layernorm_before.weight', 'decoder.decoder_layers.1.intermediate.dense.weight', 'decoder.decoder_layers.3.attention.attention.key.weight', 'decoder.decoder_layers.3.intermediate.dense.weight', 'decoder.decoder_layers.0.output.dense.weight', 'decoder.decoder_layers.5.layernorm_before.weight', 'decoder.decoder_layers.0.attention.attention.key.weight', 'decoder.decoder_layers.5.output.dense.bias', 'decoder.decoder_layers.1.layernorm_after.bias', 'decoder.decoder_layers.3.attention.attention.value.weight', 'decoder.decoder_norm.weight', 'decoder.decoder_layers.1.attention.attention.key.weight', 'decoder.decoder_layers.3.attention.attention.value.bias', 'decoder.decoder_layers.5.attention.attention.value.weight', 'decoder.decoder_layers.4.layernorm_before.weight', 'decoder.decoder_layers.1.intermediate.dense.bias', 'decoder.decoder_layers.7.layernorm_after.we

Preprocess images

In [66]:
def preprocess_image(img: Image, image_processor: AutoImageProcessor) -> torch.tensor:
    """Preprocess image using a HuggingFace auto image processor.

    Args:
        img (Image): Pillow image
        image_processor (AutoImageProcessor): HuggingFace image processor

    Returns:
        torch.tensor: Preprocessed image as a Torch tensor
    """
    # Convert image to RGB if it is not already.
    img = img.convert("RGB")
    
    return image_processor(images = img, return_tensors = "pt")

In [67]:
# Process images using HuggingFace processor
image_dataset = image_dataset.map(lambda x: {"preprocessed_image": preprocess_image(x['image'], image_processor=image_processor)})

# Set dataset format to PyTorch
image_dataset = image_dataset.with_format("pt")

Loading cached processed dataset at /Users/christophernorman/.cache/huggingface/datasets/imagefolder/default-073897f9ede2efb8/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f/cache-7c14fc7d2ae45fdd.arrow


Pass the images through the model and extract embeddings

In [68]:
def create_embedding(model, preprocessed_image: torch.Tensor) -> torch.Tensor:
    """Passes a preprocessed image through a pretrained embedding model.

    Args:
        model (PreTrainedModel): Pretrained HuggingFace PyTorch embedding model.
        preprocessed_image (torch.Tensor): Preprocessed image as a PyTorch Tensor

    Returns:
        torch.Tensor: Embedding vector shape (1, 768) as a Tensor
    """
    embedding = model(**preprocessed_image).last_hidden_state[:, 0]
    return np.squeeze(embedding)

In [69]:
image_dataset = image_dataset.map(lambda img: {"embedding": create_embedding(model, img["preprocessed_image"])})
image_dataset

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Dataset({
    features: ['image', 'name', 'preprocessed_image', 'embedding'],
    num_rows: 9
})

In [70]:
image_dataset['embedding']

tensor([[-1.4462e-01,  1.1187e-01,  2.2245e-01,  ..., -1.2107e-01,
         -1.3522e-01,  1.4581e-01],
        [-2.2478e-01,  2.7582e-02,  1.3079e-01,  ..., -7.0743e-02,
         -2.3576e-01,  1.9288e-01],
        [-1.3256e-01,  1.1413e-01,  9.1772e-02,  ..., -1.6287e-01,
         -2.9986e-01,  3.3026e-01],
        ...,
        [-2.0087e-01,  9.5759e-02,  1.0681e-01,  ..., -7.0807e-02,
         -1.5886e-01,  1.4101e-01],
        [-6.5135e-02, -1.2994e-05,  9.4249e-02,  ..., -1.3041e-01,
         -2.4108e-01,  1.0788e-01],
        [-1.5037e-01, -1.5001e-02,  5.8316e-02,  ..., -2.4850e-01,
         -1.5772e-01,  7.6791e-02]])

## Visualisation

Export data to TSV file and metadata for visualisation

In [71]:
def save_embeddings_to_tsv(image_dataset: Dataset):
    """Saves embeddings as tab seperated values to be used at https://projector.tensorflow.org/ .

    Args:
        image_dataset (datasets.Dataset): Dataset containing an embedding and a name column.
    """
    # Save embedding values seperately
    with open("embeddings.tsv", "w") as file:
        for row in image_dataset:
            for i in row['embedding'].numpy():
                file.write(f"{i}\t")
            file.write("\n")

    # Save metadata
    with open("metadata.tsv", "w") as file:
        for row in image_dataset:
            file.write(f"{row['name']}\n")

In [72]:
save_embeddings_to_tsv(image_dataset)

Upload the saved csv files into the Embedding Projector here: https://projector.tensorflow.org/