# Create image embeddings with Huggingface

We use the [Huggingface transformers library](https://github.com/huggingface/transformers) to create an embedding for an image dataset.

More information about this play can be found in the Spotlight documentation: [Create image embeddings with the Huggingface transformer library](https://renumics.com/docs/playbook/huggingface-embedding)

For more data-centric AI workflows, check out our [Awesome Open Data-centric AI](https://github.com/Renumics/awesome-open-data-centric-ai) list on GitHub.


## tldr

In [None]:
# @title Install required packages with PIP

!pip install renumics-spotlight transformers torch datasets

In [None]:
# @title Play as copy-n-paste functions

import datasets
from transformers import AutoFeatureExtractor, AutoModel
import torch
from renumics import spotlight
import pandas as pd
import requests
import json


def extract_embeddings(model, feature_extractor, image_name="image"):
    """Build a batch-mapping function that embeds images with ``model``.

    Args:
        model: Callable model invoked with the keyword arguments produced by
            ``feature_extractor``; its output must expose
            ``last_hidden_state``. Its ``device`` attribute decides where the
            inputs are moved before the forward pass.
        feature_extractor: Callable accepting ``images=...`` and
            ``return_tensors="pt"``; its result must support ``.to(device)``.
        image_name: Key of the batch entry that holds the images.

    Returns:
        A function that maps a batch (dict of columns) to
        ``{"embedding": tensor}``. The tensor holds, for every image, the
        hidden state at index 0 of the second axis (presumably the [CLS]
        token for ViT-style models), moved to the CPU.
    """
    device = model.device

    def pp(batch):
        # Honour the configured column name instead of a hard-coded "image".
        images = batch[image_name]
        inputs = feature_extractor(images=images, return_tensors="pt").to(device)

        # Pure inference: do not record an autograd graph for the forward pass.
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0].cpu()

        return {"embedding": embeddings}

    return pp


def huggingface_embedding(
    df,
    image_name="image",
    inplace=False,
    modelname="google/vit-base-patch16-224",
    batched=True,
    batch_size=24,
):
    """Compute an image embedding for every row of ``df``.

    Args:
        df: pandas DataFrame containing the images.
        image_name: Name of the column that is cast to ``datasets.Image``.
        inplace: If true, write the result into ``df["embedding"]`` and
            return ``None``; otherwise leave ``df`` untouched.
        modelname: Model identifier passed to ``from_pretrained`` for both
            the feature extractor and the model.
        batched: Forwarded to ``Dataset.map``.
        batch_size: Forwarded to ``Dataset.map``.

    Returns:
        ``None`` when ``inplace`` is true, otherwise a DataFrame with a
        single ``"embedding"`` column that shares the index of ``df``.
    """
    # initialize huggingface model
    feature_extractor = AutoFeatureExtractor.from_pretrained(modelname)
    model = AutoModel.from_pretrained(modelname, output_hidden_states=True)

    # create huggingface dataset from df
    dataset = datasets.Dataset.from_pandas(df).cast_column(image_name, datasets.Image())

    # compute embedding (on the GPU when one is available)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    extract_fn = extract_embeddings(model.to(device), feature_extractor, image_name)
    updated_dataset = dataset.map(extract_fn, batched=batched, batch_size=batch_size)

    df_temp = updated_dataset.to_pandas()

    # Rows keep their order through the mapping, but the frame coming back
    # from the dataset is not guaranteed to carry the caller's index labels.
    # Attach the values by position to the original index so that neither the
    # column assignment nor a later concat can misalign (or NaN-fill) rows.
    embedding = pd.Series(
        df_temp["embedding"].to_numpy(), index=df.index, name="embedding"
    )

    if inplace:
        df["embedding"] = embedding
        return None

    return embedding.to_frame()

## Step-by-step example on CIFAR-100

### Load CIFAR-100 from Huggingface hub and convert it to Pandas dataframe

In [None]:
# Fetch the training split from the Huggingface hub ...
dataset = datasets.load_dataset(
    "renumics/cifar100-enriched",
    split="train",
)
# ... and turn it into a pandas DataFrame for the following steps.
df = dataset.to_pandas()

### Compute embedding with vision transformer from Huggingface

In [None]:
# we only use a subset in this example in order to keep computation times low
no_samples = 100

# Take the first `no_samples` rows and remove the existing embedding columns,
# which are recomputed below. `drop` returns a new frame, so the slice of the
# original data is never modified in place.
df = df[:no_samples].drop(columns=["embedding", "embedding_reduced"])

df_emb = huggingface_embedding(df, modelname="google/vit-base-patch16-224")
# append the freshly computed "embedding" column next to the existing columns
df = pd.concat([df, df_emb], axis=1)

### Reduce embeddings for faster visualization

In [None]:
import umap
import numpy as np

# Stack the per-row embedding vectors into a single array and report its shape.
embeddings = np.stack(list(df["embedding"]))
print(embeddings.shape)

# Fit UMAP with its default settings and project the embeddings.
reducer = umap.UMAP()
reduced_embedding = reducer.fit_transform(embeddings)

# Store the projection as plain Python lists, one entry per row.
df["embedding_reduced"] = np.asarray(reduced_embedding).tolist()

### Perform EDA with Spotlight

> ⚠️ Running Spotlight in Colab currently has severe limitations (slow, no similarity map, no layouts) due to Colab restrictions (e.g. no websocket support). Run the notebook locally for the full Spotlight experience.

In [None]:
import sys

# The full embedding and the probability vectors are not shown in the viewer.
df_show = df.drop(columns=["embedding", "probabilities"])

# handle google colab differently
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # visualization in Google Colab only works in Chrome and does not support websockets, we need some hacks to visualize something
    # Expose the two reduced coordinates as plain scalar columns.
    df_show["embx"] = [emb[0] for emb in df_show["embedding_reduced"]]
    df_show["emby"] = [emb[1] for emb in df_show["embedding_reduced"]]
    layout_url = "https://raw.githubusercontent.com/Renumics/spotlight/main/playbook/rookie/embedding_layout_colab.json"
    column_dtypes = {"image": spotlight.Image}
else:
    layout_url = "https://raw.githubusercontent.com/Renumics/spotlight/main/playbook/rookie/embedding_layout.json"
    column_dtypes = {"image": spotlight.Image, "embedding_reduced": spotlight.Embedding}

# Download the layout once for whichever environment applies. Fail early with
# a clear HTTP error instead of a confusing JSON decoding error, and do not
# hang forever if the server does not answer.
response = requests.get(layout_url, timeout=30)
response.raise_for_status()
layout = spotlight.layout.nodes.Layout(**json.loads(response.text))

if IN_COLAB:
    port = 50123
    spotlight.show(df_show, port=port, dtype=column_dtypes, layout=layout)
    # google.colab only exists inside Colab, hence the local import
    from google.colab.output import eval_js  # type: ignore

    # print the proxied URL under which the viewer is reachable from the browser
    print(str(eval_js(f"google.colab.kernel.proxyPort({port}, {{'cache': true}})")))
else:
    spotlight.show(
        df_show,
        dtype=column_dtypes,
        layout=layout,
    )