# Create embeddings with the transformers library

We use the Hugging Face transformers library to create embeddings for an audio dataset. For more data-centric AI workflows, check out our [Awesome Open Data-centric AI](https://github.com/Renumics/awesome-open-data-centric-ai) list on GitHub.







## tl;dr: Play as callable functions

Install the required packages with pip

In [None]:
!pip install renumics-spotlight transformers torch datasets umap-learn numpy

Play as copy-and-paste functions

In [None]:
import datasets
from transformers import AutoFeatureExtractor, AutoModel, ASTForAudioClassification
import torch
from renumics import spotlight
import pandas as pd
import umap
import numpy as np
import requests
import json

def __set_device():
    """Pick the torch device string to compute on.

    Returns:
        "cuda" when torch reports CUDA as available (the CUDA cache is
        emptied first), otherwise "cpu".
    """
    if not torch.cuda.is_available():
        return "cpu"

    # Release cached GPU memory before new work is placed on the device.
    torch.cuda.empty_cache()
    return "cuda"


def extract_embeddings(model, feature_extractor):
    """Build a batch function that computes one embedding per audio clip.

    Args:
        model: Callable model exposing a ``device`` attribute; its output must
            provide ``last_hidden_state``.
        feature_extractor: Callable that converts a list of raw audio arrays
            into model inputs. It is called with ``raw_speech``,
            ``return_tensors="pt"`` and ``padding=True``, and its result must
            support ``.to(device)``.

    Returns:
        A function that takes a batch with an ``"audio"`` column (each element
        holding an ``"array"`` entry) and returns ``{"embedding": tensor}``,
        for use with a batched ``Dataset.map``.
    """
    device = model.device

    def pp(batch):
        audios = [element["array"] for element in batch["audio"]]
        inputs = feature_extractor(
            raw_speech=audios, return_tensors="pt", padding=True
        ).to(device)

        # Inference only: without no_grad every batch would record an autograd
        # graph and the returned embeddings would still require gradients.
        with torch.no_grad():
            hidden_states = model(**inputs).last_hidden_state

        # Keep position 0 of each sequence as the clip embedding (presumably
        # the classification token -- confirm for the chosen model).
        embeddings = hidden_states[:, 0].cpu()

        return {"embedding": embeddings}

    return pp


def huggingface_embedding(dataset, modelname, batched=True, batch_size=8):
    """Add an "embedding" column to ``dataset`` using a pretrained model.

    Args:
        dataset: Dataset object providing ``map``; rows need an ``"audio"``
            column as consumed by ``extract_embeddings``.
        modelname: Checkpoint name passed to ``from_pretrained``.
        batched: Forwarded to ``dataset.map``.
        batch_size: Forwarded to ``dataset.map``.

    Returns:
        The dataset returned by ``dataset.map``.
    """
    # Load the feature extractor and the model weights for the checkpoint.
    extractor = AutoFeatureExtractor.from_pretrained(modelname, padding=True)
    network = AutoModel.from_pretrained(modelname, output_hidden_states=True)

    # Place the model on the selected device, then map the embedding
    # function over the dataset.
    network = network.to(__set_device())
    return dataset.map(
        extract_embeddings(network, extractor),
        batched=batched,
        batch_size=batch_size,
    )



## Step-by-step example on speech-commands

### Load speech-commands from Huggingface hub

Load the full dataset; the enrichment below is applied to a random subset of it

In [None]:
import random

# Load Speech Commands v0.01 with split="all" as one dataset object.
dataset = datasets.load_dataset("speech_commands", "v0.01", split="all")
# Total number of rows, used when drawing random indices for the subset.
num_rows = dataset.num_rows
# Class names of the "label" feature.
labels = dataset.features["label"].names

In [None]:
# Draw 100 random row indices (duplicates are possible). randrange excludes
# the upper bound, so every index is valid; randint(0, num_rows) could return
# num_rows itself, which is one past the last row.
subset_dataset = dataset.select([random.randrange(num_rows) for _ in range(100)])

Let's have a look at all of the labels that we want to predict

In [None]:
# Show the class names read from the dataset's "label" feature.
print(labels)

### Compute embedding with audio transformer from Huggingface

In [None]:
# Run the embedding helper on the sampled subset with the given checkpoint;
# the result carries an additional "embedding" column.
dataset_enriched = huggingface_embedding(subset_dataset, "MIT/ast-finetuned-speech-commands-v2")

### Reduce embeddings for faster visualization

In [None]:
# Collect the per-row embedding vectors into a single array for UMAP.
embeddings = np.array(dataset_enriched["embedding"])
embeddings = np.stack(embeddings)

# Project the embeddings with UMAP's default settings.
reducer = umap.UMAP()
reduced_embedding = reducer.fit_transform(embeddings)

# Store one reduced vector per row next to the original columns.
dataset_enriched = dataset_enriched.add_column(
    "embedding_reduced",
    list(reduced_embedding),
)

### Convert the dataset to a pandas DataFrame

In [None]:
# Convert the enriched dataset to a pandas DataFrame.
df = dataset_enriched.to_pandas()

In [None]:
# Preview the first ten rows of the DataFrame.
df.head(10)

In [None]:
# Leave out the full-size "embedding" column; "embedding_reduced" stays and is
# the column handed to Spotlight below.
df_show = df.drop(columns=["embedding"])

### Perform EDA with Spotlight

> ⚠️ Running Spotlight in Colab currently has severe limitations (slow, no similarity map, no layouts) due to Colab restrictions (e.g. no websocket support). Run the notebook locally for the full Spotlight experience.

In [None]:
# Launch Spotlight on df_show; Google Colab needs a different layout and setup.
import sys

IN_COLAB = "google.colab" in sys.modules

# Pick the layout file that matches the environment.
if IN_COLAB:
    layout_url = "https://raw.githubusercontent.com/Renumics/spotlight/main/playbook/rookie/embedding_layout_colab.json"
else:
    layout_url = "https://raw.githubusercontent.com/Renumics/spotlight/main/playbook/rookie/embedding_layout.json"

# Download the layout once. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status reports an HTTP error directly
# instead of letting an error page fail later as a JSON decode error.
response = requests.get(layout_url, timeout=30)
response.raise_for_status()
layout = spotlight.layout.nodes.Layout(**json.loads(response.text))

if IN_COLAB:
    # visualization in Google Colab only works in Chrome and does not support
    # websockets, so the reduced embedding is exposed as two plain columns
    df_show["embx"] = [emb[0] for emb in df_show["embedding_reduced"]]
    df_show["emby"] = [emb[1] for emb in df_show["embedding_reduced"]]
    port = 50123
    spotlight.show(df_show, port=port, dtype={"audio": spotlight.Audio}, layout=layout)
    from google.colab.output import eval_js  # type: ignore

    # Print the Colab proxy URL for the port Spotlight was started on.
    print(str(eval_js(f"google.colab.kernel.proxyPort({port}, {{'cache': true}})")))

else:
    spotlight.show(
        df_show,
        dtype={"audio": spotlight.Audio, "embedding_reduced": spotlight.Embedding},
        layout=layout,
    )

### Optional: Save enriched dataframe to disk

In [None]:
#dataset_enriched.to_parquet('dataset_audio_annotated_and_embedding.parquet.gzip', compression='gzip')