## How I created an animation of the embeddings during fine-tuning
### Using Cleanlab, PCA, and Procrustes to visualize ViT fine-tuning on CIFAR-10
This notebook is part of an [article at Towards AI.](https://pub.towardsai.net/how-i-created-an-animation-of-the-embeddings-during-fine-tuning-2b8bdf49f822)

Outline:
- Preparation: Fine-tuning
- Create Embeddings
- Definitions of functions for Outliers, PCA and Procrustes
- Review in Spotlight
- Create the animation


## Preparation: Fine-tuning
The fine-tuning setup is based on the Hugging Face image classification guide: https://huggingface.co/docs/transformers/tasks/image_classification
- Load dataset CIFAR10
- Load google/vit-base-patch16-224-in21k
- Fine-tune the model on CIFAR10
- Store a checkpoint of the fine-tuned model for each frame of the animation

In [None]:

# Install required libraries
!pip install transformers[torch] datasets pandas pillow cleanlab scipy matplotlib imageio renumics-spotlight

In [None]:
# Load the CIFAR-10 dataset; the cells below access its "train" and "test" entries.
from datasets import load_dataset

ds = load_dataset("cifar10")

In [None]:
# Build the two lookup tables between class names and their (stringified) index:
# label2id maps name -> "index", id2label maps "index" -> name.
labels = ds["train"].features["label"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [None]:
# Load the image preprocessor that belongs to the base checkpoint.
# This will resize the images to the correct size for the model.
# `checkpoint` is reused further down to load the model itself.
from transformers import AutoImageProcessor

checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [None]:
# Take a list of PIL images and turn them to pixel values
def transform(example_batch):
    """Turn a batch of PIL images into model inputs.

    Reads the images from ``example_batch["img"]``, converts each one to RGB,
    runs them through the module-level ``image_processor`` (PyTorch tensors)
    and attaches the batch's ``"label"`` entry to the result.
    """
    rgb_images = [image.convert("RGB") for image in example_batch["img"]]
    inputs = image_processor(rgb_images, return_tensors="pt")
    inputs["label"] = example_batch["label"]
    return inputs


# Attach `transform` to the dataset via with_transform; the resulting
# prepared_ds is what the Trainer consumes below.
prepared_ds = ds.with_transform(transform)

In [None]:
# Create the data collator that is handed to the Trainer below to assemble batches
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [None]:
# Load the pre-trained model with AutoModelForImageClassification.
# The number of labels and both label mappings come from the CIFAR-10 class names collected above.
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# We use an additional callback to write the loss values over time into a .csv file.
from transformers import TrainerCallback


class PrinterCallback(TrainerCallback):
    """Append every training-loss log entry to ``log.csv``.

    One CSV row (no header) is written per log event that carries all of
    ``LOG_COLUMNS``. Log events that lack one of these keys are skipped.
    The columns are selected by name rather than by counting the entries of
    ``logs``, so additional keys in a log event neither suppress the row nor
    shift the column order. The file is opened in append mode, so rows from
    earlier runs stay in place.
    """

    # Column order of each row; the animation cell reads the file back with
    # exactly these names.
    LOG_COLUMNS = ("loss", "learning_rate", "epoch")
    LOG_PATH = "log.csv"

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write one row for a training log event (local main process only)."""
        if not logs or not state.is_local_process_zero:
            return
        if any(column not in logs for column in self.LOG_COLUMNS):
            return
        row = ",".join(str(logs[column]) for column in self.LOG_COLUMNS)
        with open(self.LOG_PATH, "a", encoding="utf-8") as f:
            f.write(row + "\n")

In [None]:
# Set up the training parameters: choose a low save_steps interval for more frames in the animation
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    "vit-base-patch16-224-in21k-ft-cifar10_highres_train",  # output folder; the checkpoint folders are read from here later
    remove_unused_columns=False,  # the transform reads the "img" column, so it must not be dropped
    evaluation_strategy="steps",  # NOTE(review): recent transformers releases name this argument `eval_strategy` - confirm against the installed version
    eval_steps=100,
    save_strategy="steps",
    save_steps=20,  # the movie will be created by checkpoint save in this interval. Lower values increase the number of frames
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    num_train_epochs=0.04,  # use 0.04 for testing with a few frames. Use higher values for longer movies
    warmup_ratio=0.1,
    logging_steps=20,  # same interval as save_steps: the animation pairs the i-th log row with the i-th checkpoint
    load_best_model_at_end=False,
    # metric_for_best_model is deliberately not set: the Trainer below gets no
    # compute_metrics function, so an "accuracy" metric is never produced and
    # looking it up fails once an evaluation coincides with a checkpoint save.
    push_to_hub=False,
)

In [None]:
# Instantiate the Trainer object and start the training
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["test"],
    tokenizer=image_processor,  # the image processor is handed over through the `tokenizer` argument
    callbacks=[PrinterCallback],  # the callback class itself is passed, not an instance
)

# Run the training; the value returned by train() is kept in train_results
train_results = trainer.train()

## Create embeddings


In [None]:
# define functions to create embeddings from an individual checkpoint
# based on https://renumics.com/next/docs/playbook/huggingface-embedding
import datasets
from transformers import AutoFeatureExtractor, AutoModel
import torch
import pandas as pd


def extract_embeddings(model, feature_extractor, image_name="image"):
    """Build a batch function that computes one embedding per image.

    Args:
        model: Callable model whose output exposes ``last_hidden_state``;
            the preprocessed inputs are moved to ``model.device``.
        feature_extractor: Called as
            ``feature_extractor(images=..., return_tensors="pt")``.
        image_name: Name of the batch column that holds the images.

    Returns:
        A function mapping a batch dict to ``{"embedding": tensor}``. The
        tensor is index 0 along the second axis of ``last_hidden_state``
        (presumably the [CLS] token of a ViT - confirm for other models),
        moved to the CPU.
    """
    device = model.device

    def pp(batch):
        images = batch[image_name]
        inputs = feature_extractor(images=images, return_tensors="pt").to(device)
        # Inference only: skip autograd bookkeeping so no graph is kept alive
        with torch.no_grad():
            embeddings = model(**inputs).last_hidden_state[:, 0].cpu()

        return {"embedding": embeddings}

    return pp


def huggingface_embedding(
    df,
    image_name="image",
    inplace=False,
    modelname="google/vit-base-patch16-224",
    batched=True,
    batch_size=24,
):
    """Compute an embedding for every image in *df* with a Hugging Face model.

    Args:
        df: DataFrame whose column *image_name* can be cast to
            ``datasets.Image()``.
        image_name: Name of the image column.
        inplace: If true, store the result in ``df["embedding"]`` and return
            ``None``; otherwise return a new one-column DataFrame.
        modelname: Model id or local checkpoint folder passed to
            ``from_pretrained``.
        batched: Forwarded to ``Dataset.map``.
        batch_size: Forwarded to ``Dataset.map``.

    Returns:
        ``None`` when *inplace* is true, else a DataFrame with the single
        column ``"embedding"``.
    """
    # initialize huggingface model
    feature_extractor = AutoFeatureExtractor.from_pretrained(modelname)
    # Only last_hidden_state is consumed downstream, so the per-layer hidden
    # states are not requested.
    model = AutoModel.from_pretrained(modelname)

    # create huggingface dataset from df
    dataset = datasets.Dataset.from_pandas(df).cast_column(image_name, datasets.Image())

    # compute embedding on the GPU when one is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    extract_fn = extract_embeddings(model.to(device), feature_extractor, image_name)
    updated_dataset = dataset.map(extract_fn, batched=batched, batch_size=batch_size)

    embeddings = updated_dataset.to_pandas()["embedding"]

    if inplace:
        # Assign by position: the frame coming back from the dataset does not
        # necessarily carry df's index, and index alignment would then
        # scramble or blank the rows.
        df["embedding"] = embeddings.to_numpy()
        return None

    return pd.DataFrame({"embedding": embeddings})

In [None]:
# Load the CIFAR-10 test split from the Hugging Face hub and convert it to a Pandas DataFrame.
# Note that this rebinds `ds` (previously the full dataset used for training) to the test split only.
# Later cells read the columns "image" and "labels" from the resulting frame.
import datasets

ds = datasets.load_dataset("cifar10", split="test").prepare_for_task(
    "image-classification"
)
df = ds.to_pandas()

In [None]:
# define function to get all available checkpoints as sorted folders
import os
import datasets
import time


def _checkpoint_step(folder_name):
    """Return the integer after the last "-" of *folder_name*, or None if there is none."""
    try:
        return int(folder_name.split("-")[-1])
    except ValueError:
        return None


def get_sorted_checkpoint_folders(
    output_dir="vit-base-patch16-224-in21k-ft-cifar10_highres_train",
):
    """List the checkpoint folders inside *output_dir* in ascending numeric order.

    An entry counts as a checkpoint when its name contains "checkpoint" and
    ends in "-<integer>". Entries that contain "checkpoint" but have no
    integer suffix are skipped instead of aborting the whole listing.

    Args:
        output_dir: Folder that holds the checkpoints; defaults to the output
            folder used for training in this notebook.

    Returns:
        List of paths of the form ``output_dir + "/" + name``, sorted by the
        integer suffix of ``name``.

    Raises:
        FileNotFoundError: If *output_dir* does not exist (from ``os.listdir``).
    """
    step_by_name = {}
    for name in os.listdir(output_dir):
        if "checkpoint" not in name:
            continue
        step = _checkpoint_step(name)
        if step is not None:
            step_by_name[name] = step

    # sort numerically so that e.g. checkpoint-100 comes after checkpoint-20
    ordered_names = sorted(step_by_name, key=step_by_name.get)
    return [output_dir + "/" + name for name in ordered_names]

In [None]:
# Compute the embeddings once per checkpoint and pickle them next to the
# checkpoint; folders that already contain an embedding.pkl are left alone.
for checkpoint_dir in get_sorted_checkpoint_folders():
    embedding_path = checkpoint_dir + "/embedding.pkl"
    if not os.path.exists(embedding_path):
        embedding_frame = huggingface_embedding(
            df, modelname=checkpoint_dir, image_name="image"
        )
        embedding_frame["embedding"].to_pickle(embedding_path)

## Definitions of functions for Outliers, PCA and Procrustes

In [None]:
# define function to extract outliers based on embeddings using cleanlab
import io
from PIL import Image
import numpy as np
import pandas as pd

from cleanlab.outlier import OutOfDistribution


def get_ood(sorted_checkpoint_folder, df, n_outliers=8):
    """Score all samples with cleanlab and return the strongest outliers as images.

    The embeddings are read from ``embedding.pkl`` inside
    *sorted_checkpoint_folder* (a pickle written by this notebook; do not
    point this at files from untrusted sources).

    Side effect: the score of every row is written to ``df["scores"]``,
    overwriting an existing column of that name. Callers in this notebook
    read that column after the call.

    Args:
        sorted_checkpoint_folder: Checkpoint folder containing ``embedding.pkl``.
        df: DataFrame with the columns ``"image"`` (dicts with a ``"bytes"``
            entry) and ``"labels"``; rows are assumed to be in the same order
            as the stored embeddings - verify against the embedding cell.
        n_outliers: Number of lowest-scoring rows to return.

    Returns:
        List of ``(PIL image in RGB, label)`` tuples, lowest score first.
    """
    embedding = pd.read_pickle(sorted_checkpoint_folder + "/embedding.pkl").to_list()
    embedding_np = np.array(embedding)

    ood = OutOfDistribution()
    df["scores"] = ood.fit_score(features=embedding_np)

    # the rows with the lowest scores are treated as the outliers
    df_ood = df.sort_values(by=["scores"], ascending=True).head(n_outliers)

    # decode the corresponding images from their raw bytes
    return [
        (Image.open(io.BytesIO(image["bytes"])).convert("RGB"), label)
        for image, label in df_ood[["image", "labels"]].to_numpy()
    ]

In [None]:
# Define a function that computes a PCA and, if a reference is provided, uses Procrustes to transform the new points onto it.
# Procrustes flips, rotates and scales the points of the new frame onto the previous frame to stabilize the movie.
from sklearn.decomposition import PCA
from scipy.spatial import procrustes


def make_pca(sorted_checkpoint_folder, pca_np):
    """Project a checkpoint's embeddings to 2-D and align them to a reference.

    The embeddings are read from ``embedding.pkl`` inside
    *sorted_checkpoint_folder*, reduced to two principal components and then
    fitted onto *pca_np* with ``scipy.spatial.procrustes``, so that
    consecutive frames do not flip or rotate arbitrarily.

    Args:
        sorted_checkpoint_folder: Checkpoint folder containing ``embedding.pkl``.
        pca_np: Reference points (one 2-D point per sample, array or nested
            list) from a previous call, or ``None`` for the first frame, in
            which case the new projection serves as its own reference.

    Returns:
        Array of 2-D points whose largest absolute coordinate is 5.
    """
    embedding = pd.read_pickle(sorted_checkpoint_folder + "/embedding.pkl").to_list()
    embedding_np = np.array(embedding)
    # one row per sample; the feature width is taken from the data instead of
    # being fixed to 768, so checkpoints with another hidden size work too
    embedding_np_flat = embedding_np.reshape(len(embedding_np), -1)

    pca_np_new = PCA(n_components=2).fit_transform(embedding_np_flat)

    reference = pca_np_new if pca_np is None else pca_np
    # only the transformed second point set is needed; the standardized
    # reference and the disparity value are discarded
    _, aligned, _ = procrustes(reference, pca_np_new)

    # scale the aligned points to be in range [-5, 5]
    return aligned * 5 / np.max(np.abs(aligned))

## Review in Spotlight
- load the first and latest checkpoint
- generate embeddings, outliers and PCA
- visualize in spotlight

In [None]:
# Load embeddings and extract outliers for the first and last checkpoint and store the scores in the dataframe.
# The folder is listed once so that `first` and `last` come from the same snapshot of the directory.
checkpoint_folders = get_sorted_checkpoint_folders()
if not checkpoint_folders:
    raise FileNotFoundError("No checkpoint folders found - run the fine-tuning cells first")
first, last = checkpoint_folders[0], checkpoint_folders[-1]

# get_ood writes its scores to df["scores"]; copy them to a dedicated column after each call
df_ood_images_first = get_ood(first, df)
df["scores_first"] = df["scores"]
df_ood_images_last = get_ood(last, df)
df["scores_last"] = df["scores"]
del df["scores"]

In [None]:
# Project the embeddings of the first and the last checkpoint to 2-D and store
# both as dataframe columns; the last checkpoint is fitted onto the first one.
pca_np_disp_first = make_pca(first, None).tolist()
df["pca_first"] = pca_np_disp_first
pca_np_disp_last = make_pca(last, pca_np_disp_first).tolist()
df["pca_last"] = pca_np_disp_last

In [None]:
# Add a label_str column holding the class name for every numeric label
label_feature = ds.features["labels"]
df["label_str"] = df["labels"].map(label_feature.int2str)
df

In [None]:
# Visualize embeddings and outliers of the first and last checkpoint with Spotlight
from renumics import spotlight

spotlight.show(
    df,
    dtype={
        "image": spotlight.Image,
        # the two PCA columns are declared as embedding columns
        "pca_first": spotlight.Embedding,
        "pca_last": spotlight.Embedding,
    },
    layout="https://spotlight.renumics.com/resources/embeddings_pca.json",
)

## Create the animation

In [None]:
# Load the loss log written during training into loss_df.
# log.csv is written without a header row, so the column names are supplied here.
import pandas as pd

loss_df = pd.read_csv("log.csv", names=["loss", "learning_rate", "epoch"])
loss_df

In [None]:
# create an image for each checkpoint and store it next to the checkpoint
import matplotlib.pyplot as plt
import tqdm


fig = plt.figure(figsize=(8, 8), dpi=200)

# loop-invariant data: class id of every sample and the class names for the legend
all_labels = df["labels"].to_numpy()
label_names = [ds.features["labels"].int2str(x) for x in range(10)]

checkpoint_folders = get_sorted_checkpoint_folders()
pca_np_disp = None
# tqdm wraps the list (not the enumerate object) so that it knows the total
for i, sorted_checkpoint_folder in enumerate(tqdm.tqdm(checkpoint_folders)):
    df_ood_images = get_ood(sorted_checkpoint_folder, df)
    # the previous frame is the Procrustes reference for the current one
    pca_np_disp = make_pca(sorted_checkpoint_folder, pca_np_disp)

    # The number after the last "-" of the folder name is the training step at
    # which the checkpoint was saved; use it instead of guessing it from the
    # frame index, which is only right for one particular save interval.
    training_step = int(sorted_checkpoint_folder.split("-")[-1])

    # prepare figure
    fig.clf()
    a0, a1 = fig.subplots(2, 1, gridspec_kw={"height_ratios": [5, 1], "hspace": 0.4})
    _ = fig.suptitle(
        "Fine Tuning Training Step "
        + str(training_step)
        + " of a Vision Transformer (ViT)"
    )

    # setup subplot of pca points
    a0.set_aspect("equal", adjustable="box")
    a0.set_xlim(-5, 5)
    a0.set_ylim(-5, 5)
    _ = a0.set_xlabel("pca 1")
    _ = a0.set_ylabel("pca 2")
    _ = a0.set_title("PCA of embedding space")

    # add a scatter plot one by one for each label
    for k in range(10):
        mask = all_labels == k
        _ = a0.scatter(pca_np_disp[mask, 0], pca_np_disp[mask, 1])
    a0.legend(labels=label_names, loc="upper right")

    # setup subplot for loss; the x values plotted below are epochs
    _ = a1.set_ylim(0, 3)
    _ = a1.set_xlim(0, max(loss_df["epoch"]))
    _ = a1.set_xlabel("epoch")
    _ = a1.set_ylabel("loss")
    _ = a1.set_title("Training loss")

    # Plot the loss up to the current frame only. This pairs the i-th log row
    # with the i-th checkpoint, i.e. it assumes logging and checkpoint saving
    # use the same step interval.
    loss = loss_df["loss"].copy()
    loss.iloc[i + 1 :] = np.nan
    _ = a1.plot(loss_df["epoch"], loss, c="r")

    # add outlier images in a column along the right edge of the figure
    for j, (img, label) in enumerate(df_ood_images):
        newax = fig.add_axes([0.85, 0.87 - (j / 11), 0.06, 0.07], anchor="NE", zorder=1)
        newax.imshow(img)
        newax.axis("off")
        newax.set_aspect("equal", "box")
        newax.set_title(
            "Outlier " + str(j) + f" ({ds.features['labels'].int2str(label)})"
        )

    fig.savefig(sorted_checkpoint_folder + "/pca_dyn_procrustes_300_outlow.png")

In [None]:
import glob
import re
import imageio

# collect the frame rendered for each checkpoint; checkpoints without a frame are skipped
img_paths = []
for sorted_checkpoint_folder in get_sorted_checkpoint_folders():
    img_paths += glob.glob(
        sorted_checkpoint_folder + "/pca_dyn_procrustes_300_outlow.png"
    )

# Sort the frames by the number at the end of their checkpoint folder name.
# The first number anywhere in the path is not usable as a key: it is the
# "16" of "patch16" for every frame.
img_paths = sorted(
    img_paths, key=lambda x: int(re.search(r"-(\d+)/[^/]+$", x).group(1))
)


# loop=0 is passed through to the GIF writer (presumably "repeat forever" - confirm in the imageio docs)
with imageio.get_writer(
    "pca_dyn_procrustes_300_outlow.gif", mode="I", loop=0
) as writer:
    for filename in img_paths:
        image = imageio.imread(filename)
        # crop whitespace in image
        image = image[10:-100, 110:-10]

        writer.append_data(image)

Check out the [article at Towards AI](https://pub.towardsai.net/how-i-created-an-animation-of-the-embeddings-during-fine-tuning-2b8bdf49f822) for more details.