# 💿 Album Cover Vectorization

Goal: Process album cover images into high-dimensional vectors (embeddings) using SigLIP to enable semantic search in the Streamlit app.

In [4]:
%load_ext autoreload
%autoreload 2

## Data

In [5]:
from pathlib import Path

import pandas as pd

In [6]:
# Relative path to the folder that holds the metadata CSV and the cover image files.
data_dir = Path("../../../data/covertartarchive/bm/")

In [7]:
# Album metadata table (band, album, year, popularity, genre, album_id, cover_url).
df_bm = pd.read_csv(data_dir / "df_final_with_covers.csv")
df_bm.head()

Unnamed: 0,band,album,year,popularity,genre,album_id,cover_url
0,ACOD,First Earth Poison,2011,0,Thrash/Black Metal (early); Melodic Black/Deat...,336533,https://coverartarchive.org/release/8dc5a614-f...
1,ACOD,II The Maelstrom,2015,1,Thrash/Black Metal (early); Melodic Black/Deat...,608789,https://coverartarchive.org/release/be914f73-6...
2,ACOD,The Divine Triumph,2018,3,Thrash/Black Metal (early); Melodic Black/Deat...,731217,https://coverartarchive.org/release/d3e886d8-d...
3,ACOD,Fourth Reign over Opacities and Beyond,2022,7,Thrash/Black Metal (early); Melodic Black/Deat...,1053210,https://coverartarchive.org/release/a81d6c61-9...
4,ACOD,Versets noirs,2024,9,Thrash/Black Metal (early); Melodic Black/Deat...,1215193,https://coverartarchive.org/release/8dcbe722-0...


In [8]:
# Peek at a few random rows; the fixed seed keeps this preview identical
# across "Restart Kernel -> Run All" executions.
df_bm[["band", "album", "cover_url"]].sample(5, random_state=42)

Unnamed: 0,band,album,cover_url
1611,Helheim,Jormundgand,https://coverartarchive.org/release-group/6005...
2957,Satyricon,Deep Calleth upon Deep,https://coverartarchive.org/release/93ef113c-6...
3201,Stworz,Zagony bogów,https://coverartarchive.org/release/d30e2b21-e...
1506,Graveworm,Diabolical Figures,https://coverartarchive.org/release/154e6697-6...
1001,Dornenreich,Du wilde Liebe sei,https://coverartarchive.org/release/fbbae15f-a...


In [9]:
def _cover_relpath(album_id):
    """Return the relative image path '<last character of id>/<id>.jpg'."""
    id_text = str(album_id)
    return f"{id_text[-1]}/{album_id}.jpg"


# Each cover lives in a sub-folder named after the last character of its album id.
df_bm["cover_file"] = df_bm["album_id"].apply(_cover_relpath)

# Drop albums without a cover URL, then renumber the rows from 0.
rows_with_url = df_bm.dropna(subset=["cover_url"])
df_bm = rows_with_url.reset_index(drop=True)

img_paths = [data_dir / rel_path for rel_path in df_bm["cover_file"]]
df_bm.head()

Unnamed: 0,band,album,year,popularity,genre,album_id,cover_url,cover_file
0,ACOD,First Earth Poison,2011,0,Thrash/Black Metal (early); Melodic Black/Deat...,336533,https://coverartarchive.org/release/8dc5a614-f...,3/336533.jpg
1,ACOD,II The Maelstrom,2015,1,Thrash/Black Metal (early); Melodic Black/Deat...,608789,https://coverartarchive.org/release/be914f73-6...,9/608789.jpg
2,ACOD,The Divine Triumph,2018,3,Thrash/Black Metal (early); Melodic Black/Deat...,731217,https://coverartarchive.org/release/d3e886d8-d...,7/731217.jpg
3,ACOD,Fourth Reign over Opacities and Beyond,2022,7,Thrash/Black Metal (early); Melodic Black/Deat...,1053210,https://coverartarchive.org/release/a81d6c61-9...,0/1053210.jpg
4,ACOD,Versets noirs,2024,9,Thrash/Black Metal (early); Melodic Black/Deat...,1215193,https://coverartarchive.org/release/8dcbe722-0...,3/1215193.jpg


In [10]:
# Resolve every relative cover path against the data directory.
img_paths = [data_dir / rel_path for rel_path in df_bm["cover_file"]]

# Fraction of the expected cover files that are present on disk.
n_present = sum(1 for path in img_paths if path.exists())
print(n_present / len(img_paths))

1.0


## Embedding covers

#### Model Initialization

We use SigLIP via OpenCLIP. This model provides a shared embedding space for text and images, allowing us to find images based on natural language descriptions.

Exact implementation: https://huggingface.co/timm/ViT-B-16-SigLIP

In [11]:
import torch
import open_clip

In [12]:
# Pick the compute backend, preferring CUDA, then Apple MPS, then plain CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    # MPS is only probed when CUDA is unavailable.
    device = "mps" if torch.backends.mps.is_available() else "cpu"

print(f"Device: {device}")

Device: mps


In [13]:
# Checkpoint used for both the image and the text encoder.
# Alternative checkpoint name kept for reference: "ViT-SO400M-14-SigLIP"
model_name = "ViT-B-16-SigLIP"

# Tokenizer matching the chosen checkpoint (needed later for text queries).
tokenizer = open_clip.get_tokenizer(model_name)

# create_model_and_transforms returns three values; the middle one is not
# needed here (presumably the training-time transform -- confirm in the
# open_clip docs), so only the model and the last transform are kept.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name=model_name,
    pretrained="webli",
    device=device,
)

#### Data Loading & Preprocessing

Images are loaded and transformed to the specific resolution (224x224) required by the SigLIP Vision Transformer.

In [14]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm.auto import tqdm
import numpy as np

In [15]:
class AlbumDataset(Dataset):
    """
    PyTorch Dataset that serves preprocessed album-cover images.

    Every entry of ``paths`` is assumed to point at an existing, readable
    image file; missing or corrupt files are not handled here and will raise.
    Each image is converted to RGB and then passed through ``preprocess``.
    """

    def __init__(self, paths, preprocess):
        """
        Args:
            paths (list): Image locations (pathlib.Path objects).
            preprocess (callable): Transform applied to each RGB PIL image.
        """
        self.paths = paths
        self.preprocess = preprocess

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.paths)

    def __getitem__(self, idx):
        """
        Load, convert and preprocess the image at position ``idx``.

        Returns:
            tuple: (preprocessed image, idx). The index is passed through so
            the caller can map each result back to its source row.
        """
        # Open inside a context manager so the underlying file handle is
        # released deterministically instead of whenever the image object
        # happens to be garbage-collected.
        with Image.open(self.paths[idx]) as raw_image:
            # convert() yields a separate in-memory image, so it remains
            # usable after the source file has been closed.
            image = raw_image.convert("RGB")
        return self.preprocess(image), idx

In [16]:
# Wrap the cover paths in a Dataset and iterate over them in fixed-size batches.
dataset = AlbumDataset(paths=img_paths, preprocess=preprocess)
loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=False,  # keep the batches in the same order as img_paths
    num_workers=0,  # load images in the main process
    pin_memory=False,
)

### Embedding Generation

We iterate through the dataset in batches to generate 768-dimensional vectors using the SigLIP encoder.

Normalization: All vectors are L2 normalized (default p=2). This ensures their magnitude is 1.0, and that a simple dot product gives the Cosine Similarity.

In [None]:
# Accumulators: one embedding array per batch, plus the dataset indices
# those embeddings belong to.
valid_indices = []
embeddings = []

model.eval()
with torch.no_grad():
    for batch_images, batch_indices in tqdm(loader, desc=f"Encoding on {device}"):
        # Encode the batch on the selected device.
        batch_features = model.encode_image(batch_images.to(device))

        # Scale every row to unit L2 norm.
        row_norms = batch_features.norm(dim=-1, keepdim=True)
        batch_features = batch_features / row_norms

        valid_indices.extend(batch_indices.tolist())
        embeddings.append(batch_features.cpu().numpy())

Encoding on mps:   0%|          | 0/105 [00:00<?, ?it/s]

In [None]:
# Final matrix: join the per-batch arrays along the row axis.
all_embs = np.concatenate(embeddings, axis=0)

In [None]:
# 1. Initialize the column (no row has an embedding yet)
df_bm["embedding"] = None

# 2. Convert the matrix into a list of individual row arrays
# This allows Pandas to place one array into each cell
embedding_list = list(all_embs)

# 3. Map them back to the correct rows; the Series index aligns each array
# with the DataFrame row it was computed from
df_bm.loc[valid_indices, "embedding"] = pd.Series(embedding_list, index=valid_indices)

# 4. Create a helper boolean for easy filtering later
df_bm["has_embedding"] = df_bm["embedding"].notna()


# 5. Check how many were mapped
total_rows = len(df_bm)
mapped_count = df_bm["has_embedding"].sum()
# Calculate percentage (guarded so an empty DataFrame does not divide by zero)
pct_success = (mapped_count / total_rows) * 100 if total_rows else 0.0

print(
    f"Successfully mapped {mapped_count} embeddings to the DataFrame "
    f"({pct_success:.2f}% success rate)."
)

Successfully mapped 3355 embeddings to the DataFrame (100.00% success rate).


In [None]:
# Sanity check: shape of the stacked embedding matrix (one row per encoded image).
all_embs.shape

(3355, 768)

In [None]:
# Persist as a pickle: NumPy arrays are stored natively and no extra
# dependency is needed. File size: ~16MB
cols_to_save = ["band", "album", "year", "cover_url", "embedding", "album_id"]

# Only rows that actually received an embedding are written out.
rows_to_save = df_bm.loc[df_bm["has_embedding"], cols_to_save]
rows_to_save.to_pickle("../data/bm_covers_with_embeddings.pkl")

### Semantic Search Validation

Verify SigLIP's ability to retrieve relevant album covers using descriptive text queries.

In [None]:
# Reload the table that was just saved, as a downstream consumer would.
# NOTE: only unpickle files you produced yourself -- loading a pickle can
# execute arbitrary code.
df = pd.read_pickle("../data/bm_covers_with_embeddings.pkl")
df.head()

Unnamed: 0,band,album,year,cover_url,embedding,album_id
0,ACOD,First Earth Poison,2011,https://coverartarchive.org/release/8dc5a614-f...,"[-0.042004008, -0.00014589708, -0.008899495, -...",336533
1,ACOD,II The Maelstrom,2015,https://coverartarchive.org/release/be914f73-6...,"[-0.04113701, -0.035046946, -0.032143474, -0.0...",608789
2,ACOD,The Divine Triumph,2018,https://coverartarchive.org/release/d3e886d8-d...,"[-0.02917442, -0.025995309, -0.036073618, 0.00...",731217
3,ACOD,Fourth Reign over Opacities and Beyond,2022,https://coverartarchive.org/release/a81d6c61-9...,"[-0.038749337, -0.039249375, -0.061301745, 0.0...",1053210
4,ACOD,Versets noirs,2024,https://coverartarchive.org/release/8dcbe722-0...,"[-0.020338822, -0.0030262307, 0.017796148, 0.0...",1215193


In [None]:
def encode_text_query(text, model, tokenizer, device):
    """
    Embed a single text query as a unit-length vector.

    Args:
        text: The query string.
        model: Object exposing ``encode_text(tokens)``.
        tokenizer: Callable that turns a list of strings into a token tensor.
        device: Device the tokens are moved to before encoding.

    Returns:
        numpy.ndarray: L2-normalized text features, copied to the CPU.
    """
    token_batch = tokenizer([text])
    token_batch = token_batch.to(device)

    with torch.no_grad():
        query_vec = model.encode_text(token_batch)
        # In-place division of each row by its own L2 norm.
        query_vec.div_(query_vec.norm(dim=-1, keepdim=True))
        return query_vec.cpu().numpy()

In [None]:
def find_top_albums(query_text, df, model, tokenizer, device, top_k=10):
    """
    Rank the rows of ``df`` against a text query and return the best matches.

    Args:
        query_text: Free-text description to search for.
        df: DataFrame with an "embedding" column holding one vector per row.
        model, tokenizer, device: Forwarded to ``encode_text_query``.
        top_k: Maximum number of rows to return.

    Returns:
        A copy of the top rows, best match first, with an added
        "search_score" column.
    """
    query_vec = encode_text_query(query_text, model, tokenizer, device)

    # Stack the per-row vectors into a single 2-D matrix.
    emb_matrix = np.vstack(df["embedding"].values)

    # With unit-length vectors the dot product equals the cosine similarity.
    similarity = np.matmul(query_vec, emb_matrix.T).ravel()

    # Ascending argsort, reversed, gives positions from best to worst.
    ranked_positions = np.argsort(similarity)[::-1]
    top_positions = ranked_positions[:top_k]

    top_rows = df.iloc[top_positions].copy()
    top_rows["search_score"] = similarity[top_positions]
    return top_rows

In [None]:
# --- Semantic Search Sandbox ---
# Free-text description to search for; edit and re-run to try other queries.
query = "a dark castle"
# top_k is left at the function's default of 10.
top_10 = find_top_albums(query, df, model, tokenizer, device)

In [None]:
from IPython.display import display, HTML


def display_top_3_html(results_df, n=3):
    """
    Render the first ``n`` rows of a results table as a row of cover cards.

    Args:
        results_df: DataFrame with "cover_url", "band", "album" and
            "search_score" columns, already ordered best match first.
        n: Number of leading rows to show (defaults to 3).

    Returns:
        None. The HTML is shown via IPython's ``display``.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    cards = []
    for _, row in results_df.head(n).iterrows():
        # Band, album and URL come from scraped metadata: escape them so
        # characters such as <, & or quotes cannot break or inject markup.
        cover_url = escape(str(row["cover_url"]), quote=True)
        band = escape(str(row["band"]))
        album = escape(str(row["album"]))
        score = row["search_score"]

        cards.append(f"""
        <div style="flex: 1; text-align: center; max-width: 250px;">
            <img src="{cover_url}" style="width: 100%; border-radius: 8px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
            <p style="margin: 8px 0 0 0; font-weight: bold; font-family: sans-serif;">{band}</p>
            <p style="margin: 2px 0; font-style: italic; font-family: sans-serif;">{album}</p>
            <p style="color: #666; font-size: 0.8em; font-family: sans-serif;">Score: {score:.3f}</p>
        </div>
        """)

    html_str = (
        '<div style="display: flex; gap: 20px; align-items: flex-start;">'
        + "".join(cards)
        + "</div>"
    )
    display(HTML(html_str))


# Show the leading rows of the search results as cover cards.
display_top_3_html(top_10)