# Text Embedding Comparison

This notebook creates and visualizes 200 text embeddings at 512 dimensions each, projected into 2D for visualization, across four popular open-weight text embedding models from Google, Qwen, IBM, and Tencent. Even with identical inputs and dimensionality, each model induces its own embedding space—with different clusters, separations, and neighborhood relationships—which is why production systems need explicit embedding‑model versioning and a full re‑embedding plus re‑indexing step whenever the underlying model changes.

## Install Dependencies

In [None]:
%pip install pip -Uq

In [None]:
%pip install -r requirements.txt -Uq

In [None]:
import os

# Disable tokenizers parallelism warnings in notebook contexts
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Authenticate with Hugging Face

**IMPORTANT: Complete these steps in order:**

1. **Request model access**: Visit https://huggingface.co/google/embeddinggemma-300m and click "Request access to this model" (requires a free Hugging Face account)
2. **Wait for approval**: Access is usually granted immediately or within a few minutes
3. **Get your token**: Go to https://huggingface.co/settings/tokens and create a new token (read permission is sufficient)
4. **Run the login cell below**: Execute the next cell and paste your token in the text box that appears
5. **Verify login**: Run the verification cell to confirm you're authenticated

In [None]:
from huggingface_hub import notebook_login

# Login to Hugging Face (this will show a widget for entering your token)
notebook_login()

In [None]:
# Verify authentication status
# Calls whoami() and prints the "name" entry of the returned mapping on success.
from huggingface_hub import whoami

try:
    user_info = whoami()
    print(f"✓ Successfully logged in as: {user_info['name']}")
    print(f"✓ Authentication token is valid")
except Exception as e:
    # Deliberately broad: any failure inside the try block (including a missing
    # "name" key) is reported as a failed login instead of aborting the notebook.
    print("✗ Not logged in or token is invalid")
    print(f"Error: {e}")

## Load Quotes Dataset

In [None]:
import json


def load_all_quotes(file_path):
    """Load the quote texts from a JSON Lines file.

    Every non-blank line is parsed as a JSON object and the value stored under
    its "inputs" key is collected. Blank or whitespace-only lines (for example
    a trailing empty line at the end of the file) are skipped instead of
    aborting the load with a JSON parse error.

    Args:
        file_path: Path to the UTF-8 encoded ``.jsonl`` file.

    Returns:
        list: The "inputs" values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object has no "inputs" key.
    """
    quotes = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # JSONL files frequently end with (or contain) empty lines.
            if not line.strip():
                continue
            quotes.append(json.loads(line)["inputs"])
    return quotes


file_path = "quotes/quotes_200.jsonl"
quotes = load_all_quotes(file_path)
print(f"Total number of quotes: {len(quotes)}")
print(f"First quote in list: {quotes[0]}")

## Common Methods

In [None]:
from sentence_transformers import SentenceTransformer
from torch import Tensor


def compute_similarity_test(model: SentenceTransformer) -> Tensor:
    """Run a two-question smoke test and return the query/answer similarity scores.

    Two fixed questions are embedded with ``model.encode_query`` (using the
    prompt registered under the name "query"), their reference answers are
    embedded with ``model.encode_document``, and both sets are compared with
    ``model.similarity``.

    Args:
        model (SentenceTransformer): The model used for encoding and scoring.

    Returns:
        Tensor: The value returned by ``model.similarity`` for the query
            embeddings against the answer embeddings.
    """
    # Each question is paired with the answer it is expected to match best.
    qa_pairs = [
        (
            "What is the capital of China?",
            "The capital of China is Beijing.",
        ),
        (
            "Explain gravity",
            "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
        ),
    ]
    questions = [question for question, _ in qa_pairs]
    reference_answers = [answer for _, answer in qa_pairs]

    # Queries go through the "query" prompt; a custom prompt could be supplied
    # through the `prompt` argument instead.
    question_vectors = model.encode_query(questions, prompt_name="query")
    answer_vectors = model.encode_document(reference_answers)

    return model.similarity(question_vectors, answer_vectors)

In [None]:
from numpy import ndarray


def generate_embeddings(model: SentenceTransformer, quotes: list[str], dimensions: int = 512) -> ndarray:
    """Embed a list of quotes using the given model and report the time taken.

    Args:
        model (SentenceTransformer): The sentence transformer model to use for encoding.
        quotes (list[str]): A list of quotes to embed.
        dimensions (int): Value passed to ``model.encode`` as ``truncate_dim``,
            i.e. the desired output dimensionality (default: 512).

    Returns:
        ndarray: The embeddings returned by ``model.encode`` for ``quotes``.
    """
    import time

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    quote_embeddings = model.encode(
        quotes,
        batch_size=32,
        show_progress_bar=True,
        truncate_dim=dimensions,  # <- desired output dim
    )
    elapsed = time.perf_counter() - start_time

    print(f"Time taken: {elapsed} seconds")
    # Guard against ZeroDivisionError when called with an empty list.
    num_quotes = len(quotes)
    if num_quotes:
        print(f"Time per embedding: {elapsed / num_quotes} seconds")

    return quote_embeddings

In [None]:
import numpy as np


def save_embeddings(quote_embeddings: ndarray, file_name: str) -> str:
    """Save the quote embeddings to a file inside the ``embeddings`` folder.

    The folder is created on demand so the function also works on a fresh
    checkout where ``embeddings/`` does not exist yet.

    Args:
        quote_embeddings (ndarray): The quote embeddings to save.
        file_name (str): The name of the file to save the embeddings to. Pass a
            name ending in ``.npy``: ``np.save`` appends that suffix when it is
            missing, in which case the returned path would not match the file
            written to disk.

    Returns:
        str: The path to the saved embeddings file.
    """
    embeddings_dir = "embeddings"
    # np.save does not create missing parent directories.
    os.makedirs(embeddings_dir, exist_ok=True)

    embeddings_path = os.path.join(embeddings_dir, file_name)
    np.save(embeddings_path, quote_embeddings)

    return embeddings_path

In [None]:
def save_embeddings_dimension_test(quote_embeddings: ndarray, file_name: str) -> str:
    """Write quote embeddings into the ``dimension_tests`` folder.

    The folder is created first when it does not exist yet.

    Args:
        quote_embeddings (ndarray): The embeddings to persist.
        file_name (str): File name to use inside ``dimension_tests``.

    Returns:
        str: ``dimension_tests`` joined with ``file_name``.
    """
    target_dir = "dimension_tests"
    destination = os.path.join(target_dir, file_name)

    # np.save needs the parent folder to exist.
    os.makedirs(target_dir, exist_ok=True)
    np.save(destination, quote_embeddings)
    return destination

In [None]:
def generate_embeddings_multi_dimension(
    model: SentenceTransformer,
    model_name: str,
    quotes: list[str],
    dimensions: "list[int] | None" = None,
) -> dict[int, str]:
    """Generate embeddings at multiple dimensions and save them to dimension_tests folder.

    For every requested dimension the quotes are embedded with
    ``generate_embeddings`` and written with ``save_embeddings_dimension_test``
    under the name ``"{model_name}-{dim}.npy"``.

    Args:
        model (SentenceTransformer): The sentence transformer model to use for encoding.
        model_name (str): The name of the model (used for file naming).
        quotes (list[str]): A list of quotes to embed.
        dimensions (list[int] | None): Dimension sizes to generate embeddings
            for. ``None`` (the default) means ``[128, 256, 512, 768]``.

    Returns:
        dict[int, str]: Dictionary mapping dimension sizes to file paths.
    """
    # A None sentinel replaces the former list default so that no mutable
    # object is shared between calls.
    if dimensions is None:
        dimensions = [128, 256, 512, 768]

    results = {}

    print(f"Generating embeddings for model: {model_name}")
    print(f"Dimensions to test: {dimensions}")
    print("=" * 80)

    for dim in dimensions:
        print(f"\nProcessing dimension: {dim}")
        print("-" * 40)

        # Generate embeddings at this dimension
        embeddings = generate_embeddings(model, quotes, dimensions=dim)

        # Save embeddings
        file_name = f"{model_name}-{dim}.npy"
        file_path = save_embeddings_dimension_test(embeddings, file_name)

        print(f"Saved to: {file_path}")
        results[dim] = file_path

    print("\n" + "=" * 80)
    print(f"Completed! Generated embeddings for {len(dimensions)} dimensions")
    print("All files saved to: dimension_tests/")

    return results

In [None]:
def load_embeddings(embeddings_path: str) -> ndarray:
    """Read previously saved embeddings back from disk.

    Args:
        embeddings_path (str): Location of the saved embeddings file.

    Returns:
        ndarray: Whatever ``np.load`` returns for that file.
    """
    return np.load(embeddings_path)

## Models

### Model 1: EmbeddingGemma

`google/embeddinggemma-300m`

https://huggingface.co/google/embeddinggemma-300m

In [None]:
# Load the model
google_model = SentenceTransformer("google/embeddinggemma-300m")
print(f"Model loaded: {google_model.model_card_data}")

In [None]:
# Run test inference with queries and answers
similarities = compute_similarity_test(google_model)
print(f"Similarities: {similarities}")

In [None]:
# Embed the quotes
embeddings = generate_embeddings(google_model, quotes, 512)
print(f"Shape: {embeddings.shape}")  # (200, 512)

In [None]:
# Save the quote embeddings to a file for later use
embeddings_path = save_embeddings(embeddings, "google-embedding-gemma-300m-512.npy")
print(f"Embeddings saved to: {embeddings_path}")

In [None]:
# Load the quote embeddings from the file
loaded_embeddings = load_embeddings(embeddings_path)
print(f"Shape: {loaded_embeddings.shape}")

### Model 2: Qwen3 Embedding 0.6B

`Qwen/Qwen3-Embedding-0.6B`

https://huggingface.co/Qwen/Qwen3-Embedding-0.6B

In [None]:
# Load the model
qwen_model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
print(f"Model loaded: {qwen_model.model_card_data}")

In [None]:
# Run test inference with queries and answers
similarities = compute_similarity_test(qwen_model)
print(f"Similarities: {similarities}")

In [None]:
# Embed the quotes
embeddings = generate_embeddings(qwen_model, quotes, 512)
print(f"Shape: {embeddings.shape}")  # (200, 512)

In [None]:
# Save the quote embeddings to a file for later use
embeddings_path = save_embeddings(embeddings, "qwen-qwen3-embedding-0.6b-512.npy")
print(f"Embeddings saved to: {embeddings_path}")

In [None]:
# Load the quote embeddings from the file
loaded_embeddings = load_embeddings(embeddings_path)
print(f"Shape: {loaded_embeddings.shape}")

### Model 3: IBM Granite Embedding 125m English

`ibm-granite/granite-embedding-125m-english`

https://huggingface.co/ibm-granite/granite-embedding-125m-english

In [None]:
# Load the model
ibm_model = SentenceTransformer("ibm-granite/granite-embedding-125m-english")
print(f"Model loaded: {ibm_model.model_card_data}")

In [None]:
# Run test inference with queries and answers
similarities = compute_similarity_test(ibm_model)
print(f"Similarities: {similarities}")

In [None]:
# Embed the quotes
embeddings = generate_embeddings(ibm_model, quotes, 512)
print(f"Shape: {embeddings.shape}")  # (200, 512)

In [None]:
# Save the quote embeddings to a file for later use
embeddings_path = save_embeddings(
    embeddings, "ibm-granite-embedding-125m-english-512.npy"
)
print(f"Embeddings saved to: {embeddings_path}")

In [None]:
# Load the quote embeddings from the file
loaded_embeddings = load_embeddings(embeddings_path)
print(f"Shape: {loaded_embeddings.shape}")

### Model 4: TencentBAC Conan Embedding v1

`TencentBAC/Conan-embedding-v1`

https://huggingface.co/TencentBAC/Conan-embedding-v1

In [None]:
# Load the model
tencent_model = SentenceTransformer("TencentBAC/Conan-embedding-v1")
print(f"Model loaded: {tencent_model.model_card_data}")

In [None]:
# Run test inference with queries and answers
similarities = compute_similarity_test(tencent_model)
print(f"Similarities: {similarities}")

In [None]:
# Embed the quotes
embeddings = generate_embeddings(tencent_model, quotes, 512)
print(f"Shape: {embeddings.shape}")  # (200, 512)

In [None]:
# Save the quote embeddings to a file for later use
embeddings_path = save_embeddings(embeddings, "tencentbac-conan-embedding-v1-512.npy")
print(f"Embeddings saved to: {embeddings_path}")

In [None]:
# Load the quote embeddings from the file
loaded_embeddings = load_embeddings(embeddings_path)
print(f"Shape: {loaded_embeddings.shape}")

## Visualization Methods

**`normalize_embeddings()`**
- L2 normalizes embeddings to unit length
- Standardizes to zero mean and unit variance
- Ensures all models are on comparable scales

**`visualize_multiple_embeddings_improved()`**
- Normalizes each model's embeddings separately before combining
- Reports PCA explained variance ratio
- Supports both PCA and t-SNE
- Includes hover text with quote content
- Better for direct comparison when normalization is appropriate

**`visualize_embeddings_separately()`**
- Applies PCA/t-SNE independently to each model
- Shows true structure of each embedding space
- No cross-contamination between models
- Side-by-side subplots for comparison
- Better for understanding individual model characteristics

### When to Use Which:
- **Separate visualization** (`visualize_embeddings_separately`): Best for understanding each model's embedding space structure independently
- **Combined normalized** (`visualize_multiple_embeddings_improved`): Best for direct comparison when you want to see relative positions across models

### Implementation

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [None]:
def normalize_embeddings(embeddings: ndarray) -> ndarray:
    """Scale each row to unit length, then standardize every column.

    The two steps put embeddings from different models on comparable scales:
    1. Row-wise L2 normalization (all-zero rows are left unchanged).
    2. Column-wise standardization to zero mean and unit variance (constant
       columns are only centered, not scaled).

    Args:
        embeddings (ndarray): Array of shape [n_samples, n_dimensions].

    Returns:
        ndarray: The normalized array, same shape as the input.
    """
    # Row lengths; a length of 0 is swapped for 1 so the division is a no-op
    # for all-zero rows instead of producing NaNs.
    row_lengths = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_rows = embeddings / np.where(row_lengths == 0, 1, row_lengths)

    # Per-column statistics; a zero spread is swapped for 1 for the same reason.
    column_mean = unit_rows.mean(axis=0)
    column_std = unit_rows.std(axis=0)
    safe_std = np.where(column_std == 0, 1, column_std)

    return (unit_rows - column_mean) / safe_std

In [None]:
def visualize_multiple_embeddings_improved(
    embeddings_list: list[ndarray],
    model_names: list[str],
    quotes: list[str] = None,
    method: str = "pca",
):
    """Plot several models' embeddings together in one 2D scatter chart.

    Every model's embeddings are passed through ``normalize_embeddings`` on
    their own, the results are stacked, and a single PCA or t-SNE projection
    is fitted on the stacked array. For PCA the explained variance ratios are
    printed. The figure is shown with ``fig.show``; nothing is returned.

    Args:
        embeddings_list (list[ndarray]): One embedding array per model.
        model_names (list[str]): Model names, in the same order as ``embeddings_list``.
        quotes (list[str], optional): Quote texts for the hover display. The
            list is repeated once per model, so it is expected to line up with
            the rows of every embedding array.
        method (str): "pca" or "tsne" (case-insensitive).

    Raises:
        ValueError: If ``method`` is neither "pca" nor "tsne".
    """
    print("Normalizing embeddings for each model separately...")
    per_model = []
    for position, raw in enumerate(embeddings_list):
        per_model.append(normalize_embeddings(raw))
        print(f"  {model_names[position]}: normalized {raw.shape[0]} embeddings")

    # One stacked array so that a single projection covers all models
    stacked = np.vstack(per_model)
    print(f"\nCombined shape: {stacked.shape}")

    technique = method.lower()
    if technique not in ("pca", "tsne"):
        raise ValueError(f"Unknown method: {method}. Use 'pca' or 'tsne'")

    if technique == "pca":
        pca = PCA(n_components=2)
        coords = pca.fit_transform(stacked)

        # The explained variance shows how much information the 2D view keeps
        ratios = pca.explained_variance_ratio_
        print("\nPCA Explained Variance:")
        print(f"  PC1: {ratios[0]:.2%}")
        print(f"  PC2: {ratios[1]:.2%}")
        print(f"  Total: {ratios.sum():.2%}")

        x_label, y_label = "Principal Component 1", "Principal Component 2"
        title_method = "PCA"
    else:
        print("\nApplying t-SNE (this may take a moment)...")
        tsne = TSNE(n_components=2, random_state=42, perplexity=30)
        coords = tsne.fit_transform(stacked)

        x_label, y_label = "t-SNE Dimension 1", "t-SNE Dimension 2"
        title_method = "t-SNE"

    # Table consumed by Plotly: one row per (model, quote) point
    points = pd.DataFrame(coords, columns=["dim1", "dim2"])
    rows_per_model = [emb.shape[0] for emb in embeddings_list]
    points["Model"] = np.repeat(model_names, rows_per_model)

    hover_fields = {"Model": True, "dim1": ":.3f", "dim2": ":.3f"}
    if quotes is not None:
        # Hover shows at most 50 characters of each quote; the shortened list
        # is repeated once per model.
        shortened = [q[:50] + "..." if len(q) > 50 else q for q in quotes]
        points["Quote"] = shortened * len(embeddings_list)
        hover_fields = {"Quote": True, **hover_fields}

    plot_title = f"2D Visualization of Embeddings ({title_method}, Normalized)"
    fig = px.scatter(
        points,
        x="dim1",
        y="dim2",
        color="Model",
        color_discrete_sequence=px.colors.qualitative.Vivid,
        title=plot_title,
        labels={"dim1": x_label, "dim2": y_label},
        hover_data=hover_fields,
    )
    fig.update_traces(marker=dict(size=6, opacity=0.7))

    # Bold, centered title in Arial on a fixed-size canvas
    base_font = "Arial, sans-serif"
    fig.update_layout(
        title={
            "text": f"<b>{plot_title}</b>",
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": 18, "family": base_font},
        },
        font={"family": base_font, "size": 12},
        width=1200,
        height=600,
    )

    # PNG export from the toolbar is rendered at 3x scale
    fig.show(config={"toImageButtonOptions": {"format": "png", "scale": 3}})

In [None]:
def visualize_multiple_embeddings_animated(
    embeddings_list: list[ndarray],
    model_names: list[str],
    quotes: list[str] = None,
    method: str = "tsne",
    num_frames: int = 60,
    frame_duration: int = 50,
    loop: bool = False,
    sequential: bool = False,
    highlight_quote_idx: int = None,
):
    """Visualize multiple sets of embeddings with animation from origin to final positions.

    Every model's embeddings are normalized with ``normalize_embeddings``,
    stacked, and projected to 2D with one PCA or t-SNE fit. All markers start
    at (0, 0) and are moved linearly to their projected coordinates over
    ``num_frames`` frames. The figure is shown with ``fig.show`` and returned.

    Args:
        embeddings_list (list[ndarray]): A list of embeddings to visualize.
        model_names (list[str]): A list of model names corresponding to the embeddings.
        quotes (list[str], optional): Original quote texts for hover display.
            The list is repeated once per model, so it is expected to line up
            with the rows of every embedding array.
        method (str): Dimensionality reduction method - "pca" or "tsne"
        num_frames (int): Number of animation frames (default: 60). With a
            single frame the markers are placed directly at their final
            positions.
        frame_duration (int): Duration of each frame in milliseconds (default: 50ms)
        loop (bool): Whether to loop the animation (default: False)
        sequential (bool): If True, animate each model group sequentially
            instead of all at once. Each model gets ``num_frames // n_models``
            frames and reaches its final position on the last frame of its
            own window.
        highlight_quote_idx (int): Index of quote to highlight across all
            models (0-based). Works with or without ``quotes``.

    Returns:
        The Plotly figure holding the initial traces and the animation frames.

    Raises:
        ValueError: If ``method`` is neither "pca" nor "tsne".
    """
    print("Normalizing embeddings for each model separately...")
    normalized_embeddings = []
    for i, emb in enumerate(embeddings_list):
        normalized_embeddings.append(normalize_embeddings(emb))
        print(f"  {model_names[i]}: normalized {emb.shape[0]} embeddings")

    # Combine normalized embeddings
    combined_embeddings = np.vstack(normalized_embeddings)
    print(f"\nCombined shape: {combined_embeddings.shape}")

    # Apply dimensionality reduction
    if method.lower() == "pca":
        reducer = PCA(n_components=2)
        reduced_embeddings = reducer.fit_transform(combined_embeddings)

        print("\nPCA Explained Variance:")
        print(f"  PC1: {reducer.explained_variance_ratio_[0]:.2%}")
        print(f"  PC2: {reducer.explained_variance_ratio_[1]:.2%}")
        print(f"  Total: {reducer.explained_variance_ratio_.sum():.2%}")

        axis_labels = {"x": "Principal Component 1", "y": "Principal Component 2"}
        title_method = "PCA"
    elif method.lower() == "tsne":
        print("\nApplying t-SNE (this may take a moment)...")
        reducer = TSNE(n_components=2, random_state=42, perplexity=30)
        reduced_embeddings = reducer.fit_transform(combined_embeddings)
        axis_labels = {"x": "t-SNE Dimension 1", "y": "t-SNE Dimension 2"}
        title_method = "t-SNE"
    else:
        raise ValueError(f"Unknown method: {method}. Use 'pca' or 'tsne'")

    # Create DataFrame with final positions
    df = pd.DataFrame(reduced_embeddings, columns=["dim1", "dim2"])
    df["Model"] = np.repeat(model_names, [emb.shape[0] for emb in embeddings_list])

    # Keep the full quote text here; it is truncated only when hover text is built
    has_quotes = quotes is not None
    if has_quotes:
        df["Quote"] = quotes * len(embeddings_list)

    # Calculate axis ranges from final positions with padding
    x_min, x_max = df["dim1"].min(), df["dim1"].max()
    y_min, y_max = df["dim2"].min(), df["dim2"].max()

    # Add 10% padding to ensure all points are visible
    x_padding = (x_max - x_min) * 0.1
    y_padding = (y_max - y_min) * 0.1
    x_range = [x_min - x_padding, x_max + x_padding]
    y_range = [y_min - y_padding, y_max + y_padding]

    print(f"Axis ranges: x={x_range}, y={y_range}")

    if highlight_quote_idx is not None:
        if has_quotes:
            print(f"\nHighlighting quote #{highlight_quote_idx}: \"{quotes[highlight_quote_idx][:60]}...\"")
        else:
            # Highlighting is supported without quote texts, so do not index into None.
            print(f"\nHighlighting quote #{highlight_quote_idx}")

    # Get color mapping for models
    colors = px.colors.qualitative.Vivid
    color_map = {model: colors[i % len(colors)] for i, model in enumerate(model_names)}

    # Slice the per-model rows once instead of once per frame
    model_frames = {
        model: df[df["Model"] == model].reset_index(drop=True) for model in model_names
    }

    # Calculate timing for sequential animation
    num_models = len(model_names)
    if sequential:
        frames_per_model = num_frames // num_models
        print(f"\nSequential animation: {frames_per_model} frames per model")

    print(f"\nGenerating {num_frames} animation frames...")

    # Create initial frame (all points at origin)
    initial_traces = []
    for model in model_names:
        initial_traces.extend(
            _animated_model_traces(
                model_frames[model], model, color_map[model], 0.0, highlight_quote_idx, has_quotes
            )
        )

    # Create animation frames
    frames = []
    for frame_idx in range(num_frames):
        frame_traces = []

        for model_idx, model in enumerate(model_names):
            if sequential:
                # Each model owns the window [start_frame, end_frame)
                start_frame = model_idx * frames_per_model
                end_frame = start_frame + frames_per_model

                if frame_idx < start_frame:
                    t = 0.0
                elif frame_idx >= end_frame:
                    t = 1.0
                else:
                    t = _animation_progress(frame_idx - start_frame, frames_per_model)
            else:
                t = _animation_progress(frame_idx, num_frames)

            frame_traces.extend(
                _animated_model_traces(
                    model_frames[model], model, color_map[model], t, highlight_quote_idx, has_quotes
                )
            )

        frames.append(go.Frame(data=frame_traces, name=str(frame_idx)))

    # Create figure with initial frame
    fig = go.Figure(data=initial_traces, frames=frames)

    # Prepare annotations list
    annotations = []

    # Add annotation explaining highlighted markers if applicable
    if highlight_quote_idx is not None and has_quotes:
        annotations.append(
            dict(
                x=0.02,
                y=0.98,
                xref="paper",
                yref="paper",
                text="<b>Larger markers (●):</b> Same text input across all models",
                showarrow=False,
                font=dict(size=11, family="Arial, sans-serif", color="#333"),
                align="left",
                bgcolor="rgba(255, 255, 255, 0.85)",
                bordercolor="#666",
                borderwidth=1,
                borderpad=8,
                xanchor="left",
                yanchor="top",
            )
        )

    # Update layout with animation controls and fixed axis ranges
    fig.update_layout(
        title={
            "text": f"<b>2D Visualization of Embeddings ({title_method}, Normalized)</b>",
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": 18, "family": "Arial, sans-serif"},
        },
        xaxis={"title": axis_labels["x"], "range": x_range},
        yaxis={"title": axis_labels["y"], "range": y_range},
        font={"family": "Arial, sans-serif", "size": 12},
        width=1200,
        height=600,
        annotations=annotations,
        updatemenus=[
            {
                "type": "buttons",
                "showactive": False,
                "buttons": [
                    {
                        "label": "Play",
                        "method": "animate",
                        "args": [
                            None,
                            {
                                "frame": {"duration": frame_duration, "redraw": True},
                                "fromcurrent": True,
                                # NOTE(review): Plotly's documented animation modes appear to be
                                # "immediate", "next" and "afterall"; "loop" may not be honoured,
                                # so loop=True might not actually repeat — confirm.
                                "mode": "immediate" if not loop else "loop",
                                "transition": {"duration": 0, "easing": "linear"},
                            },
                        ],
                    },
                    {
                        "label": "Pause",
                        "method": "animate",
                        "args": [
                            [None],
                            {
                                "frame": {"duration": 0, "redraw": False},
                                "mode": "immediate",
                                "transition": {"duration": 0},
                            },
                        ],
                    },
                ],
                "x": 0.1,
                "y": 0,
                "xanchor": "right",
                "yanchor": "top",
            }
        ],
        sliders=[
            {
                "active": 0,
                "steps": [
                    {
                        "args": [
                            [str(k)],
                            {
                                "frame": {"duration": 0, "redraw": True},
                                "mode": "immediate",
                                "transition": {"duration": 0},
                            },
                        ],
                        "label": str(k),
                        "method": "animate",
                    }
                    for k in range(num_frames)
                ],
                "x": 0.1,
                "len": 0.9,
                "xanchor": "left",
                "y": 0,
                "yanchor": "top",
                "pad": {"b": 10, "t": 50},
                "currentvalue": {
                    "visible": True,
                    "prefix": "Frame: ",
                    "xanchor": "right",
                    "font": {"size": 14},
                },
                "transition": {"duration": 0},
            }
        ],
    )

    print("Animation ready!")

    # Show with high resolution
    fig.show(config={"toImageButtonOptions": {"format": "png", "scale": 3}})

    return fig


def _animation_progress(step: int, total_steps: int) -> float:
    """Return the interpolation factor for ``step`` so that the last step maps to 1.0."""
    # A single-step animation has no intermediate positions; this also avoids
    # dividing by zero.
    if total_steps <= 1:
        return 1.0
    return step / (total_steps - 1)


def _truncate_quote(quote, max_length=50):
    """Truncate quote to max_length characters and add ellipsis if needed."""
    if len(quote) <= max_length:
        return quote
    return quote[:max_length] + "..."


def _animated_hover_text(model, x_positions, y_positions, quote_texts=None, highlighted=False):
    """Build one hover string per marker, optionally including the (truncated) quote."""
    prefix = "HIGHLIGHTED<br>" if highlighted else ""
    if quote_texts is not None:
        return [
            f"{prefix}Quote: {_truncate_quote(q)}<br>Model: {model}<br>x: {x:.3f}<br>y: {y:.3f}"
            for q, x, y in zip(quote_texts, x_positions, y_positions)
        ]
    return [
        f"{prefix}Model: {model}<br>x: {x:.3f}<br>y: {y:.3f}"
        for x, y in zip(x_positions, y_positions)
    ]


def _animated_model_traces(model_df, model, color, t, highlight_quote_idx, has_quotes):
    """Build the scatter trace(s) of one model with its markers at fraction ``t`` of the way."""
    point_count = len(model_df)
    if t == 0:
        # Explicit zeros keep the hover text at "0.000" (0.0 * negative would give "-0.000")
        x_positions = np.zeros(point_count)
        y_positions = np.zeros(point_count)
    else:
        x_positions = t * model_df["dim1"].to_numpy()
        y_positions = t * model_df["dim2"].to_numpy()
    quote_texts = model_df["Quote"].to_numpy() if has_quotes else None

    if highlight_quote_idx is None:
        # No highlighting: a single trace per model
        return [
            go.Scatter(
                x=x_positions,
                y=y_positions,
                mode="markers",
                marker=dict(size=6, opacity=0.4, color=color),
                name=model,
                text=_animated_hover_text(model, x_positions, y_positions, quote_texts),
                hovertemplate="%{text}<extra></extra>",
            )
        ]

    # Split the model's points into the highlighted one and all the others
    is_highlight = np.arange(point_count) == highlight_quote_idx
    is_regular = ~is_highlight
    traces = []

    if is_regular.any():
        regular_x, regular_y = x_positions[is_regular], y_positions[is_regular]
        regular_quotes = quote_texts[is_regular] if has_quotes else None
        traces.append(
            go.Scatter(
                x=regular_x,
                y=regular_y,
                mode="markers",
                marker=dict(size=6, opacity=0.4, color=color),
                name=model,
                text=_animated_hover_text(model, regular_x, regular_y, regular_quotes),
                hovertemplate="%{text}<extra></extra>",
                showlegend=True,
            )
        )

    if is_highlight.any():
        highlight_x, highlight_y = x_positions[is_highlight], y_positions[is_highlight]
        highlight_quotes = quote_texts[is_highlight] if has_quotes else None
        traces.append(
            go.Scatter(
                x=highlight_x,
                y=highlight_y,
                mode="markers",
                marker=dict(
                    size=15,
                    opacity=1.0,
                    color=color,
                    line=dict(width=2, color="white"),
                ),
                name=f"{model} (highlighted)",
                text=_animated_hover_text(
                    model, highlight_x, highlight_y, highlight_quotes, highlighted=True
                ),
                hovertemplate="%{text}<extra></extra>",
                showlegend=False,
            )
        )

    return traces

In [None]:
def visualize_embeddings_separately(
    embeddings_list: list[ndarray],
    model_names: list[str],
    quotes: list[str] = None,
    method: str = "pca",
    shared_axes: bool = True,
):
    """Visualize embeddings with separate dimensionality reduction per model.

    PCA or t-SNE is fitted independently on each model's embeddings, so every
    subplot reflects only that model's own embedding space. The models are
    drawn side-by-side in a single row of subplots and the figure is displayed
    with ``fig.show``; nothing is returned.

    Args:
        embeddings_list (list[ndarray]): A list of embeddings to visualize.
        model_names (list[str]): A list of model names corresponding to the embeddings.
        quotes (list[str], optional): Original quote texts for hover display.
            When omitted, the hover box shows only the x/y coordinates.
        method (str): Dimensionality reduction method - "pca" or "tsne"
            (case-insensitive).
        shared_axes (bool): If True, all subplots use the same x/y axis ranges
            for direct comparison.

    Raises:
        ValueError: If ``method`` is neither "pca" nor "tsne".
    """
    # Fail fast: reject an unsupported method before building the figure or
    # fitting any reducer.
    method_key = method.lower()
    if method_key not in ("pca", "tsne"):
        raise ValueError(f"Unknown method: {method}")
    use_pca = method_key == "pca"

    n_models = len(embeddings_list)
    fig = make_subplots(
        rows=1, cols=n_models, subplot_titles=model_names, horizontal_spacing=0.05
    )

    # First pass: compute all reductions (needed up front for shared ranges)
    all_reduced = []
    all_explained_vars = []

    for embeddings in embeddings_list:
        if use_pca:
            reducer = PCA(n_components=2)
            reduced = reducer.fit_transform(embeddings)
            # Fraction of total variance captured by the two kept components
            all_explained_vars.append(reducer.explained_variance_ratio_.sum())
        else:
            reducer = TSNE(n_components=2, random_state=42)
            reduced = reducer.fit_transform(embeddings)
            # t-SNE has no explained-variance notion
            all_explained_vars.append(None)

        all_reduced.append(reduced)

    # Compute shared axis ranges if requested
    if shared_axes:
        all_x = np.concatenate([r[:, 0] for r in all_reduced])
        all_y = np.concatenate([r[:, 1] for r in all_reduced])
        x_min, x_max = all_x.min(), all_x.max()
        y_min, y_max = all_y.min(), all_y.max()
        # Add small padding (5%)
        x_padding = (x_max - x_min) * 0.05
        y_padding = (y_max - y_min) * 0.05
        x_range = [x_min - x_padding, x_max + x_padding]
        y_range = [y_min - y_padding, y_max + y_padding]

    # Hover text is identical for every subplot, so build it once.
    if quotes is not None:
        hover_text = [
            f"Quote: {q[:50]}..." if len(q) > 50 else f"Quote: {q}" for q in quotes
        ]
        hover_template = "%{text}<br>x: %{x:.3f}<br>y: %{y:.3f}<extra></extra>"
    else:
        # Without text, do not reference %{text} in the template.
        hover_text = None
        hover_template = "x: %{x:.3f}<br>y: %{y:.3f}<extra></extra>"

    # Wrap around the palette so more models than colors cannot raise IndexError
    palette = px.colors.qualitative.Vivid

    # Second pass: create plots
    for i, (reduced, model_name, explained_var) in enumerate(
        zip(all_reduced, model_names, all_explained_vars)
    ):
        # Prepare subtitle with explained variance (PCA only)
        if explained_var is not None:
            subtitle_suffix = f"<br>(Explained var: {explained_var:.1%})"
        else:
            subtitle_suffix = ""

        # Add scatter trace
        fig.add_trace(
            go.Scatter(
                x=reduced[:, 0],
                y=reduced[:, 1],
                mode="markers",
                marker=dict(size=5, opacity=0.6, color=palette[i % len(palette)]),
                text=hover_text,
                hovertemplate=hover_template,
                showlegend=False,
            ),
            row=1,
            col=i + 1,
        )

        # Subplot titles are stored as layout annotations, one per subplot
        fig.layout.annotations[i].update(
            text=model_name + subtitle_suffix,
            font=dict(family="Arial, sans-serif", size=14),
        )

        # Set axis ranges
        if shared_axes:
            fig.update_xaxes(range=x_range, row=1, col=i + 1)
            fig.update_yaxes(range=y_range, row=1, col=i + 1)

    method_name = "PCA" if use_pca else "t-SNE"
    axes_note = " (Shared Axes)" if shared_axes else " (Independent Axes)"

    # Make title bold and centered, set Arial font
    fig.update_layout(
        title={
            "text": f"<b>Separate {method_name} per Model{axes_note}</b>",
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": 18, "family": "Arial, sans-serif"},
        },
        font={"family": "Arial, sans-serif", "size": 12},
        width=1600,
        height=400,
        showlegend=False,
    )

    # Show with high resolution for the toolbar's "download as PNG" button
    fig.show(config={"toImageButtonOptions": {"format": "png", "scale": 3}})

### Create Visualizations

In [None]:
# Load the saved embeddings for each of the four models
embeddings_google = load_embeddings(
    "embeddings/google-embedding-gemma-300m-512.npy"
)
embeddings_qwen = load_embeddings(
    "embeddings/qwen-qwen3-embedding-0.6b-512.npy"
)
embeddings_ibm = load_embeddings(
    "embeddings/ibm-granite-embedding-125m-english-512.npy"
)
embeddings_tencent = load_embeddings(
    "embeddings/tencentbac-conan-embedding-v1-512.npy"
)

# Report each array's shape as a quick sanity check
print(
    "Loaded embeddings:",
    f"  Google: {embeddings_google.shape}",
    f"  Qwen: {embeddings_qwen.shape}",
    f"  IBM: {embeddings_ibm.shape}",
    f"  Tencent: {embeddings_tencent.shape}",
    sep="\n",
)

In [None]:
# Example 1: all models projected together with one PCA (after normalization)
print(
    "=" * 80,
    "METHOD 1: Combined PCA with Normalization",
    "=" * 80,
    sep="\n",
)

visualize_multiple_embeddings_improved(
    [embeddings_google, embeddings_qwen, embeddings_ibm, embeddings_tencent],
    ["Google EmbeddingGemma", "Qwen3 Embedding", "IBM Granite", "Tencent Conan"],
    method="pca",
    quotes=quotes,
)

In [None]:
# Example 2: one independent PCA per model, each subplot with its own axes
print(
    "=" * 80,
    "METHOD 2: Separate PCA per Model",
    "=" * 80,
    "Each model gets its own PCA transformation - no cross-contamination\n",
    sep="\n",
)

visualize_embeddings_separately(
    [embeddings_google, embeddings_qwen, embeddings_ibm, embeddings_tencent],
    ["Google EmbeddingGemma", "Qwen3 Embedding", "IBM Granite", "Tencent Conan"],
    method="pca",
    quotes=quotes,
    shared_axes=False,
)

In [None]:
# Example 3: all models projected together with one t-SNE (after normalization)
print(
    "=" * 80,
    "METHOD 3: Combined t-SNE with Normalization",
    "=" * 80,
    "t-SNE preserves local structure better than PCA\n",
    sep="\n",
)

visualize_multiple_embeddings_improved(
    [embeddings_google, embeddings_qwen, embeddings_ibm, embeddings_tencent],
    ["Google EmbeddingGemma", "Qwen3 Embedding", "IBM Granite", "Tencent Conan"],
    method="tsne",
    quotes=quotes,
)

In [None]:
# Example 4: animated version of the combined, normalized t-SNE projection
print(
    "=" * 80,
    "METHOD 4: Combined t-SNE with Normalization (Sequential)",
    "=" * 80,
    "Watch as each model group animates from origin (0,0) to their positions, one at a time!\n",
    sep="\n",
)

# Keep the returned figure: a later cell exports it to HTML and GIF
animated_fig = visualize_multiple_embeddings_animated(
    [embeddings_google, embeddings_qwen, embeddings_ibm, embeddings_tencent],
    ["Google EmbeddingGemma", "Qwen3 Embedding", "IBM Granite", "Tencent Conan"],
    quotes=quotes,
    method="tsne",
    # 120 frames (previously 60) for a smoother, slower animation
    num_frames=120,
    frame_duration=50,
    loop=False,
    # Sequential mode animates one model group at a time
    sequential=True,
    # Highlight the same quote across all models
    highlight_quote_idx=10,
)

### Save Visualizations as High-Resolution PNGs

Now let's save the three static visualizations (Methods 1–3) as PNG files for external use.

In [None]:
def save_visualization_as_png(
    embeddings_list: list[ndarray],
    model_names: list[str],
    quotes: list[str],
    method: str,
    filename: str,
    viz_type: str = "combined",
    shared_axes: bool = True,
):
    """Save a visualization as a high-resolution PNG file.

    Builds either one scatter plot of all models reduced together
    (``viz_type="combined"``) or a row of per-model subplots, each with its own
    reduction (any other ``viz_type`` value), then writes the figure to
    ``visualizations/<filename>.png`` at 3x scale.

    Args:
        embeddings_list (list[ndarray]): A list of embeddings to visualize.
        model_names (list[str]): A list of model names corresponding to the embeddings.
        quotes (list[str]): Original quote texts.
        method (str): Dimensionality reduction method - "pca" or "tsne"
            (case-insensitive).
        filename (str): Output filename (without extension)
        viz_type (str): "combined" or "separate"
        shared_axes (bool): If True and viz_type="separate", use shared axes

    Returns:
        The Plotly figure that was written to disk.

    Raises:
        ValueError: If ``method`` is neither "pca" nor "tsne".
    """
    # Validate before doing any work. Previously an unknown method fell through
    # the if/elif chains and surfaced later as an unbound-variable error.
    method_key = method.lower()
    if method_key not in ("pca", "tsne"):
        raise ValueError(f"Unknown method: {method}")
    use_pca = method_key == "pca"

    if viz_type == "combined":
        # Normalize each model's embeddings, then stack them row-wise so a
        # single reducer is fitted on all models at once
        combined_embeddings = np.vstack(
            [normalize_embeddings(emb) for emb in embeddings_list]
        )

        # Apply dimensionality reduction
        if use_pca:
            reducer = PCA(n_components=2)
            axis_labels = {"x": "Principal Component 1", "y": "Principal Component 2"}
            title_method = "PCA"
        else:
            reducer = TSNE(n_components=2, random_state=42, perplexity=30)
            axis_labels = {"x": "t-SNE Dimension 1", "y": "t-SNE Dimension 2"}
            title_method = "t-SNE"
        reduced_embeddings = reducer.fit_transform(combined_embeddings)

        df = pd.DataFrame(reduced_embeddings, columns=["dim1", "dim2"])
        # One model label per row, in the same order the arrays were stacked
        df["Model"] = np.repeat(model_names, [emb.shape[0] for emb in embeddings_list])
        # The quote list is repeated once per model to line up with the stacked rows
        all_quotes = quotes * len(embeddings_list)
        # Truncate quotes to 50 characters for hover display
        df["Quote"] = [q[:50] + "..." if len(q) > 50 else q for q in all_quotes]

        fig = px.scatter(
            df,
            x="dim1",
            y="dim2",
            color="Model",
            color_discrete_sequence=px.colors.qualitative.Vivid,
            labels={"dim1": axis_labels["x"], "dim2": axis_labels["y"]},
            hover_data={"Quote": True, "Model": True, "dim1": ":.3f", "dim2": ":.3f"},
        )

        fig.update_traces(marker=dict(size=6, opacity=0.7))
        fig.update_layout(
            title={
                "text": f"<b>2D Visualization of Embeddings ({title_method}, Normalized)</b>",
                "x": 0.5,
                "xanchor": "center",
                "font": {"size": 18, "family": "Arial, sans-serif"},
            },
            font={"family": "Arial, sans-serif", "size": 12},
            width=1200,
            height=600,
        )

    else:  # separate
        n_models = len(embeddings_list)
        fig = make_subplots(
            rows=1, cols=n_models, subplot_titles=model_names, horizontal_spacing=0.05
        )

        # Compute all reductions (each model gets its own reducer)
        all_reduced = []
        all_explained_vars = []

        for embeddings in embeddings_list:
            if use_pca:
                reducer = PCA(n_components=2)
                reduced = reducer.fit_transform(embeddings)
                all_explained_vars.append(reducer.explained_variance_ratio_.sum())
            else:
                reducer = TSNE(n_components=2, random_state=42)
                reduced = reducer.fit_transform(embeddings)
                # t-SNE has no explained-variance notion
                all_explained_vars.append(None)

            all_reduced.append(reduced)

        # Compute shared axis ranges if requested
        if shared_axes:
            all_x = np.concatenate([r[:, 0] for r in all_reduced])
            all_y = np.concatenate([r[:, 1] for r in all_reduced])
            x_min, x_max = all_x.min(), all_x.max()
            y_min, y_max = all_y.min(), all_y.max()
            # Add small padding (5%)
            x_padding = (x_max - x_min) * 0.05
            y_padding = (y_max - y_min) * 0.05
            x_range = [x_min - x_padding, x_max + x_padding]
            y_range = [y_min - y_padding, y_max + y_padding]

        # Hover text is the same for every subplot, so build it once.
        # Truncate quotes to 50 characters for hover display
        hover_text = [
            f"Quote: {q[:50]}..." if len(q) > 50 else f"Quote: {q}" for q in quotes
        ]

        # Same palette as the combined branch and the interactive per-model
        # plot; the modulo keeps long model lists from running off the end.
        palette = px.colors.qualitative.Vivid

        # Create plots
        for i, (reduced, model_name, explained_var) in enumerate(
            zip(all_reduced, model_names, all_explained_vars)
        ):
            if explained_var is not None:
                subtitle_suffix = f"<br>(Explained var: {explained_var:.1%})"
            else:
                subtitle_suffix = ""

            fig.add_trace(
                go.Scatter(
                    x=reduced[:, 0],
                    y=reduced[:, 1],
                    mode="markers",
                    marker=dict(
                        size=5, opacity=0.6, color=palette[i % len(palette)]
                    ),
                    text=hover_text,
                    hovertemplate="%{text}<br>x: %{x:.3f}<br>y: %{y:.3f}<extra></extra>",
                    showlegend=False,
                ),
                row=1,
                col=i + 1,
            )

            # Subplot titles are stored as layout annotations, one per subplot
            fig.layout.annotations[i].update(
                text=model_name + subtitle_suffix,
                font=dict(family="Arial, sans-serif", size=14),
            )

            if shared_axes:
                fig.update_xaxes(range=x_range, row=1, col=i + 1)
                fig.update_yaxes(range=y_range, row=1, col=i + 1)

        method_name = "PCA" if use_pca else "t-SNE"
        axes_note = " (Shared Axes)" if shared_axes else " (Independent Axes)"

        fig.update_layout(
            title={
                "text": f"<b>Separate {method_name} per Model{axes_note}</b>",
                "x": 0.5,
                "xanchor": "center",
                "font": {"size": 18, "family": "Arial, sans-serif"},
            },
            font={"family": "Arial, sans-serif", "size": 12},
            width=1200,
            height=425,
            showlegend=False,
        )

    # Save as PNG with high resolution (scale=3 means 3x the size)
    output_path = f"visualizations/{filename}.png"
    os.makedirs("visualizations", exist_ok=True)
    fig.write_image(output_path, scale=3)
    print(f"Saved: {output_path}")

    return fig

In [None]:
# Save all three visualizations as high-resolution PNGs
print("Saving visualizations to the 'visualizations' folder...\n")

# 1. Combined PCA with normalization
print("1. Combined PCA (Normalized)...")
save_visualization_as_png(
    [embeddings_google, embeddings_qwen, embeddings_ibm, embeddings_tencent],
    ["Google EmbeddingGemma", "Qwen3 Embedding", "IBM Granite", "Tencent Conan"],
    quotes,
    method="pca",
    filename="combined_pca_normalized",
    viz_type="combined",
)

# 2. Separate PCA per model (independent axes, same setting as Example 2)
# NOTE: shared_axes=False is passed below, so the figure title reads
# "(Independent Axes)"; the progress message now says the same. The output
# filename still contains "shared_axes" and is left unchanged so that any
# existing references to that path keep working.
print("\n2. Separate PCA per Model (Independent Axes)...")
save_visualization_as_png(
    [embeddings_google, embeddings_qwen, embeddings_ibm, embeddings_tencent],
    ["Google EmbeddingGemma", "Qwen3 Embedding", "IBM Granite", "Tencent Conan"],
    quotes,
    method="pca",
    filename="separate_pca_shared_axes",
    viz_type="separate",
    shared_axes=False,
)

# 3. Combined t-SNE with normalization
print("\n3. Combined t-SNE (Normalized)...")
save_visualization_as_png(
    [embeddings_google, embeddings_qwen, embeddings_ibm, embeddings_tencent],
    ["Google EmbeddingGemma", "Qwen3 Embedding", "IBM Granite", "Tencent Conan"],
    quotes,
    method="tsne",
    filename="combined_tsne_normalized",
    viz_type="combined",
)

print("\n" + "=" * 80)
print("All visualizations saved successfully!")
print("=" * 80)
print("\nOutput files:")
print("  - visualizations/combined_pca_normalized.png")
print("  - visualizations/separate_pca_shared_axes.png")
print("  - visualizations/combined_tsne_normalized.png")
print("\nAll images are saved at 3x resolution for high quality.")

In [None]:
# Save the animated visualization as an interactive HTML file and animated GIF
print("Saving animated visualization...\n")

# Save as HTML
html_path = "visualizations/combined_tsne_normalized_animated.html"
os.makedirs("visualizations", exist_ok=True)
animated_fig.write_html(html_path)
print(f"HTML saved: {html_path}")

# Save as animated GIF
# Note: This requires kaleido and PIL/Pillow packages
print("\nGenerating animated GIF (this may take a moment)...")

gif_path = "visualizations/combined_tsne_normalized_animated.gif"
# Explicit success flag. Checking whether `gif_path` exists as a name is not
# reliable: it is bound before the save call and also survives cell re-runs,
# so the summary below could report a GIF that was never written.
gif_saved = False

try:
    from PIL import Image
    import io

    # Create a clean layout for GIF (without controls but WITH annotations)
    gif_layout = go.Layout(
        title={
            "text": animated_fig.layout.title.text,
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": 18, "family": "Arial, sans-serif"},
        },
        xaxis={
            "title": animated_fig.layout.xaxis.title.text,
            "range": animated_fig.layout.xaxis.range,
        },
        yaxis={
            "title": animated_fig.layout.yaxis.title.text,
            "range": animated_fig.layout.yaxis.range,
        },
        font={"family": "Arial, sans-serif", "size": 12},
        width=1200,
        height=600,
        showlegend=True,
        annotations=(
            list(animated_fig.layout.annotations)
            if animated_fig.layout.annotations
            else []
        ),
        # No updatemenus or sliders for clean GIF
    )

    # Extract frames from the Plotly figure
    gif_frames = []
    num_frames = len(animated_fig.frames)
    if num_frames == 0:
        # Report a clear reason instead of an IndexError from gif_frames[0]
        raise ValueError("animated_fig contains no frames to export")

    print(f"Processing {num_frames} frames...")

    # Render each animation frame as a standalone PNG image
    for frame_number, frame in enumerate(animated_fig.frames, start=1):
        # Create a figure with the clean layout (no controls but with annotations)
        frame_fig = go.Figure(data=frame.data, layout=gif_layout)

        # Convert to image bytes and decode with Pillow
        img_bytes = frame_fig.to_image(format="png", width=1200, height=600, scale=2)
        gif_frames.append(Image.open(io.BytesIO(img_bytes)))

        if frame_number % 10 == 0:
            print(f"  Processed {frame_number}/{num_frames} frames...")

    # Save as animated GIF
    # Omit the loop parameter to play once without repeating
    # (loop=0 means infinite, loop=1 means play twice)
    gif_frames[0].save(
        gif_path,
        save_all=True,
        append_images=gif_frames[1:],
        duration=33,  # 33ms per frame = ~30 FPS
        optimize=False  # Set to True to reduce file size (but slower)
        # No loop parameter = play once (behavior may vary by viewer)
    )
    gif_saved = True

    print(f"\nGIF saved: {gif_path}")
    print(f"  - Frames: {num_frames}")
    print("  - Frame rate: ~30 FPS (33ms per frame)")
    print("  - Looping: No loop parameter (plays once in most viewers)")
    print("  - Controls: Hidden (clean animation)")
    print("  - Annotations: Included in all frames")

    # Get file size
    file_size = os.path.getsize(gif_path) / (1024 * 1024)  # Convert to MB
    print(f"  - File size: {file_size:.2f} MB")

except ImportError as e:
    print("Error: Missing required packages for GIF export.")
    print("Please install: pip install pillow kaleido")
    print(f"Details: {e}")
except Exception as e:
    # Best-effort export: the HTML file above is still valid, so report and continue
    print(f"Error creating GIF: {e}")

print("\n" + "=" * 80)
print("Visualization files created:")
print("=" * 80)
print(f"  - HTML: {html_path}")
print("    (Interactive, with play/pause controls)")
if gif_saved:
    print(f"  - GIF: {gif_path}")
    print("    (Clean animation with annotation, plays once)")

## Multi-Dimension Embedding Generation

The `generate_embeddings_multi_dimension()` function allows you to generate embeddings at multiple dimensions (128, 256, 512, 768) in a single call. This is useful for comparing how different dimension sizes affect the embedding space and model performance.

**Features:**
- Generates embeddings at multiple specified dimensions
- Saves all embeddings to a dedicated `dimension_tests/` folder
- Returns a dictionary mapping dimensions to file paths
- Provides progress feedback during generation

**Note:** The default dimensions are [128, 256, 512, 768] to match the model's maximum supported dimensions.

In [None]:
def visualize_dimensions_separately(
    embeddings_dict: dict[int, ndarray],
    model_name: str,
    quotes: list[str] = None,
    method: str = "pca",
    shared_axes: bool = True,
):
    """Visualize embeddings at different dimensions with separate dimensionality reduction per dimension.

    PCA or t-SNE is fitted independently on the embeddings of each dimension
    size, and the resulting 2D projections are drawn side-by-side (one subplot
    per dimension size, in ascending order). The figure is displayed with
    ``fig.show``; nothing is returned.

    Args:
        embeddings_dict (dict[int, ndarray]): Dictionary mapping dimension sizes to embeddings arrays.
        model_name (str): The name of the model (for the title).
        quotes (list[str], optional): Original quote texts for hover display.
            When omitted, the hover box shows only the x/y coordinates.
        method (str): Dimensionality reduction method - "pca" or "tsne"
            (case-insensitive).
        shared_axes (bool): If True, all subplots use the same x/y axis ranges
            for direct comparison.

    Raises:
        ValueError: If ``method`` is neither "pca" nor "tsne".
    """
    # Fail fast: reject an unsupported method before building the figure or
    # fitting any reducer.
    method_key = method.lower()
    if method_key not in ("pca", "tsne"):
        raise ValueError(f"Unknown method: {method}")
    use_pca = method_key == "pca"

    # Sort dimensions for consistent ordering
    dimensions = sorted(embeddings_dict.keys())
    n_dims = len(dimensions)

    # Create subplot titles with dimension info
    subplot_titles = [f"{dim} Dimensions" for dim in dimensions]

    fig = make_subplots(
        rows=1, cols=n_dims, subplot_titles=subplot_titles, horizontal_spacing=0.05
    )

    # First pass: compute all reductions (needed up front for shared ranges)
    all_reduced = []
    all_explained_vars = []

    for dim in dimensions:
        embeddings = embeddings_dict[dim]

        if use_pca:
            reducer = PCA(n_components=2)
            reduced = reducer.fit_transform(embeddings)
            # Fraction of total variance captured by the two kept components
            all_explained_vars.append(reducer.explained_variance_ratio_.sum())
        else:
            reducer = TSNE(n_components=2, random_state=42)
            reduced = reducer.fit_transform(embeddings)
            # t-SNE has no explained-variance notion
            all_explained_vars.append(None)

        all_reduced.append(reduced)

    # Compute shared axis ranges if requested
    if shared_axes:
        all_x = np.concatenate([r[:, 0] for r in all_reduced])
        all_y = np.concatenate([r[:, 1] for r in all_reduced])
        x_min, x_max = all_x.min(), all_x.max()
        y_min, y_max = all_y.min(), all_y.max()
        # Add small padding (5%)
        x_padding = (x_max - x_min) * 0.05
        y_padding = (y_max - y_min) * 0.05
        x_range = [x_min - x_padding, x_max + x_padding]
        y_range = [y_min - y_padding, y_max + y_padding]

    # Hover text is identical for every subplot, so build it once.
    if quotes is not None:
        hover_text = [
            f"Quote: {q[:50]}..." if len(q) > 50 else f"Quote: {q}" for q in quotes
        ]
        hover_template = "%{text}<br>x: %{x:.3f}<br>y: %{y:.3f}<extra></extra>"
    else:
        # Without text, do not reference %{text} in the template.
        hover_text = None
        hover_template = "x: %{x:.3f}<br>y: %{y:.3f}<extra></extra>"

    # Modulo keeps the color lookup valid for any number of dimensions
    palette = px.colors.qualitative.Vivid

    # Second pass: create plots
    for i, (dim, reduced, explained_var) in enumerate(
        zip(dimensions, all_reduced, all_explained_vars)
    ):
        # Prepare subtitle with explained variance (PCA only)
        if explained_var is not None:
            subtitle_suffix = f"<br>(Explained var: {explained_var:.1%})"
        else:
            subtitle_suffix = ""

        # Add scatter trace
        fig.add_trace(
            go.Scatter(
                x=reduced[:, 0],
                y=reduced[:, 1],
                mode="markers",
                marker=dict(size=5, opacity=0.6, color=palette[i % len(palette)]),
                text=hover_text,
                hovertemplate=hover_template,
                showlegend=False,
            ),
            row=1,
            col=i + 1,
        )

        # Subplot titles are stored as layout annotations, one per subplot
        fig.layout.annotations[i].update(
            text=f"{dim} Dimensions{subtitle_suffix}",
            font=dict(family="Arial, sans-serif", size=14),
        )

        # Set axis ranges
        if shared_axes:
            fig.update_xaxes(range=x_range, row=1, col=i + 1)
            fig.update_yaxes(range=y_range, row=1, col=i + 1)

    method_name = "PCA" if use_pca else "t-SNE"
    axes_note = " (Shared Axes)" if shared_axes else " (Independent Axes)"

    # Make title bold and centered, set Arial font
    fig.update_layout(
        title={
            "text": f"<b>{model_name}: Separate {method_name} per Dimension{axes_note}</b>",
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": 18, "family": "Arial, sans-serif"},
        },
        font={"family": "Arial, sans-serif", "size": 12},
        width=1600,
        height=400,
        showlegend=False,
    )

    # Show with high resolution for the toolbar's "download as PNG" button
    fig.show(config={"toImageButtonOptions": {"format": "png", "scale": 3}})

In [None]:
def save_dimension_visualization_as_png(
    embeddings_dict: dict[int, ndarray],
    model_name: str,
    quotes: list[str],
    method: str,
    filename: str,
    shared_axes: bool = True,
):
    """Render the per-dimension comparison figure and write it to a PNG file.

    Each dimension size gets its own 2D reduction and its own subplot; the
    finished figure is saved to ``visualizations/<filename>.png`` at 3x scale.

    Args:
        embeddings_dict (dict[int, ndarray]): Dictionary mapping dimension sizes to embeddings arrays.
        model_name (str): The name of the model (for the title).
        quotes (list[str]): Original quote texts.
        method (str): Dimensionality reduction method - "pca" or "tsne"
        filename (str): Output filename (without extension)
        shared_axes (bool): If True, all subplots use the same x/y axis ranges

    Returns:
        The Plotly figure that was written to disk.

    Raises:
        ValueError: If ``method`` is neither "pca" nor "tsne".
    """
    # Ascending dimension sizes give a stable left-to-right subplot order
    dimensions = sorted(embeddings_dict.keys())

    fig = make_subplots(
        rows=1,
        cols=len(dimensions),
        subplot_titles=[f"{dim} Dimensions" for dim in dimensions],
        horizontal_spacing=0.05,
    )

    # Pass 1: project every dimension size down to 2D
    projections = []
    variance_ratios = []
    for dim in dimensions:
        vectors = embeddings_dict[dim]
        kind = method.lower()

        if kind == "pca":
            pca = PCA(n_components=2)
            points = pca.fit_transform(vectors)
            variance_ratios.append(pca.explained_variance_ratio_.sum())
        elif kind == "tsne":
            points = TSNE(n_components=2, random_state=42).fit_transform(vectors)
            variance_ratios.append(None)
        else:
            raise ValueError(f"Unknown method: {method}")

        projections.append(points)

    # Common axis limits (with a 5% margin) across all subplots, if requested
    if shared_axes:
        xs = np.concatenate([p[:, 0] for p in projections])
        ys = np.concatenate([p[:, 1] for p in projections])
        x_lo, x_hi = xs.min(), xs.max()
        y_lo, y_hi = ys.min(), ys.max()
        x_margin = (x_hi - x_lo) * 0.05
        y_margin = (y_hi - y_lo) * 0.05
        x_range = [x_lo - x_margin, x_hi + x_margin]
        y_range = [y_lo - y_margin, y_hi + y_margin]

    # Hover labels: quotes longer than 50 characters are cut and suffixed with "..."
    hover_text = []
    for q in quotes:
        if len(q) > 50:
            hover_text.append(f"Quote: {q[:50]}...")
        else:
            hover_text.append(f"Quote: {q}")

    vivid = px.colors.qualitative.Vivid

    # Pass 2: one scatter trace per dimension size
    for idx, (dim, points, ratio) in enumerate(
        zip(dimensions, projections, variance_ratios)
    ):
        column = idx + 1
        suffix = "" if ratio is None else f"<br>(Explained var: {ratio:.1%})"

        scatter = go.Scatter(
            x=points[:, 0],
            y=points[:, 1],
            mode="markers",
            marker=dict(size=5, opacity=0.6, color=vivid[idx % len(vivid)]),
            text=hover_text,
            hovertemplate="%{text}<br>x: %{x:.3f}<br>y: %{y:.3f}<extra></extra>",
            showlegend=False,
        )
        fig.add_trace(scatter, row=1, col=column)

        # Subplot titles are stored as layout annotations, one per subplot
        fig.layout.annotations[idx].update(
            text=f"{dim} Dimensions{suffix}",
            font=dict(family="Arial, sans-serif", size=14),
        )

        if shared_axes:
            fig.update_xaxes(range=x_range, row=1, col=column)
            fig.update_yaxes(range=y_range, row=1, col=column)

    if method.lower() == "pca":
        method_name = "PCA"
    else:
        method_name = "t-SNE"
    if shared_axes:
        axes_note = " (Shared Axes)"
    else:
        axes_note = " (Independent Axes)"

    fig.update_layout(
        title={
            "text": f"<b>{model_name}: Separate {method_name} per Dimension{axes_note}</b>",
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": 18, "family": "Arial, sans-serif"},
        },
        font={"family": "Arial, sans-serif", "size": 12},
        width=1600,
        height=500,
        showlegend=False,
    )

    # Write the PNG at 3x scale, creating the output folder on first use
    output_path = f"visualizations/{filename}.png"
    os.makedirs("visualizations", exist_ok=True)
    fig.write_image(output_path, scale=3)
    print(f"Saved: {output_path}")

    return fig

In [None]:
# Example: produce Google EmbeddingGemma embeddings at several output sizes

dimension_results = generate_embeddings_multi_dimension(
    quotes=quotes,
    model=google_model,
    model_name="google-embedding-gemma-300m",
    dimensions=[128, 256, 512, 768],
)

# List the file written for each dimension size
print("\n" + "=" * 80, "Generated files:", sep="\n")
for dim, path in dimension_results.items():
    print(f"  {dim} Dimensions: {path}")

In [None]:
# Example: reload each saved array and check its shape against (200, dim)

print(
    "Loading embeddings from dimension_tests folder...",
    "=" * 80,
    sep="\n",
)

dimensions_to_check = [128, 256, 512, 768]
model_name = "google-embedding-gemma-300m"

for dim in dimensions_to_check:
    file_path = f"dimension_tests/{model_name}-{dim}.npy"
    embeddings = load_embeddings(file_path)
    print(
        f"\nDimension {dim}:",
        f"  Shape: {embeddings.shape}",
        f"  Expected: (200, {dim})",
        f"  Match: {embeddings.shape == (200, dim)}",
        sep="\n",
    )

In [None]:
# Example: compare the embedding space across dimension sizes (PCA)

# Reload the saved array for every dimension size
embeddings_128 = load_embeddings(
    "dimension_tests/google-embedding-gemma-300m-128.npy"
)
embeddings_256 = load_embeddings(
    "dimension_tests/google-embedding-gemma-300m-256.npy"
)
embeddings_512 = load_embeddings(
    "dimension_tests/google-embedding-gemma-300m-512.npy"
)
embeddings_768 = load_embeddings(
    "dimension_tests/google-embedding-gemma-300m-768.npy"
)

# Dimension size -> embeddings array; reused by the cells that follow
embeddings_by_dimension = dict(
    zip(
        (128, 256, 512, 768),
        (embeddings_128, embeddings_256, embeddings_512, embeddings_768),
    )
)

# One PCA per dimension size, drawn on shared axes
print("Visualizing different dimensions side-by-side using PCA...")
visualize_dimensions_separately(
    model_name="Google EmbeddingGemma 300m",
    embeddings_dict=embeddings_by_dimension,
    method="pca",
    quotes=quotes,
    shared_axes=True,
)

In [None]:
# Example: same side-by-side comparison, using t-SNE instead of PCA

print(
    "Visualizing different dimensions side-by-side using t-SNE...",
    "Note: t-SNE may take longer to compute but preserves local structure better\n",
    sep="\n",
)

visualize_dimensions_separately(
    model_name="Google EmbeddingGemma 300m",
    embeddings_dict=embeddings_by_dimension,
    method="tsne",
    quotes=quotes,
    shared_axes=True,
)

In [None]:
# Write the PCA and t-SNE dimension comparisons to high-resolution PNG files
print("Saving dimension comparison visualizations...\n")

# 1. PCA comparison on shared axes
print("1. Dimension Comparison - PCA (Shared Axes)...")
save_dimension_visualization_as_png(
    model_name="Google EmbeddingGemma 300m",
    embeddings_dict=embeddings_by_dimension,
    quotes=quotes,
    filename="dimension_comparison_pca_shared_axes",
    method="pca",
    shared_axes=True,
)

# 2. t-SNE comparison on shared axes
print("\n2. Dimension Comparison - t-SNE (Shared Axes)...")
save_dimension_visualization_as_png(
    model_name="Google EmbeddingGemma 300m",
    embeddings_dict=embeddings_by_dimension,
    quotes=quotes,
    filename="dimension_comparison_tsne_shared_axes",
    method="tsne",
    shared_axes=True,
)

# Summary of what was written
print(
    "\n" + "=" * 80,
    "Dimension comparison visualizations saved successfully!",
    "=" * 80,
    "\nOutput files:",
    "  - visualizations/dimension_comparison_pca_shared_axes.png",
    "  - visualizations/dimension_comparison_tsne_shared_axes.png",
    "\nAll images are saved at 3x resolution for high quality.",
    sep="\n",
)