In [1]:
import numpy as np
import umap.umap_ as umap
import plotly.express as px
import warnings
from transformers import GPT2Tokenizer, GPT2Model

In [2]:
# 1. Load the pretrained GPT-2 tokenizer and model.
#    Both are fetched from the same checkpoint name so they stay in sync;
#    'gpt2' is the smallest (base) GPT-2 checkpoint.
checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2Model.from_pretrained(checkpoint)

In [3]:
# 2. Extract the input token-embedding matrix as a NumPy array of shape
#    (vocab_size, embedding_dim).
#    .detach() replaces the legacy .data attribute: it yields the same values
#    without autograd tracking and is the supported way to do so.
#    The array is only read downstream; treat it as read-only, since a CPU
#    tensor's .numpy() view may share memory with the model weights.
embeddings = model.get_input_embeddings().weight.detach().cpu().numpy()

In [4]:
# 3. Keep only the first `top_n` rows of the embedding matrix.
#    No frequencies are actually computed here: a token's position in the
#    tokenizer vocabulary is used as a proxy, on the assumption that the
#    GPT-2 vocab is roughly ordered by frequency.
top_n = 3_000
selected_embeddings = embeddings[0:top_n]

In [5]:
# 4. Token texts for hover labels, one per selected embedding row.
#    The ids are taken from len(selected_embeddings) rather than top_n so the
#    labels always line up with the rows: slicing past the end of the vocab
#    truncates silently, which would otherwise leave more labels than points.
#    strip() removes surrounding whitespace, so tokens that differ only by a
#    leading/trailing space share a label, and whitespace-only tokens become ''.
tokens = [
    tokenizer.decode([token_id]).strip()
    for token_id in range(len(selected_embeddings))
]

In [6]:
# 5. Project the selected embeddings down to 3 dimensions with UMAP.
# Suppress the "n_jobs value 1 overridden" warning (presumably raised by UMAP
# because a fixed random_state is set -- confirm against the installed version).
# Note that this filter is installed process-wide, not just for this cell.
warnings.filterwarnings("ignore", message="n_jobs value 1 overridden")
reducer = umap.UMAP(
    n_components=3,      # 3D output for the scatter plot
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,     # fixed seed so the layout is repeatable
)
emb_3d = reducer.fit_transform(selected_embeddings)

In [7]:
# 6. Build an interactive 3D scatter plot with Plotly.
# Colour encodes the character length of each (already stripped) token label.
token_lengths = [len(tok) for tok in tokens]
fig = px.scatter_3d(
    x=emb_3d[:, 0],
    y=emb_3d[:, 1],
    z=emb_3d[:, 2],
    hover_name=tokens,
    color=token_lengths,
    color_continuous_scale='Viridis',
    # With raw array inputs Plotly Express would label the axes and colour bar
    # with the placeholder names "x", "y", "z" and "color"; name them instead.
    labels={'x': 'UMAP 1', 'y': 'UMAP 2', 'z': 'UMAP 3', 'color': 'Token length'},
    title=f'GPT-2 token embeddings: first {len(tokens)} vocabulary entries (3D UMAP)',
)

# Small, slightly transparent markers. Marker opacity is set only here: the
# previous `opacity=0.7` passed to px.scatter_3d was overridden by this call
# and never took effect.
fig.update_traces(marker=dict(size=3, opacity=0.8))

# Dark theme and starting camera, applied in a single layout update.
fig.update_layout(
    scene=dict(
        # Hide the axis wall planes so the dark paper colour shows through.
        xaxis=dict(showbackground=False),
        yaxis=dict(showbackground=False),
        zaxis=dict(showbackground=False),
        # Initial viewpoint: looking at the cloud from the (+x, +y, +z) corner.
        camera=dict(eye=dict(x=1.5, y=1.5, z=1.5)),
    ),
    paper_bgcolor="black",
    plot_bgcolor="black",
    font_color="white",
)

# Opens the figure in the default web browser instead of rendering inline.
fig.show(renderer="browser")