In [None]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Load the tokenizer that belongs to `model_id`.  The annotation is only a hint for
# editors; AutoTokenizer decides the concrete class at load time.
# NOTE(review): assumes a fast tokenizer is available for this checkpoint — confirm.
tokenizer:PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(model_id)

In [None]:
len(tokenizer.vocab.keys())

In [None]:
# Materialise the token string for every id in the vocabulary.  The id range is
# taken from the tokenizer itself instead of a hard-coded vocabulary size, so this
# cell keeps working when `model_id` points at a model with a different vocabulary.
tokens = tokenizer.convert_ids_to_tokens(list(range(len(tokenizer))))
# Show only the first few entries; the full list is far too long to display.
tokens[:24]

In [None]:
tokens[-256:-240]

In [None]:
tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)

In [None]:
tokenizer.encode(" ", add_special_tokens=False)

In [None]:
tokens[9906]

In [None]:
tokenizer.tokenize("Hello, my dog is cute")

In [None]:
from IPython.display import display, HTML


def fix_token(token: str) -> str:
    """Make a token readable by showing its leading spaces as middle dots.

    The vocabulary spells a space as ``Ġ``.  Every ``Ġ`` in the token's leading
    run is replaced by ``·``; the remainder of the token is returned untouched.

    Args:
        token: Token string as produced by ``tokenizer.tokenize``.

    Returns:
        The token with its leading ``Ġ`` markers rendered as ``·``.  Tokens that
        do not start with ``Ġ`` (including the empty string) come back unchanged.
    """
    remainder = token.lstrip('Ġ')
    # Measure the leading run only.  Counting every Ġ in the token would slice
    # off real characters whenever a Ġ also appears later in the token.
    leading_spaces = len(token) - len(remainder)
    return '·' * leading_spaces + remainder

def visualize_tokenization(text: str, tokenizer=tokenizer, monospace=False):
    """Render `text` as HTML with every token on its own coloured background.

    Args:
        text: The string to tokenize and display.
        tokenizer: Object exposing ``tokenize(text)``.  Defaults to the notebook's
            module-level ``tokenizer`` as it was bound when this cell was run.
        monospace: When true, tokens are drawn in a fixed-width font.

    Returns:
        None.  The generated HTML is shown via ``IPython.display.display``.
    """
    # Imported locally so the cell does not depend on another cell's imports.
    from html import escape

    tokens = tokenizer.tokenize(text)
    palette = ["#FFB6C1", "#87CEFA", "#98FB98", "#FFDAB9", "#E6E6FA", "#FFDEAD", "#FFE4B5"]

    # Set font family based on monospace parameter
    font_family = "'Courier New', Courier, monospace" if monospace else "inherit"

    token_spans = []
    for i, token in enumerate(tokens):
        fixed_token = fix_token(token)

        # A token ending in the newline marker (Ċ) closes the visual line: one
        # trailing marker is removed from the text and a <br> follows the span.
        # The (possibly empty) span itself is still emitted.
        line_break = fixed_token.endswith('Ċ')
        if line_break:
            fixed_token = fixed_token[:-1]

        # Escape the token text.  Without this, tokens containing "<", ">" or "&"
        # (e.g. from `#include <stdio.h>`) are parsed as markup by the browser
        # and disappear from, or corrupt, the rendered output.
        token_spans.append(
            f'<span style="background-color: {palette[i % len(palette)]}; '
            f'color: black; padding: 2px 1px; border-radius: 0px; '
            f'display: inline-block; font-family: {font_family};">'
            f'{escape(fixed_token)}</span>'
        )
        if line_break:
            token_spans.append('<br>')

    html_content = "".join(token_spans)
    display(HTML(html_content))

In [None]:
visualize_tokenization("Hello, my dog is cute")

In [None]:
visualize_tokenization("The tokenization process is fun. Superfun")

In [None]:
visualize_tokenization("""Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.""", monospace=True)

In [None]:
visualize_tokenization('''def fetch(url):
    """Download a file and save it to the data directory."""
    file_path = os.path.join("data", os.path.basename(url))
    if os.path.exists(file_path):
        return None, None
    data = request.urlopen(url).read()
    return file_path, data
''', monospace=True)

In [None]:
visualize_tokenization('''#include <stdio.h>
int main(void)
{
	printf("Hello World!\n");
    return 0;
}''', monospace=True)

In [None]:
from transformers import AutoTokenizer
deepseek_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")

In [None]:
len(deepseek_tokenizer.vocab.keys())

In [None]:
# Show the token strings for the 20 ids immediately below 128815.
# NOTE(review): 128815 is presumably the vocabulary size reported by the previous
# cell — confirm, or derive it from the tokenizer instead of hard-coding it.
deepseek_tokenizer.convert_ids_to_tokens(list(range(128815-20, 128815)))

In [None]:
phi4_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-4")
len(phi4_tokenizer.vocab.keys())

In [None]:
import transformers
from transformers import LlamaForCausalLM

# Build a text-generation pipeline for `model_id`.  In the cells shown here it is
# used only as a way to obtain the loaded model object.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
)

# The annotation is an editor hint and is not checked at runtime.
# NOTE(review): assumes this checkpoint loads as a LlamaForCausalLM — confirm.
model: LlamaForCausalLM = pipeline.model

# Access the embedding layer
embedding_layer = model.get_input_embeddings()

In [None]:
# Get the full embedding matrix
# Detach the weight from autograd, move it to host memory, and convert it to a
# NumPy array so it can be handed to scikit-learn and indexed by token id.
embedding_matrix = embedding_layer.weight.detach().cpu().numpy()

# Display the array's shape.
# NOTE(review): presumably (vocabulary size, hidden size) — confirm from the output.
embedding_matrix.shape

In [None]:
from sklearn.neighbors import NearestNeighbors

# Index every row of the embedding matrix for nearest-neighbour queries under
# cosine distance.  `find_closest` and `find_closest_by_embedding` query this index.
nn = NearestNeighbors(metric="cosine").fit(embedding_matrix)

In [None]:
def find_closest(token: str, n_neighbors: int = 6, threshold: float = 0.5):
    """List the vocabulary tokens whose embeddings lie nearest to `token`'s.

    Args:
        token: Token string exactly as it appears in the vocabulary.
        n_neighbors: Number of neighbours requested from the `nn` index.
        threshold: Neighbours at this cosine distance or farther are discarded.

    Returns:
        Token strings of the surviving neighbours, in the order the index
        returned them, with the query token itself left out.
    """
    token_id = tokenizer.convert_tokens_to_ids(token)
    query = embedding_matrix[token_id].reshape(1, -1)
    distance, indices = nn.kneighbors(query, n_neighbors=n_neighbors)

    neighbours = []
    for candidate_id, candidate_distance in zip(indices[0], distance[0]):
        # Keep only neighbours strictly closer than the threshold ...
        if not candidate_distance < threshold:
            continue
        # ... and never report the query token as its own neighbour.
        if candidate_id == token_id:
            continue
        neighbours.append(tokenizer.convert_ids_to_tokens([candidate_id])[0])
    return neighbours

In [None]:
find_closest("Ġwoman")

In [None]:
# Look up the embedding rows of three tokens in one indexing call and unpack them.
# NOTE(review): the third name is `male_embedding` but the token looked up is
# "Ġdoor" — looks like an experimentation leftover; confirm which was intended.
married_embedding, woman_embedding, male_embedding = embedding_matrix[tokenizer.convert_tokens_to_ids(["Ġmarried", "Ġwoman", "Ġdoor"])]

In [None]:
def find_closest_by_embedding(embedding, n_neighbors=6):
    """List the vocabulary tokens nearest to an arbitrary embedding vector.

    Args:
        embedding: Array that is reshaped to a single query row for the `nn` index.
        n_neighbors: Number of neighbours to request.

    Returns:
        Token strings of the neighbours, in the order the index returned them.
        No distance filtering is applied.
    """
    query = embedding.reshape(1, -1)
    _, indices = nn.kneighbors(query, n_neighbors=n_neighbors)

    nearest_tokens = []
    for neighbour_id in indices[0]:
        nearest_tokens.append(tokenizer.convert_ids_to_tokens([neighbour_id])[0])
    return nearest_tokens

In [None]:
find_closest_by_embedding(married_embedding)

In [None]:
import numpy as np

# Combine the two embeddings element-wise.
# NOTE(review): the variable is called `difference` but the operation is a sum —
# confirm whether `-` was intended or the name should change.
difference = woman_embedding + married_embedding
# difference = difference / np.linalg.norm(difference)

In [None]:
# Look up the embedding rows for three tokens (each spelled with the vocabulary's
# leading-space marker Ġ) and unpack them into separate vectors.
king, man, woman = embedding_matrix[tokenizer.convert_tokens_to_ids(["Ġking", "Ġman", "Ġwoman"])]

In [None]:
import numpy as np

def spherical_subtraction(minuend, subtrahend, base=None, eps=1e-10):
    """Geodesic offset between two directions on the unit sphere, optionally applied to a third.

    All inputs are treated as directions and scaled to unit length first: the
    formulas below read the dot product as the cosine of the angle between the
    vectors, which only holds for unit vectors.

    Args:
        minuend: 1-D array; the point at which the tangent vector is computed.
        subtrahend: 1-D array; the point the tangent vector points towards.
        base: Optional 1-D array.  When given, the tangent vector is applied to
            it with the spherical exponential map and the moved point is returned.
        eps: Angles and norms below this value are treated as zero.

    Returns:
        Without ``base``: the spherical log-map vector at ``minuend`` pointing to
        ``subtrahend``; its length equals the angle between the two directions.
        With ``base``: the unit-scaled ``base`` moved along that vector (or the
        unit-scaled ``base`` itself when the vector is numerically zero).
    """
    def _unit(vec):
        # Scale to unit length; a (near-)zero vector has no direction and is
        # returned unchanged to avoid dividing by zero.
        length = np.linalg.norm(vec)
        return vec / length if length > eps else vec

    minuend = _unit(minuend)
    subtrahend = _unit(subtrahend)

    # Cosine of the angle; clipped because rounding can push it just outside
    # [-1, 1], where arccos would return NaN.
    dot = np.clip(np.dot(minuend, subtrahend), -1.0, 1.0)
    theta = np.arccos(dot)

    # Spherical logarithm map from minuend to subtrahend.
    # NOTE(review): for small angles this approximates `subtrahend - minuend`,
    # i.e. the opposite sign of an ordinary "minuend - subtrahend" — confirm
    # that this direction is the intended one.
    if theta < eps:
        diff = np.zeros_like(minuend)
    else:
        diff = (theta / np.sin(theta)) * (subtrahend - dot * minuend)

    if base is None:
        return diff

    base = _unit(base)
    norm_diff = np.linalg.norm(diff)
    if norm_diff < eps:
        return base
    # Spherical exponential map: move `base` by angle `norm_diff` along `diff`.
    # NOTE(review): `diff` is tangent at `minuend`, not at `base`, and is used
    # without parallel transport, so the result is not guaranteed to be exactly
    # unit length — confirm this approximation is acceptable.
    return np.cos(norm_diff) * base + np.sin(norm_diff) * (diff / norm_diff)


In [None]:
find_closest_by_embedding(spherical_subtraction(king, man, woman))