In [None]:
!pip install -U transformers

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login from inside the notebook.
# NOTE(review): presumably needed so the downloads below are authorised —
# confirm whether the model repo is actually gated/private.
notebook_login()

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the checkpoint whose embedding matrix is inspected further down.
tok = AutoTokenizer.from_pretrained("microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned")

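## Tokenization

Split a few similar-looking words into sub-word tokens and print the vocabulary id of each piece.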

In [None]:
# For every probe word, print its sub-word pieces next to their vocabulary ids.
words = ["Lanism", "Helanism", "Hellenism"]
for word in words:
    sub_tokens = tok.tokenize(word)
    token_ids = tok.convert_tokens_to_ids(sub_tokens)
    print(word, "->", sub_tokens, token_ids)

## Nearest Neighbours

In [None]:
!pip -q install -U huggingface_hub safetensors torch

import json, torch, torch.nn.functional as F
from huggingface_hub import hf_hub_download
from safetensors.torch import safe_open

In [None]:
# Single source of truth for the checkpoint repository used by both downloads below.
REPO_ID = "microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned"

# The index file maps each parameter name to the shard file that stores it, so
# only the shard holding the embedding matrix has to be downloaded.
index_path = hf_hub_download(repo_id=REPO_ID, filename="model.safetensors.index.json")
with open(index_path, "r", encoding="utf-8") as index_file:
    index = json.load(index_file)

# Candidate parameter names, tried in order; the first one present in the index wins.
# NOTE(review): "lm_head.weight" is the last-resort fallback and is presumably an
# output projection rather than the input embedding table — confirm it is acceptable.
CAND_KEYS = [
    "model.embed_tokens.weight",
    "model.model.embed_tokens.weight",
    "text_model.embed_tokens.weight",
    "embed_tokens.weight",
    "model.decoder.embed_tokens.weight",
    "lm_head.weight"
]

target_key = next((k for k in CAND_KEYS if k in index["weight_map"]), None)
assert target_key is not None, "Cannot find embedding weight key in index.json"
shard_file = index["weight_map"][target_key]
shard_path = hf_hub_download(repo_id=REPO_ID, filename=shard_file)

# Read just the one tensor from the shard, on CPU.
with safe_open(shard_path, framework="pt", device="cpu") as shard:
    E = shard.get_tensor(target_key)  # [V, D]
print("Embedding shape:", tuple(E.shape), E.dtype)

# Keep a float16 copy for the similarity search and drop the reference to the
# originally loaded tensor.
E16 = E.half()
E = None  # release

def nn_for_token_str(tok, token_str, k=10, emb=None):
    """Return the vocabulary tokens whose embeddings are most cosine-similar to `token_str`.

    Args:
        tok: Tokenizer exposing `convert_tokens_to_ids`, `convert_ids_to_tokens`
            and `unk_token_id`.
        token_str: A single vocabulary token (a token string, not arbitrary text).
        k: Number of neighbours requested. Capped at the number of embedding
            rows so an oversized `k` no longer makes `torch.topk` raise.
        emb: Embedding matrix with one row per token id. Defaults to the
            notebook-level `E16` when omitted.

    Returns:
        A list of `(token, cosine_similarity)` tuples ordered from most to least
        similar. The query token's own row is part of the candidate set, so it
        is normally the first entry. An empty list is returned (after printing
        a warning) when the token cannot be resolved to a known id.
    """
    if emb is None:
        emb = E16  # fall back to the matrix loaded earlier in the notebook

    tid = tok.convert_tokens_to_ids(token_str)
    # Tokenizers without an <unk> token may report an unknown token as None.
    if tid is None or tid == tok.unk_token_id:
        print(f"[warn] '{token_str}' is <unk> for this tokenizer.")
        return []

    emb_f32 = emb.float()  # do the arithmetic in float32 even if stored as half
    # Cosine similarity = dot product / (norm * norm). Dividing by the row norms
    # avoids materialising a second, normalised copy of the whole matrix.
    eps = 1e-12  # same floor that F.normalize applies to the norm by default
    norms = emb_f32.norm(dim=-1).clamp_min(eps)     # [V]
    sims = (emb_f32 @ emb_f32[tid]) / (norms * norms[tid])  # [V]

    topk = torch.topk(sims, k=min(k, sims.numel()))
    ids = topk.indices.tolist()
    vals = topk.values.tolist()
    return [(tok.convert_ids_to_tokens(i), float(s)) for i, s in zip(ids, vals)]

# Print the nearest-neighbour table for each probe token; tokens for which the
# lookup returns nothing are skipped.
targets = ["Hel", "anism"]
for t in targets:
    neighbours = nn_for_token_str(tok, t, k=10)
    if neighbours:
        print(f"\nTop-10 neighbors for '{t}':")
        for nn_tok, score in neighbours:
            print(f"  {nn_tok:>20s}  {score:.4f}")