**DeepSeek-R1-Distill-Qwen-7B**

In [7]:
import torch

def frobenius_distance(A, B):
    """Return the Frobenius norm of the element-wise difference ``A - B``.

    This equals the Euclidean distance between the two matrices when each
    one is viewed as a flat vector of its entries.
    """
    difference = A - B
    return torch.linalg.matrix_norm(difference, ord="fro")

In [8]:
import torch
import os
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

# TODO: repeat this comparison with other (possibly larger) models.
tokenizer_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
model_name = tokenizer_name

# Load the pre-trained causal language model.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Input token-embedding matrix.
embeddings = model.model.embed_tokens.weight

# Output projection ("unembedding") matrix of the LM head.
unembedding = model.lm_head.weight

print(f"Embedding matrix Shape:   {embeddings.shape}")
print(f"Unembedding matrix Shape: {unembedding.shape}")

# Compare the layouts of the two matrices.
if embeddings.shape == unembedding.t().shape:
    print("- The matrices have transposed shapes.")
elif embeddings.shape == unembedding.shape:
    print("- The matrices have the SAME shape (not transposed).")
else:
    print("- The matrices have different shapes.")

# Compare identity first, then values. The shape guard matters because
# torch.allclose raises on shapes that cannot be broadcast together (for
# example the transposed case handled above) instead of returning False.
if embeddings is unembedding:
    print("- Weights are TIED (Same object in memory).")
elif embeddings.shape == unembedding.shape and torch.allclose(embeddings, unembedding, atol=1e-5):
    print("- Weights are distinct objects but are the exact same.")
else:
    print("- Weights are UNTIED and distinct.")

print(f"DeepSeek-R1-Distill-Qwen-7B Config 'tie_word_embeddings':     {model.config.tie_word_embeddings}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding matrix Shape:   torch.Size([152064, 3584])
Unembedding matrix Shape: torch.Size([152064, 3584])
- The matrices have the SAME shape (not transposed).
- Weights are UNTIED and distinct.
DeepSeek-R1-Distill-Qwen-7B Config 'tie_word_embeddings':     False


In [9]:
# The weights are trainable parameters, so without no_grad() every result here
# would carry autograd history. This is a read-only measurement.
with torch.no_grad():
    distance = frobenius_distance(embeddings, unembedding)
    emb_mat_norm = torch.linalg.matrix_norm(embeddings, ord='fro')
    # Distance expressed as a fraction of the embedding matrix's own norm.
    rel_distance = distance / emb_mat_norm

# .item() prints the plain number rather than the tensor repr.
print(f"The frobenius distance from the embedding to the unembedding matrix of DeepSeek-R1-Distill-Qwen-7B is {distance.item()}.")
print(f"The relative frobenius distance from the embedding to the unembedding matrix of DeepSeek-R1-Distill-Qwen-7B is {rel_distance.item()}.")

The frobenius distance from the embedding to the unembedding matrix of DeepSeek-R1-Distill-Qwen-7B is 534.2822875976562.
The relative frobenius distance from the embedding to the unembedding matrix of DeepSeek-R1-Distill-Qwen-7B is 1.3745619058609009.


In [None]:
#ALWAYS RUN THIS CELL FIRST
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from load_embeddings import load_embedding_model
from utils import find_similar_embeddings, prompt_to_embeddings, find_similar_logits, get_token_embedding, token_len_one_verifier, test_combinations

# Build the model/tokenizer pair via the project helper.
# NOTE(review): the two integers equal the (152064, 3584) matrix shape printed
# by the comparison cell; the .pth names presumably point at saved embedding and
# unembedding weights — confirm against load_embeddings.load_embedding_model.
model, tokenizer = load_embedding_model(
    152064,
    3584,
    "deepseek-ai_DeepSeek-R1-Distill-Qwen-7B_embeddings_qwen.pth",
    "deepseek-ai_DeepSeek-R1-Distill-Qwen-7B_unembeddings_qwen.pth",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
)
# Kept as the last expression so the cell still displays the module summary.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


SimpleModel(
  (embedding): Embedding(152064, 3584)
  (unembedding): Linear(in_features=3584, out_features=152064, bias=False)
)

**Colors**

In [None]:
# Candidate color words.
colors = [
    "red", "blue", "green", "yellow", "orange", "purple", "pink", "brown", "black", "white",
    "gray", "cyan", "gold", "silver", "cream", "tan", "amber", "azure", "rose", "ruby",
]

# Keep only the words accepted by token_len_one_verifier, in their original order.
one_tok_colors = [word for word in colors if token_len_one_verifier(tokenizer, word)]

print(one_tok_colors)
print(len(colors))
print(len(one_tok_colors))

# Map each kept word to the value returned by get_token_embedding.
color_embeddings = {
    word: get_token_embedding(model, tokenizer, word)
    for word in one_tok_colors
}

['red', 'blue', 'green', 'yellow', 'orange', 'purple', 'pink', 'brown', 'black', 'white', 'gray', 'cyan', 'gold', 'silver', 'cream', 'tan', 'amber', 'azure', 'rose', 'ruby']
20
20


In [None]:
# Sweep every combination size from 1 through 20 (inclusive).
combination_sizes = [*range(1, 21)]
results = test_combinations(
    model,
    tokenizer,
    one_tok_colors,
    color_embeddings,
    combination_sizes,
    30,  # NOTE(review): meaning of this argument is defined in utils.test_combinations — confirm
)


Combination size: 1
0.0


Combination size: 2
0.0


Combination size: 3
0.0


Combination size: 4
0.0


Combination size: 5
0.0


Combination size: 6
0.0


Combination size: 7
0.0


Combination size: 8
0.0


Combination size: 9
0.0


Combination size: 10
0.0


Combination size: 11
0.0


Combination size: 12
0.0


Combination size: 13
0.0


Combination size: 14
0.0


Combination size: 15
0.0


Combination size: 16
0.0


Combination size: 17
0.0


Combination size: 18
0.0


Combination size: 19
0.0


Combination size: 20
0.0



**Animals**

In [None]:
# Candidate animal words.
animals = [
    "cat", "dog", "lion", "bear", "wolf", "fox", "cow", "pig", "horse", "deer",
    "mouse", "rat", "rabbit", "monkey", "snake", "frog", "duck", "hawk", "fish", "owl",
]

# Keep only the words accepted by token_len_one_verifier, in their original order.
one_tok_animals = [word for word in animals if token_len_one_verifier(tokenizer, word)]

print(one_tok_animals)
print(len(animals))
print(len(one_tok_animals))

# Map each kept word to the value returned by get_token_embedding.
animal_embeddings = {
    word: get_token_embedding(model, tokenizer, word)
    for word in one_tok_animals
}

['cat', 'dog', 'lion', 'bear', 'wolf', 'fox', 'cow', 'pig', 'horse', 'deer', 'mouse', 'rat', 'rabbit', 'monkey', 'snake', 'frog', 'duck', 'hawk', 'fish', 'owl']
20
20


In [None]:
# Sweep every combination size from 1 through 20 (inclusive).
combination_sizes = [*range(1, 21)]
results = test_combinations(
    model,
    tokenizer,
    one_tok_animals,
    animal_embeddings,
    combination_sizes,
    30,  # NOTE(review): meaning of this argument is defined in utils.test_combinations — confirm
)


Combination size: 1
0.0


Combination size: 2
0.0


Combination size: 3
0.0


Combination size: 4
0.0


Combination size: 5
0.0


Combination size: 6
0.0


Combination size: 7
0.0


Combination size: 8
0.0


Combination size: 9
0.0


Combination size: 10
0.0


Combination size: 11
0.0


Combination size: 12
0.0


Combination size: 13
0.0


Combination size: 14
0.0


Combination size: 15
0.0


Combination size: 16
0.0


Combination size: 17
0.0


Combination size: 18
0.0


Combination size: 19
0.0


Combination size: 20
0.0



**Common verbs**

In [None]:
# Candidate common verbs.
verbs = [
    "run", "walk", "jump", "sit", "stand", "sleep", "eat", "drink", "read", "write",
    "open", "close", "listen", "watch", "play", "move", "stop", "go", "come", "think",
]

# Keep only the words accepted by token_len_one_verifier, in their original order.
one_tok_verbs = [word for word in verbs if token_len_one_verifier(tokenizer, word)]

print(one_tok_verbs)
print(len(verbs))
print(len(one_tok_verbs))

# Map each kept word to the value returned by get_token_embedding.
verb_embeddings = {
    word: get_token_embedding(model, tokenizer, word)
    for word in one_tok_verbs
}

['run', 'walk', 'jump', 'sit', 'stand', 'sleep', 'eat', 'drink', 'read', 'write', 'open', 'close', 'listen', 'watch', 'play', 'move', 'stop', 'go', 'come', 'think']
20
20


In [None]:
# Sweep every combination size from 1 through 20 (inclusive).
combination_sizes = [*range(1, 21)]
results = test_combinations(
    model,
    tokenizer,
    one_tok_verbs,
    verb_embeddings,
    combination_sizes,
    30,  # NOTE(review): meaning of this argument is defined in utils.test_combinations — confirm
)


Combination size: 1
0.0


Combination size: 2
0.0


Combination size: 3
0.0


Combination size: 4
0.0


Combination size: 5
0.0


Combination size: 6
0.0


Combination size: 7
0.0


Combination size: 8
0.0


Combination size: 9
0.0


Combination size: 10
0.0


Combination size: 11
0.0


Combination size: 12
0.0


Combination size: 13
0.0


Combination size: 14
0.0


Combination size: 15
0.0


Combination size: 16
0.0


Combination size: 17
0.0


Combination size: 18
0.0


Combination size: 19
0.0


Combination size: 20
0.0



**Geometric objects**

In [None]:
# Candidate geometric-object words.
geo_objects = [
    "circle", "square", "triangle", "rectangle", "oval", "cube", "cone", "sphere", "ellipse", "line",
    "curve", "ring", "dot", "cross", "loop", "polygon", "point", "plane", "edge", "angle",
]

# Keep only the words accepted by token_len_one_verifier, in their original order.
one_tok_geo_objects = [word for word in geo_objects if token_len_one_verifier(tokenizer, word)]

print(one_tok_geo_objects)
print(len(geo_objects))
print(len(one_tok_geo_objects))

# Map each kept word to the value returned by get_token_embedding.
geo_object_embeddings = {
    word: get_token_embedding(model, tokenizer, word)
    for word in one_tok_geo_objects
}

['circle', 'square', 'triangle', 'rectangle', 'oval', 'cube', 'cone', 'sphere', 'ellipse', 'line', 'curve', 'ring', 'dot', 'cross', 'loop', 'polygon', 'point', 'plane', 'edge', 'angle']
20
20


In [None]:
# Sweep every combination size from 1 through 20 (inclusive).
combination_sizes = [*range(1, 21)]
results = test_combinations(
    model,
    tokenizer,
    one_tok_geo_objects,
    geo_object_embeddings,
    combination_sizes,
    30,  # NOTE(review): meaning of this argument is defined in utils.test_combinations — confirm
)


Combination size: 1
0.0


Combination size: 2
0.0


Combination size: 3
0.0


Combination size: 4
0.0


Combination size: 5
0.0


Combination size: 6
0.0


Combination size: 7
0.0


Combination size: 8
0.0


Combination size: 9
0.0


Combination size: 10
0.0


Combination size: 11
0.0


Combination size: 12
0.0


Combination size: 13
0.0


Combination size: 14
0.0


Combination size: 15
0.0


Combination size: 16
0.0


Combination size: 17
0.0


Combination size: 18
0.0


Combination size: 19
0.0


Combination size: 20
0.0



**Body parts**

In [None]:
# Candidate body-part words.
body_parts = [
    "head", "hand", "arm", "leg", "foot", "eye", "ear", "mouth", "back", "neck",
    "finger", "chest", "hip", "hair", "skin", "face", "throat", "brain", "heart", "lung",
]

# Keep only the words accepted by token_len_one_verifier, in their original order.
one_tok_body_parts = [word for word in body_parts if token_len_one_verifier(tokenizer, word)]

print(one_tok_body_parts)
print(len(body_parts))
print(len(one_tok_body_parts))

# Map each kept word to the value returned by get_token_embedding.
body_part_embeddings = {
    word: get_token_embedding(model, tokenizer, word)
    for word in one_tok_body_parts
}

['head', 'hand', 'arm', 'leg', 'foot', 'eye', 'ear', 'mouth', 'back', 'neck', 'finger', 'chest', 'hip', 'hair', 'skin', 'face', 'throat', 'brain', 'heart', 'lung']
20
20


In [None]:
# Sweep every combination size from 1 through 20 (inclusive).
combination_sizes = [*range(1, 21)]
results = test_combinations(
    model,
    tokenizer,
    one_tok_body_parts,
    body_part_embeddings,
    combination_sizes,
    30,  # NOTE(review): meaning of this argument is defined in utils.test_combinations — confirm
)


Combination size: 1
0.0


Combination size: 2
0.0


Combination size: 3
0.001


Combination size: 4
0.001


Combination size: 5
0.002


Combination size: 6
0.004


Combination size: 7
0.003


Combination size: 8
0.004


Combination size: 9
0.002


Combination size: 10
0.002


Combination size: 11
0.002


Combination size: 12
0.0


Combination size: 13
0.0


Combination size: 14
0.0


Combination size: 15
0.0


Combination size: 16
0.0


Combination size: 17
0.0


Combination size: 18
0.0


Combination size: 19
0.0


Combination size: 20
0.0



**Adjectives**

In [None]:
# Candidate adjectives.
adjectives = [
    "good", "bad", "new", "old", "big", "small", "long", "short", "high", "low",
    "wide", "deep", "hot", "cold", "dark", "light", "soft", "hard", "fast", "slow",
]

# Keep only the words accepted by token_len_one_verifier, in their original order.
one_tok_adjectives = [word for word in adjectives if token_len_one_verifier(tokenizer, word)]

print(one_tok_adjectives)
print(len(adjectives))
print(len(one_tok_adjectives))

# Map each kept word to the value returned by get_token_embedding.
adjective_embeddings = {
    word: get_token_embedding(model, tokenizer, word)
    for word in one_tok_adjectives
}

['good', 'bad', 'new', 'old', 'big', 'small', 'long', 'short', 'high', 'low', 'wide', 'deep', 'hot', 'cold', 'dark', 'light', 'soft', 'hard', 'fast', 'slow']
20
20


In [None]:
# Sweep every combination size from 1 through 20 (inclusive).
combination_sizes = [*range(1, 21)]
results = test_combinations(
    model,
    tokenizer,
    one_tok_adjectives,
    adjective_embeddings,
    combination_sizes,
    30,  # NOTE(review): meaning of this argument is defined in utils.test_combinations — confirm
)


Combination size: 1
0.0


Combination size: 2
0.010526315789473684


Combination size: 3
0.02000000000000001


Combination size: 4
0.023000000000000013


Combination size: 5
0.037000000000000026


Combination size: 6
0.028000000000000018


Combination size: 7
0.03300000000000002


Combination size: 8
0.03300000000000002


Combination size: 9
0.04200000000000003


Combination size: 10
0.04500000000000003


Combination size: 11
0.037000000000000026


Combination size: 12
0.05400000000000004


Combination size: 13
0.04000000000000003


Combination size: 14
0.02900000000000002


Combination size: 15
0.03200000000000002


Combination size: 16
0.016000000000000007


Combination size: 17
0.0


Combination size: 18
0.0


Combination size: 19
0.0


Combination size: 20
0.0



**Pronouns**

In [None]:
# Candidate pronouns.
pronouns = [
    "I", "you", "he", "she", "it", "we", "they", "me", "him", "her",
    "us", "them", "mine", "his", "its", "ours", "this", "that", "these", "those",
]

# Keep only the words accepted by token_len_one_verifier, in their original order.
one_tok_pronouns = [word for word in pronouns if token_len_one_verifier(tokenizer, word)]

print(one_tok_pronouns)
print(len(pronouns))
print(len(one_tok_pronouns))

# Map each kept word to the value returned by get_token_embedding.
pronoun_embeddings = {
    word: get_token_embedding(model, tokenizer, word)
    for word in one_tok_pronouns
}

['I', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'mine', 'his', 'its', 'ours', 'this', 'that', 'these', 'those']
20
20


In [None]:
# Sweep every combination size from 1 through 20 (inclusive).
combination_sizes = [*range(1, 21)]
results = test_combinations(
    model,
    tokenizer,
    one_tok_pronouns,
    pronoun_embeddings,
    combination_sizes,
    30,  # NOTE(review): meaning of this argument is defined in utils.test_combinations — confirm
)


Combination size: 1
0.0


Combination size: 2
0.0


Combination size: 3
0.001


Combination size: 4
0.001


Combination size: 5
0.002


Combination size: 6
0.0


Combination size: 7
0.001


Combination size: 8
0.0


Combination size: 9
0.001


Combination size: 10
0.0


Combination size: 11
0.0


Combination size: 12
0.0


Combination size: 13
0.0


Combination size: 14
0.0


Combination size: 15
0.0


Combination size: 16
0.0


Combination size: 17
0.0


Combination size: 18
0.0


Combination size: 19
0.0


Combination size: 20
0.0



**Prepositions & Conjunctions**

In [None]:
# Candidate prepositions and conjunctions.
p_cs = [
    "of", "in", "on", "at", "to", "for", "from", "with", "by", "up",
    "down", "out", "over", "and", "or", "but", "so", "if", "than", "as",
]

# Keep only the words accepted by token_len_one_verifier, in their original order.
one_tok_p_cs = [word for word in p_cs if token_len_one_verifier(tokenizer, word)]

print(one_tok_p_cs)
print(len(p_cs))
print(len(one_tok_p_cs))

# Map each kept word to the value returned by get_token_embedding.
p_c_embeddings = {
    word: get_token_embedding(model, tokenizer, word)
    for word in one_tok_p_cs
}

['of', 'in', 'on', 'at', 'to', 'for', 'from', 'with', 'by', 'up', 'down', 'out', 'over', 'and', 'or', 'but', 'so', 'if', 'than', 'as']
20
20


In [None]:
# Sweep every combination size from 1 through 20 (inclusive).
combination_sizes = [*range(1, 21)]
results = test_combinations(
    model,
    tokenizer,
    one_tok_p_cs,
    p_c_embeddings,
    combination_sizes,
    30,  # NOTE(review): meaning of this argument is defined in utils.test_combinations — confirm
)


Combination size: 1
0.0


Combination size: 2
0.0


Combination size: 3
0.0


Combination size: 4
0.0


Combination size: 5
0.0


Combination size: 6
0.0


Combination size: 7
0.0


Combination size: 8
0.0


Combination size: 9
0.0


Combination size: 10
0.0


Combination size: 11
0.0


Combination size: 12
0.0


Combination size: 13
0.0


Combination size: 14
0.0


Combination size: 15
0.0


Combination size: 16
0.0


Combination size: 17
0.0


Combination size: 18
0.0


Combination size: 19
0.0


Combination size: 20
0.0



**Random words**

In [None]:
import random
random.seed(42)

# Pool of every word used in the category experiments; a random sample of it
# forms the mixed-category word list.
alls = ["red", "blue", "green", "yellow", "orange", "purple", "pink", "brown", "black", "white", "gray", "cyan", "gold",
        "silver", "cream", "tan", "amber", "azure", "rose", "ruby", "cat", "dog", "lion", "bear", "wolf", "fox", "cow",
        "pig", "horse", "deer", "mouse", "rat", "rabbit", "monkey", "snake", "frog", "duck", "hawk", "fish", "owl", "run",
        "walk", "jump", "sit", "stand", "sleep", "eat", "drink", "read", "write", "open", "close", "listen", "watch", "play",
        "move", "stop", "go", "come", "think", "circle", "square", "triangle", "rectangle", "oval", "cube", "cone", "sphere",
        "ellipse", "line", "curve", "ring", "dot", "cross", "loop", "polygon", "point", "plane", "edge", "angle", "head", "hand",
        "arm", "leg", "foot", "eye", "ear", "mouth", "back", "neck", "finger", "chest", "hip", "hair", "skin", "face", "throat",
        "brain", "heart", "lung", "good", "bad", "new", "old", "big", "small", "long", "short", "high", "low", "wide", "deep",
        "hot", "cold", "dark", "light", "soft", "hard", "fast", "slow", "I", "you", "he", "she", "it", "we", "they", "me", "him",
        "her", "us", "them", "mine", "his", "its", "ours", "this", "that", "these", "those", "of", "in", "on", "at", "to", "for",
        "from", "with", "by", "up", "down", "out", "over", "and", "or", "but", "so", "if", "than", "as"]

# Sort first so the seeded sample does not depend on the order the list was typed in.
alls = sorted(alls)
# Draw 30 distinct words; from here on `alls` is the sample, not the full pool.
alls = random.sample(alls, 30)

# Keep only the words accepted by token_len_one_verifier. The comprehension
# variable is deliberately not named `all`: a module-level loop variable with
# that name would rebind the builtin all() for the rest of the kernel session.
one_tok_alls = [word for word in alls if token_len_one_verifier(tokenizer, word)]

print(one_tok_alls)
print(len(alls))
print(len(one_tok_alls))

# Map each kept word to the value returned by get_token_embedding.
all_embeddings = {c: get_token_embedding(model, tokenizer, c) for c in one_tok_alls}

['cube', 'at', 'horse', 'he', 'green', 'dot', 'cream', 'them', 'cold', 'play', 'back', 'azure', 'come', 'good', 'hand', 'snake', 'yellow', 'fox', 'we', 'plane', 'gray', 'read', 'hot', 'amber', 'eat', 'walk', 'mine', 'than', 'ear', 'to']
30
30


In [None]:
# Sweep every combination size from 1 through 20 (inclusive).
combination_sizes = [*range(1, 21)]
results = test_combinations(
    model,
    tokenizer,
    one_tok_alls,
    all_embeddings,
    combination_sizes,
    30,  # NOTE(review): meaning of this argument is defined in utils.test_combinations — confirm
)


Combination size: 1
0.0


Combination size: 2
0.0


Combination size: 3
0.0


Combination size: 4
0.0


Combination size: 5
0.0


Combination size: 6
0.0


Combination size: 7
0.0


Combination size: 8
0.001


Combination size: 9
0.0


Combination size: 10
0.0


Combination size: 11
0.0


Combination size: 12
0.0


Combination size: 13
0.0


Combination size: 14
0.0


Combination size: 15
0.0


Combination size: 16
0.0


Combination size: 17
0.0


Combination size: 18
0.0


Combination size: 19
0.0


Combination size: 20
0.0



In [None]:
from utils import find_similar_embeddings, prompt_to_embeddings, find_similar_logits, get_token_embedding

# Look up "red" and list its 20 nearest vocabulary entries.
# NOTE(review): the recorded scores look like cosine similarities (the word
# itself scores ~1.0) — confirm in utils.find_similar_embeddings.
embedding = get_token_embedding(model, tokenizer, "red")
similar = find_similar_embeddings(
    model,
    tokenizer,
    embedding,
    n=20,
)
print(similar)

[('red', 1.0000008344650269), (' red', 0.3085389733314514), ('RED', 0.26520079374313354), ('Red', 0.2539021372795105), (' Red', 0.24568769335746765), ('re', 0.21863503754138947), ('.red', 0.21691332757472992), ('红', 0.21409417688846588), ('res', 0.20547159016132355), ('reds', 0.19922350347042084), ('(red', 0.1967708170413971), ('med', 0.19023369252681732), ('blue', 0.1826041340827942), ('black', 0.1825292408466339), ('_red', 0.1798408478498459), (' pred', 0.17737917602062225), ('rel', 0.17565950751304626), ('led', 0.174979105591774), ('cred', 0.17025920748710632), ('ored', 0.16839295625686646)]


In [None]:
# Look up the three fruit words.
grape = get_token_embedding(model, tokenizer, "grape")
orange = get_token_embedding(model, tokenizer, "orange")
apple = get_token_embedding(model, tokenizer, "apple")

# Near-uniform blend of the three vectors; the weights sum to 1.
weighted_embedding_fruit = 0.33 * grape + 0.33 * orange + 0.34 * apple

# Top 20 (word, score) pairs for the blended vector.
top_words = find_similar_logits(
    model,
    tokenizer,
    weighted_embedding_fruit,
    n=20,
)

# One pair per line: word left-aligned in 15 columns, score to 4 decimals.
for w, s in top_words:
    print(f"{w:15s}  {s:.4f}")

 communicated    0.0619
 brew            0.0611
 Rich            0.0582
 threaded        0.0577
 Anthrop         0.0570
unts             0.0556
pro              0.0552
(".              0.0551
Monster          0.0535
acy              0.0531
 caster          0.0531
_running         0.0528
Mut              0.0527
 mistaken        0.0520
 dens            0.0517
 basically       0.0515
海拔               0.0515
不合理              0.0514
staff            0.0513
 Sylv            0.0511
