In [None]:
from torch import load, nn, cdist, mean
import torch.nn.functional as f
from transformers import AutoModel, AutoTokenizer

In [4]:
MODEL = "google/gemma-2b"
EMBEDDING_FILE = "embeddings_google_gemma-2b.pth"
DEVICE = "cuda:0" # run on my gpu

In [5]:
# load pretrained tokenizer from model
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [6]:
# load local embedding file
saved_embeddings = load(EMBEDDING_FILE)
if 'weight' not in saved_embeddings:
    raise KeyError("The saved embeddings file does not contain 'weight' key.")
embeddings_tensor = saved_embeddings['weight']

In [7]:
# Create an embedding only model object
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    def forward(self, input_ids):
        return self.embedding(input_ids)

# Create an empty LLM model that has the same shape with extracted embeddings
emb_model = EmbeddingModel(*embeddings_tensor.size())

# Give the LLM model the weight of extracted embeddings
emb_model.embedding.weight.data = embeddings_tensor
emb_model.eval()

EmbeddingModel(
  (embedding): Embedding(256000, 2048)
)

In [8]:
# Test the tokenizer layer
emotions = ["happy", "sad", "anxious", "calm", "depressed", "elated"]
emotion_tokens = {emo: tokenizer(emo, return_tensors='pt').input_ids for emo in emotions}
print("tokens: ", emotion_tokens)
print("2: ", tokenizer.decode(2))
print("3243: ", tokenizer.decode(3243))
print("3734: ", tokenizer.decode(3734))

print("521: ", tokenizer.decode(521))
print("840: ", tokenizer.decode(840))

tokens:  {'happy': tensor([[    2, 11896]]), 'sad': tensor([[    2, 37968]]), 'anxious': tensor([[    2,   481, 24192]]), 'calm': tensor([[     2, 116051]]), 'depressed': tensor([[   2, 3243, 3734]]), 'elated': tensor([[  2, 521, 840]])}
2:  <bos>
3243:  dep
3734:  ressed
521:  el
840:  ated


In [15]:
print(tokenizer("angry", return_tensors='pt'))
print(tokenizer("scared", return_tensors='pt'))
print(tokenizer("scared", return_tensors='pt'))

{'input_ids': tensor([[    2, 70709]]), 'attention_mask': tensor([[1, 1]])}
{'input_ids': tensor([[     2, 221959]]), 'attention_mask': tensor([[1, 1]])}


In [10]:
def word_to_embeddings(word: str):
    # tokenize, and squeeze tensor
    token_id = tokenizer(word, return_tensors="pt")['input_ids'].squeeze()

    # google gemma put <bos> in front of every word, remove <bos>
    token_id = token_id[1:]

    # make a forward pass through custom model
    embeddings = emb_model(token_id)

    return embeddings

print(word_to_embeddings("happy"))
print(word_to_embeddings("depressed"))

tensor([[ 0.2412, -0.0518,  0.0339,  ..., -0.0024,  0.0130, -0.0569]],
       grad_fn=<EmbeddingBackward0>)
tensor([[ 0.2578,  0.0223, -0.0747,  ..., -0.0771, -0.0238,  0.0240],
        [ 0.2207,  0.0103,  0.0303,  ..., -0.0311,  0.0850,  0.0674]],
       grad_fn=<EmbeddingBackward0>)


In [11]:
emo_embeddings = {emo: word_to_embeddings(emo) for emo in emotions}

In [12]:
print("After one forward pass through embedding layer.")
print("embeddings: ", {k: v.shape for k, v in emo_embeddings.items()})

After one forward pass through embedding layer.
embeddings:  {'happy': torch.Size([1, 2048]), 'sad': torch.Size([1, 2048]), 'anxious': torch.Size([2, 2048]), 'calm': torch.Size([1, 2048]), 'depressed': torch.Size([2, 2048]), 'elated': torch.Size([2, 2048])}


In [13]:
# Understand how the model splits words
dep = 'dep'
ressed = 'ressed'
dep_emb = word_to_embeddings(dep)
ressed_emb = word_to_embeddings(ressed)
print(dep_emb.shape, ressed_emb.shape)
print(f.cosine_similarity(dep_emb, ressed_emb, dim=1))
print(cdist(dep_emb, ressed_emb))

torch.Size([1, 2048]) torch.Size([1, 2048])
tensor([0.7595], grad_fn=<SumBackward1>)
tensor([[3.8702]], grad_fn=<CdistBackward0>)


In [14]:
# cosine similarity measures angle between vectors (direction, not magnitude), it measures semantic similarity
# The resulting similarity ranges from −1 meaning exactly opposite, to +1 meaning exactly the same, with 0 indicating orthogonality or decorrelation, while in-between values indicate intermediate similarity or dissimilarity.
# https://www.learndatasci.com/glossary/cosine-similarity/
# https://docs.pytorch.org/docs/stable/generated/torch.nn.CosineSimilarity.html
# Dimension variable expected to be in range of [-3, 2]

print("happy, sad cos: ", f.cosine_similarity(emo_embeddings["happy"], emo_embeddings["sad"]))
print("happy, sad euclid: ", cdist(emo_embeddings["happy"], emo_embeddings["sad"]))
print("happy, calm cos: ", f.cosine_similarity(emo_embeddings["happy"], emo_embeddings["calm"]))
print("happy, calm euclid: ", cdist(emo_embeddings["happy"], emo_embeddings["calm"]))
print("Note: depressed is split into dep and pressed.")
print("happy, depressed cos: ", f.cosine_similarity(emo_embeddings["happy"], emo_embeddings["depressed"]))
print("happy, depressed euclid: ", cdist(emo_embeddings["happy"], emo_embeddings["depressed"]))
print("Note: elated is split into el and ated.")
print("happy, elated cos: ", f.cosine_similarity(emo_embeddings["happy"], emo_embeddings["elated"]))
print("happy, elated euclid: ", cdist(emo_embeddings["happy"], emo_embeddings["elated"]))

happy, sad cos:  tensor([0.8699], grad_fn=<SumBackward1>)
happy, sad euclid:  tensor([[3.2664]], grad_fn=<CdistBackward0>)
happy, calm cos:  tensor([0.8769], grad_fn=<SumBackward1>)
happy, calm euclid:  tensor([[4.3406]], grad_fn=<CdistBackward0>)
Note: depressed is split into dep and pressed.
happy, depressed cos:  tensor([0.7885, 0.8162], grad_fn=<SumBackward1>)
happy, depressed euclid:  tensor([[3.6696, 3.4826]], grad_fn=<CdistBackward0>)
Note: elated is split into el and ated.
happy, elated cos:  tensor([0.7766, 0.7726], grad_fn=<SumBackward1>)
happy, elated euclid:  tensor([[3.6912, 3.6946]], grad_fn=<CdistBackward0>)
