In [2]:
from torch import load, nn, cdist
from transformers import AutoModel, AutoTokenizer

In [3]:
# Hugging Face model id; the tokenizer is fetched under this name.
MODEL = "google/gemma-2b"
# Local file holding the saved embedding weights (a mapping with a 'weight' entry).
EMBEDDING_FILE = "embeddings_google_gemma-2b.pth"
DEVICE = "cuda:0" # intended GPU; NOTE(review): not referenced by any cell visible here - confirm it is used or remove

In [4]:
# Load the pretrained tokenizer published under MODEL; the cells below call it
# to turn words into token ids and to decode individual ids back to text.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [5]:
# Load the locally saved embedding weights.
#  - map_location="cpu": the file loads on machines without the GPU it may have
#    been saved from, and the tensor lands on the same device as the tokenizer's
#    output (CPU tensors).
#  - weights_only=True: restricts unpickling to tensors and plain containers, so
#    a tampered .pth file cannot execute arbitrary code while being loaded.
saved_embeddings = load(EMBEDDING_FILE, map_location="cpu", weights_only=True)
if 'weight' not in saved_embeddings:
    raise KeyError("The saved embeddings file does not contain 'weight' key.")

# The embedding matrix itself; the model cell below reads its size to build the lookup table.
embeddings_tensor = saved_embeddings['weight']

In [6]:
# Minimal module that consists of nothing but a token-embedding lookup table.
class EmbeddingModel(nn.Module):
    """Embedding-only model: maps token ids to their embedding vectors."""

    def __init__(self, vocab_size, embedding_dim):
        """Create a lookup table with `vocab_size` rows of `embedding_dim` values each."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, input_ids):
        """Return the embedding vectors for `input_ids` (one vector per id)."""
        looked_up = self.embedding(input_ids)
        return looked_up

# The extracted weights must be a 2-D (vocab_size, embedding_dim) matrix; fail
# with a clear message instead of an opaque argument-unpacking error.
if embeddings_tensor.dim() != 2:
    raise ValueError(
        "Expected a 2-D (vocab_size, embedding_dim) embedding matrix, "
        f"got shape {tuple(embeddings_tensor.shape)}."
    )

# Create an embedding-only model with the same shape as the extracted embeddings
emb_model = EmbeddingModel(*embeddings_tensor.size())

# Install the extracted weights as a proper, frozen Parameter rather than
# assigning through `.data`. The tensor is shared (not copied) and keeps its
# dtype; requires_grad=False means lookups no longer build an autograd graph,
# so downstream tensors will not carry a grad_fn.
emb_model.embedding.weight = nn.Parameter(embeddings_tensor, requires_grad=False)

# Last expression of the cell: switch to eval mode and display the module.
emb_model.eval()

EmbeddingModel(
  (embedding): Embedding(256000, 2048)
)

In [7]:
def word_to_embeddings(word: str, add_special_tokens: bool = True):
    """Tokenize `word` and return the embedding-layer output for its tokens.

    Args:
        word: Text to embed. A single word may be split into several tokens.
        add_special_tokens: Forwarded to the tokenizer. The default (True)
            keeps the original behaviour, where the tokenizer prepends its
            <bos> token, so position 0 of the result is the same <bos> vector
            for every word. Pass False to embed only the word's own tokens,
            which is usually what a word-to-word comparison wants.

    Returns:
        Tensor of shape (1, num_tokens, embedding_dim).

    Raises:
        ValueError: If tokenization yields no tokens (possible for empty
            input when special tokens are disabled).
    """
    # tokenize
    token_ids = tokenizer(
        word, return_tensors="pt", add_special_tokens=add_special_tokens
    )['input_ids']
    if token_ids.numel() == 0:
        raise ValueError(f"No tokens produced for {word!r}.")

    # The tokenizer returns CPU tensors; align them with wherever the embedding
    # weights live so the lookup also works if the model is moved to a GPU.
    token_ids = token_ids.to(emb_model.embedding.weight.device)

    # make a forward pass through custom model
    return emb_model(token_ids)

In [8]:
# Emotion words to probe.
emotions = ["happy", "sad", "anxious", "calm", "depressed", "elated"]
# Map each word to the tensor returned by word_to_embeddings (insertion order follows `emotions`).
emo_embeddings = dict(zip(emotions, map(word_to_embeddings, emotions)))

In [49]:
# Token ids the tokenizer produces for every emotion word.
emotion_tokens = dict(
    (emo, tokenizer(emo, return_tensors='pt')['input_ids']) for emo in emotions
)
print("tokens: ", emotion_tokens)

# Decode a few individual ids to see which text piece each one stands for.
for tid in (2, 3243, 3734):
    print(f"{tid}: ", tokenizer.decode(tid))

# Shape of each word's embedding-layer output.
print("After one forward pass through embedding layer.")
embedding_shapes = {word: emb.shape for word, emb in emo_embeddings.items()}
print("embeddings: ", embedding_shapes)

tokens:  {'happy': tensor([[    2, 11896]]), 'sad': tensor([[    2, 37968]]), 'anxious': tensor([[    2,   481, 24192]]), 'calm': tensor([[     2, 116051]]), 'depressed': tensor([[   2, 3243, 3734]]), 'elated': tensor([[  2, 521, 840]])}
2:  <bos>
3243:  dep
3734:  ressed
After one forward pass through embedding layer.
embeddings:  {'happy': torch.Size([1, 2, 2048]), 'sad': torch.Size([1, 2, 2048]), 'anxious': torch.Size([1, 3, 2048]), 'calm': torch.Size([1, 2, 2048]), 'depressed': torch.Size([1, 3, 2048]), 'elated': torch.Size([1, 3, 2048])}


In [46]:
# Cosine similarity looks only at the angle between two vectors (direction, not
# magnitude), which makes it a common proxy for semantic similarity.
#   https://www.learndatasci.com/glossary/cosine-similarity/
#   https://docs.pytorch.org/docs/stable/generated/torch.nn.CosineSimilarity.html
# The embeddings are 3-D, so `dim` must lie in [-3, 2]; dim=2 is the embedding axis.
cos = nn.CosineSimilarity(dim=2)

# One similarity value per token position. Note: despite its name, `cos_dist`
# holds a similarity (1.0 = same direction), not a distance.
cos_dist = cos(emo_embeddings["happy"], emo_embeddings["sad"])
print(cos_dist, cos_dist.shape, sep="\n")


tensor([[1.0000, 0.8699]], grad_fn=<SumBackward1>)
torch.Size([1, 2])


In [47]:
# Euclidean distance is sensitive to magnitude differences. cdist returns the
# distance between every token of "happy" and every token of the other word,
# so the words may have different token counts.
for other_word in ("sad", "depressed"):
    euclid_dist = cdist(emo_embeddings["happy"], emo_embeddings[other_word])
    print(euclid_dist, euclid_dist.shape, sep="\n")


tensor([[[ 0.0000, 15.3405],
         [15.0511,  3.2664]]], grad_fn=<CdistBackward0>)
torch.Size([1, 2, 2])
tensor([[[ 0.0000, 15.0569, 15.0977],
         [15.0511,  3.6696,  3.4826]]], grad_fn=<CdistBackward0>)
torch.Size([1, 2, 3])
