In [2]:
from torch import save, load, nn
from transformers import AutoModel, AutoTokenizer

In [3]:
# Hugging Face model id; the tokenizer is loaded from this checkpoint name.
MODEL = "google/gemma-2b"
# Local file holding the saved embedding weights (read with torch's `load`).
EMBEDDING_FILE = "embeddings_google_gemma-2b.pth"
# Device string for the author's GPU.
# NOTE(review): no cell visible here references DEVICE -- confirm whether the
# model and token ids are meant to be moved onto it.
DEVICE = "cuda:0" # run on my gpu

In [4]:
# Load the pretrained tokenizer that belongs to MODEL, so that words are split
# into the token ids that index this model's embedding table.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [5]:
# Load the locally saved embedding weights.
# map_location="cpu" keeps the tensor on the CPU regardless of the device it was
# saved from: the file then also loads on machines without a GPU, and the weights
# end up on the same device as the CPU token ids produced by the tokenizer.
# NOTE: torch's `load` unpickles the file, so only open checkpoints you trust.
saved_embeddings = load(EMBEDDING_FILE, map_location="cpu")
if 'weight' not in saved_embeddings:
    raise KeyError("The saved embeddings file does not contain 'weight' key.")
# Matrix of embedding vectors, one row per token id.
embeddings_tensor = saved_embeddings['weight']

In [6]:
# Embedding-only model: a module that holds nothing but a token lookup table.
class EmbeddingModel(nn.Module):
    """Minimal module wrapping a single token-embedding lookup table.

    Args:
        vocab_size: Number of rows in the table (one per token id).
        embedding_dim: Length of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # The attribute name `embedding` shows up in the module repr and in the
        # parameter names, so it is kept unchanged.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, input_ids):
        """Return the embedding vectors selected by ``input_ids``."""
        vectors = self.embedding(input_ids)
        return vectors

# Build an embedding-only model whose lookup table has the same shape as the
# extracted embeddings, then install the extracted weights into it.
if embeddings_tensor.dim() != 2:
    # Fail early with a clear message instead of a confusing TypeError from the
    # star-unpacking below when the saved tensor is not a matrix.
    raise ValueError(
        "Expected a 2-D (vocab_size, embedding_dim) embedding matrix, "
        f"got shape {tuple(embeddings_tensor.shape)}."
    )
emb_model = EmbeddingModel(*embeddings_tensor.shape)
# Swap in the saved weights as a proper Parameter rather than assigning through
# `.data`, which bypasses autograd's bookkeeping. The table is only used for
# lookups, so it is frozen: outputs then carry no autograd graph.
emb_model.embedding.weight = nn.Parameter(embeddings_tensor, requires_grad=False)
# Keep this call last: as the cell's final expression it displays the module repr.
emb_model.eval()

EmbeddingModel(
  (embedding): Embedding(256000, 2048)
)

In [7]:
def word_to_embeddings(word: str):
    """Look up the embedding vectors for the tokens of ``word``.

    Args:
        word: Text to tokenize with the module-level ``tokenizer``.

    Returns:
        The output of ``emb_model`` for the token ids of ``word``: one embedding
        vector per token id the tokenizer produced. A single word can map to
        several ids (presumably including any special tokens the tokenizer
        adds -- verify for the model in use).
    """
    # Tokenize; `return_tensors="pt"` yields the ids as a PyTorch tensor.
    token_ids = tokenizer(word, return_tensors="pt")['input_ids']

    # The tokenizer builds its tensors on the CPU; move the ids to wherever the
    # model's weights live so the lookup keeps working if the model is on a GPU.
    token_ids = token_ids.to(next(emb_model.parameters()).device)

    # Forward pass through the embedding-only model.
    return emb_model(token_ids)

In [9]:
# Emotion words to compare. Each word is embedded independently, so the
# resulting list is index-aligned with `emotions`.
emotions = [
    "happy",
    "sad",
    "anxious",
    "calm",
    "depressed",
    "elated",
]
emo_embeddings = list(map(word_to_embeddings, emotions))

In [None]:
def cosine_sim(word1: str, word2: str):
    # cosine similarity measures angle between vectors (direction, not magnitude), it measures semantic similarity
    # https://www.learndatasci.com/glossary/cosine-similarity/

