In [151]:
import gc

gc.collect()

4652

In [152]:
from torch import load, nn, sum, cdist
import torch.nn.functional as f
from transformers import AutoTokenizer
from pandas import DataFrame
import plotly.express as px
from collections import OrderedDict
from itertools import combinations
from utils import *
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [153]:
MODEL = "google/gemma-2-2b"
EMBEDDING_FILE = "embeddings_google_gemma-2-2b.pth"
DEVICE = "cuda:0"  # run on my gpu

In [154]:
# load pretrained tokenizer from model
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# load local embedding file
saved_embeddings = load(EMBEDDING_FILE)
if 'weight' not in saved_embeddings:
    raise KeyError("The saved embeddings file does not contain 'weight' key.")
embeddings_tensor = saved_embeddings['weight']


# Create an embedding only model object
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    def forward(self, input_ids):
        return self.embedding(input_ids)


# Create an empty LLM model that has the same shape with extracted embeddings
emb_model = EmbeddingModel(*embeddings_tensor.size()).to(DEVICE)

# Give the LLM model the weight of extracted embeddings
emb_model.embedding.weight.data = embeddings_tensor
emb_model.eval()

EmbeddingModel(
  (embedding): Embedding(256000, 2304)
)

In [155]:
print(tokenizer("depressed", return_tensors='pt').input_ids)
print(tokenizer("elated", return_tensors='pt').input_ids)
print(tokenizer.decode(3243))
print(tokenizer.decode(3734))
print(tokenizer.decode(521))
print(tokenizer.decode(840))

tensor([[   2, 3243, 3734]])
tensor([[  2, 521, 840]])
dep
ressed
el
ated


In [156]:
emotions = ["happy", "elated", "peaceful", "calm", "content", "relaxed", "sad", "anxious", "fearful", "scared",
            "depressed", "lonely", "bitter", "jealous", "angry", "guilty", "passionate", "brave", "confident"]

# squeeze tensor dim
emo_tokens = OrderedDict({emo: tokenizer(emo, return_tensors='pt').input_ids for emo in emotions})
print(emo_tokens)

OrderedDict({'happy': tensor([[    2, 11896]]), 'elated': tensor([[  2, 521, 840]]), 'peaceful': tensor([[     2, 211749]]), 'calm': tensor([[     2, 116051]]), 'content': tensor([[   2, 3312]]), 'relaxed': tensor([[     2, 163861]]), 'sad': tensor([[    2, 37968]]), 'anxious': tensor([[    2,   481, 24192]]), 'fearful': tensor([[    2, 71339,  1329]]), 'scared': tensor([[     2, 221959]]), 'depressed': tensor([[   2, 3243, 3734]]), 'lonely': tensor([[     2, 151738]]), 'bitter': tensor([[     2, 158930]]), 'jealous': tensor([[    2,  1792, 22108]]), 'angry': tensor([[    2, 70709]]), 'guilty': tensor([[     2, 206971]]), 'passionate': tensor([[    2, 94364,   607]]), 'brave': tensor([[     2, 149142]]), 'confident': tensor([[     2, 131181]])})


In [157]:
emo_embs = OrderedDict({k: emb_model(v) for k, v in emo_tokens.items()})

In [158]:
# sanity check
print(f.cosine_similarity(emo_embs['happy'], emo_embs['happy'], dim=0))

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]], grad_fn=<SumBackward1>)


In [159]:
emo_pairs = list(combinations(emo_embs, r=2))

In [160]:
# put all the numbers in pandas df then plot

df = DataFrame(index=emotions, columns=emotions)
for p in combinations(emotions, r=2):
    df[p[0]][p[1]] = 1 - f.cosine_similarity(sum(emo_embs[p[0]], 1), sum(emo_embs[p[1]], 1), dim=1).item()
    df[p[1]][p[0]] = 1 - f.cosine_similarity(sum(emo_embs[p[1]], 1), sum(emo_embs[p[0]], 1), dim=1).item()

fig1 = px.imshow(df, title="Cosine Distance between Emotions", labels=dict(color="Cosine Distance"), text_auto=True,
                 aspect="auto", color_continuous_scale='RdBu')
fig1.show()

In [161]:
# put all the numbers in pandas df then plot
df2 = DataFrame(index=emotions, columns=emotions)
for p in combinations(emotions, r=2):
    df2[p[0]][p[1]] = cdist(sum(emo_embs[p[0]], 1), sum(emo_embs[p[1]], 1), p=2).squeeze().item()
    df2[p[1]][p[0]] = cdist(sum(emo_embs[p[1]], 1), sum(emo_embs[p[0]], 1), p=2).squeeze().item()

fig2 = px.imshow(df2, title="Euclidean Distance between Emotions", labels=dict(color="Euclidean Distance"),
                 text_auto=True, aspect="auto", color_continuous_scale='RdBu')
fig2.show()

In [162]:
figures_to_html([fig1, fig2], filename="results/03.emotion_distances.html")