In [2]:
# Force a garbage-collection pass before the heavy cells run.
# gc.collect() is the last expression, so the notebook displays its return
# value (the number of unreachable objects found) as the cell output.
import gc
gc.collect()

601

In [None]:
from utils import *
from torch import load, nn
import torch.nn.functional as f
from transformers import AutoTokenizer
from pandas import DataFrame
import plotly.express as px
from collections import OrderedDict
from itertools import combinations
import warnings
# Silence every FutureWarning for the rest of the notebook.
# NOTE(review): a blanket filter can also hide deprecation notices that are
# worth acting on (e.g. pandas chained-assignment warnings) — confirm it is
# still wanted.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# --- Notebook configuration -------------------------------------------------
# Hugging Face model id whose tokenizer is used below.
MODEL = "google/gemma-2-2b"
# Local embedding dump; the file name is the model id with "/" turned into "_".
EMBEDDING_FILE = "embeddings_{}.pth".format(MODEL.replace("/", "_"))
# Target device for the embedding lookup (first CUDA GPU).
DEVICE = "cuda:0"

In [5]:
# load pretrained tokenizer from model
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Load the local embedding file straight onto DEVICE. Without map_location the
# tensor comes back on whatever device it was saved from, which silently
# overrides the DEVICE setting once it is installed as the model weight.
# NOTE(review): torch.load unpickles the file — only load embedding dumps you
# produced yourself or otherwise trust.
saved_embeddings = load(EMBEDDING_FILE, map_location=DEVICE)
if 'weight' not in saved_embeddings:
    raise KeyError("The saved embeddings file does not contain 'weight' key.")
embeddings_tensor = saved_embeddings['weight']


class EmbeddingModel(nn.Module):
    """Embedding-only module: maps token ids to their embedding vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        weight: Optional pretrained tensor of shape (vocab_size, embedding_dim).
            When given, the table is built directly from it (frozen, keeping
            its dtype and device) instead of allocating and randomly
            initialising a table that would be thrown away immediately.

    Raises:
        ValueError: If ``weight`` does not have shape (vocab_size, embedding_dim).
    """

    def __init__(self, vocab_size, embedding_dim, weight=None):
        super().__init__()
        if weight is None:
            self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        else:
            if tuple(weight.shape) != (vocab_size, embedding_dim):
                raise ValueError(
                    f"weight has shape {tuple(weight.shape)}, expected {(vocab_size, embedding_dim)}"
                )
            # detach() so a saved nn.Parameter is accepted as well; freeze=True
            # because the table is only used for lookups, never trained.
            self.embedding = nn.Embedding.from_pretrained(weight.detach(), freeze=True)

    def forward(self, input_ids):
        """Return the embedding vectors for ``input_ids``.

        The ids are moved to the device of the embedding table first, so
        callers may pass the CPU tensors produced by the tokenizer.
        """
        return self.embedding(input_ids.to(self.embedding.weight.device))


# Build the model around the extracted embeddings and keep it on DEVICE
emb_model = EmbeddingModel(*embeddings_tensor.size(), weight=embeddings_tensor).to(DEVICE)
emb_model.eval()

EmbeddingModel(
  (embedding): Embedding(256000, 2304)
)

In [6]:
# Show how the two problem words are tokenized (ids include the leading <bos>) ...
for word in ("depressed", "elated"):
    print(tokenizer(word, return_tensors='pt').input_ids)

# ... then decode each sub-word id back to text.
for token_id in (3243, 3734, 521, 840):
    print(tokenizer.decode(token_id))

tensor([[   2, 3243, 3734]])
tensor([[  2, 521, 840]])
dep
ressed
el
ated


In [10]:
# Whole-word emotions; these become the rows/columns of the distance heatmaps.
emotions = ["happy", "calm", "content", "sad", "anxious", "lonely", "bitter", "angry", "scared", "fearful"]
# "depressed" and "elated" tokenize into two pieces each, so the pieces are
# looked up individually and averaged later; they need their own entries here.
subword_pieces = ["el", "ated", "dep", "ressed"]

# Raw tokenizer output per word: shape (1, n_tokens), first token is <bos>.
token_ids = {word: tokenizer(word, return_tensors='pt').input_ids for word in emotions + subword_pieces}
print(token_ids)

# More than <bos> + one token means the word was split; only its first piece
# is kept below, so flag it instead of truncating silently.
multi_token_words = [word for word, ids in token_ids.items() if ids.shape[-1] > 2]
if multi_token_words:
    print(f"Note: {multi_token_words} tokenize into several pieces; only the first piece is used.")

# strip <bos>: keep the first real token id of every word as a 0-dim tensor
emo_tokens = OrderedDict((word, ids[0, 1]) for word, ids in token_ids.items())
print(emo_tokens)

{'happy': tensor([[    2, 11896]]), 'calm': tensor([[     2, 116051]]), 'content': tensor([[   2, 3312]]), 'sad': tensor([[    2, 37968]]), 'anxious': tensor([[    2,   481, 24192]]), 'lonely': tensor([[     2, 151738]]), 'bitter': tensor([[     2, 158930]]), 'angry': tensor([[    2, 70709]]), 'scared': tensor([[     2, 221959]]), 'fearful': tensor([[    2, 71339,  1329]]), 'el': tensor([[  2, 521]]), 'ated': tensor([[  2, 840]]), 'dep': tensor([[   2, 3243]]), 'ressed': tensor([[   2, 3734]])}
OrderedDict({'happy': tensor(11896), 'calm': tensor(116051), 'content': tensor(3312), 'sad': tensor(37968), 'anxious': tensor(481), 'lonely': tensor(151738), 'bitter': tensor(158930), 'angry': tensor(70709), 'scared': tensor(221959), 'fearful': tensor(71339), 'el': tensor(521), 'ated': tensor(840), 'dep': tensor(3243), 'ressed': tensor(3734)})


In [None]:
# Look up the embedding vector of every token id, keeping the word order.
emo_embs = OrderedDict(
    (word, emb_model(token_id)) for word, token_id in emo_tokens.items()
)

In [7]:
# sanity check: a vector compared with itself must have cosine similarity 1
happy_vec = emo_embs['happy']
print(f.cosine_similarity(happy_vec, happy_vec, dim=0))

tensor(1.0000, grad_fn=<SumBackward1>)


In [8]:
# Every unordered pair of words that has an embedding.
emo_pairs = [pair for pair in combinations(emo_embs.keys(), 2)]

In [None]:
# Put all pairwise cosine distances in a pandas DataFrame, then plot a heatmap.
# Only one triangle is filled (column = first word of the pair, row = second);
# the diagonal and the other triangle stay NaN.
labels = emotions + ["depressed-avg", "elated-avg"]
# float dtype so plotly receives numeric data rather than an object frame
df = DataFrame(index=labels, columns=labels, dtype=float)

# Use .loc[row, col] rather than chained df[col][row] = ...: chained assignment
# writes through a temporary and stops updating df under pandas Copy-on-Write.
# float() assumes cos_dist returns a scalar (number or 0-dim tensor) — TODO confirm against utils.
for col, row in combinations(emotions, r=2):
    df.loc[row, col] = float(cos_dist(emo_embs[col], emo_embs[row]))

# handle averaging for depressed and elated: mean distance to their two pieces
for emo in emotions:
    df.loc['depressed-avg', emo] = float(
        (cos_dist(emo_embs[emo], emo_embs['dep']) + cos_dist(emo_embs[emo], emo_embs['ressed'])) / 2
    )
    df.loc['elated-avg', emo] = float(
        (cos_dist(emo_embs[emo], emo_embs['el']) + cos_dist(emo_embs[emo], emo_embs['ated'])) / 2
    )
# depressed vs. elated: first pieces compared with each other, likewise the second pieces
df.loc['elated-avg', 'depressed-avg'] = float(
    (cos_dist(emo_embs['dep'], emo_embs['el']) + cos_dist(emo_embs['ressed'], emo_embs['ated'])) / 2
)

fig1 = px.imshow(df, title="Cosine Distance between Emotions", labels=dict(color="Cosine Distance"), text_auto=True, aspect="auto", color_continuous_scale='RdBu')
print("Note: dep-ressed and el-ated got split into two embeddings in the LLM, we are taking the average cosine distance.")
fig1.show()

In [None]:
# Put all pairwise Euclidean distances in a pandas DataFrame, then plot a heatmap.
# Only one triangle is filled (column = first word of the pair, row = second);
# the diagonal and the other triangle stay NaN.
labels = emotions + ["depressed-avg", "elated-avg"]
# float dtype so plotly receives numeric data rather than an object frame
df = DataFrame(index=labels, columns=labels, dtype=float)

# Use .loc[row, col] rather than chained df[col][row] = ...: chained assignment
# writes through a temporary and stops updating df under pandas Copy-on-Write.
# float() assumes euc returns a scalar (number or 0-dim tensor) — TODO confirm against utils.
for col, row in combinations(emotions, r=2):
    df.loc[row, col] = float(euc(emo_embs[col], emo_embs[row]))

# handle averaging for depressed and elated: mean distance to their two pieces
for emo in emotions:
    df.loc['depressed-avg', emo] = float(
        (euc(emo_embs[emo], emo_embs['dep']) + euc(emo_embs[emo], emo_embs['ressed'])) / 2
    )
    df.loc['elated-avg', emo] = float(
        (euc(emo_embs[emo], emo_embs['el']) + euc(emo_embs[emo], emo_embs['ated'])) / 2
    )
# depressed vs. elated: first pieces compared with each other, likewise the second pieces
df.loc['elated-avg', 'depressed-avg'] = float(
    (euc(emo_embs['dep'], emo_embs['el']) + euc(emo_embs['ressed'], emo_embs['ated'])) / 2
)

fig2 = px.imshow(df, title="Euclidean Distance between Emotions", labels=dict(color="Euclidean Distance"), text_auto=True, aspect="auto", color_continuous_scale='RdBu')
# Single print: the nested print(print(...)) also emitted a stray "None" line.
print("Note: dep-ressed and el-ated got split into two embeddings in the LLM, we are taking the average euclidean distance.")
fig2.show()

In [None]:
# Export both heatmaps (cosine first, Euclidean second) to one HTML report.
figures_to_html(
    [fig1, fig2],
    filename="results/03.emotion_distances.html",
)