In [1]:
from torch import load, nn, cdist, stack
import torch.nn.functional as f
from transformers import AutoTokenizer
from pandas import DataFrame
import plotly.express as px
from collections import OrderedDict
from itertools import combinations
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Hugging Face model id whose tokenizer and input embeddings are analysed.
MODEL = "google/gemma-2b"
# Local file holding the extracted embedding weights. The name is derived from
# MODEL (with "/" replaced by "_") so the two settings cannot drift apart;
# for "google/gemma-2b" this evaluates to "embeddings_google_gemma-2b.pth".
EMBEDDING_FILE = f"embeddings_{MODEL.replace('/', '_')}.pth"

In [3]:
# Load the pretrained tokenizer that belongs to MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Load the local embedding file.
# - map_location="cpu": the rest of this notebook never moves tensors to another
#   device, so the weights are forced onto the CPU even if the file was saved
#   from a GPU model (otherwise loading can fail on a machine without that GPU).
# - weights_only=True: restricts unpickling to tensors and plain containers, so
#   a tampered .pth file cannot execute arbitrary code while being loaded.
saved_embeddings = load(EMBEDDING_FILE, map_location="cpu", weights_only=True)
if 'weight' not in saved_embeddings:
    raise KeyError("The saved embeddings file does not contain 'weight' key.")
# Embedding matrix; its size is later unpacked as (vocab_size, embedding_dim).
embeddings_tensor = saved_embeddings['weight']

# Create an embedding only model object
class EmbeddingModel(nn.Module):
    """Minimal module that wraps a single ``nn.Embedding`` lookup table.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: length of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # The table is exposed as ``self.embedding`` so its weights can be
        # replaced from outside after construction.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, input_ids):
        """Return the embedding vector(s) selected by ``input_ids``."""
        lookup = self.embedding
        return lookup(input_ids)

# Build an embedding-only model whose table has the same shape as the
# extracted tensor; the tensor's size is unpacked as (vocab_size, embedding_dim).
emb_model = EmbeddingModel(*embeddings_tensor.shape)

# Replace the freshly initialised table with the extracted embedding weights.
emb_model.embedding.weight.data = embeddings_tensor
# Switch to evaluation mode; as the last expression this also displays the module.
emb_model.eval()

EmbeddingModel(
  (embedding): Embedding(256000, 2048)
)

In [4]:
# Words to compare. "elated" and "depressed" are entered as their two pieces
# ("el"/"ated", "dep"/"ressed") and averaged again when the matrices are built.
emotions = ["happy", "calm", "sad", "anxious",  "angry", "scared", "fearful", "el", "ated", "dep", "ressed"]

# Map every word to the id of its vocabulary token (a 0-dim tensor).
emo_tokens = OrderedDict()
for emo in emotions:
    # add_special_tokens=False keeps <bos> out of the result, so the word's own
    # token is at position 0 and no positional stripping is needed.
    ids = tokenizer(emo, return_tensors='pt', add_special_tokens=False).input_ids.squeeze(0)
    if ids.numel() == 0:
        raise ValueError(f"Tokenizer produced no tokens for {emo!r}.")
    if ids.numel() > 1:
        # Only the first piece is kept, so make the truncation visible instead
        # of silently comparing a fragment of the word.
        warnings.warn(
            f"{emo!r} is split into {ids.numel()} tokens; only the first one is used."
        )
    emo_tokens[emo] = ids[0]

In [5]:
# Look up the embedding vector of every token id, keeping the word order.
emo_embs = OrderedDict(
    (word, emb_model(token_id)) for word, token_id in emo_tokens.items()
)

In [6]:
# sanity check: the cosine similarity of a vector with itself is printed
happy_emb = emo_embs['happy']
self_similarity = f.cosine_similarity(happy_emb, happy_emb, dim=0)
print(self_similarity)

tensor(1.0000, grad_fn=<SumBackward1>)


In [7]:
# Every unordered pair of words (keys of emo_embs), in insertion order.
emo_pairs = [pair for pair in combinations(emo_embs.keys(), 2)]

In [10]:
# put all the numbers in pandas df then plot
def cos(emb1, emb2):
    """Return the cosine similarity of two embeddings as a Python float.

    The similarity is reduced along dim 0, and the result must hold exactly
    one element for ``.item()`` to succeed.
    """
    similarity = f.cosine_similarity(emb1, emb2, dim=0)
    return similarity.item()

# Axis labels: every token plus one synthetic entry per word that was split in two.
labels = emotions + ["depressed-avg", "elated-avg"]
# dtype=float keeps the frame numeric; cells that are never written stay NaN.
df = DataFrame(index=labels, columns=labels, dtype=float)

# Fill one triangle of the matrix: column = first word of the pair, row = second.
# A single .loc[row, col] write is used because chained `df[col][row] = value`
# assigns into a temporary under pandas copy-on-write and leaves df unchanged.
for col, row in combinations(emotions, r=2):
    df.loc[row, col] = cos(emo_embs[col], emo_embs[row])

# handle averaging for depressed and elated: mean similarity to both pieces
for emo in emotions:
    df.loc['depressed-avg', emo] = (cos(emo_embs[emo], emo_embs['dep']) + cos(emo_embs[emo], emo_embs['ressed'])) / 2
    df.loc['elated-avg', emo] = (cos(emo_embs[emo], emo_embs['el']) + cos(emo_embs[emo], emo_embs['ated'])) / 2
# depressed vs. elated: mean of first-piece/first-piece and second-piece/second-piece
df.loc['elated-avg', 'depressed-avg'] = (cos(emo_embs['dep'], emo_embs['el']) + cos(emo_embs['ressed'], emo_embs['ated'])) / 2

fig = px.imshow(df, title="Cosine Similarity between Emotions")
# The plotted values are similarities, so the note says "similarity" rather than "distance".
print("Note: dep-ressed and el-ated got split into two embeddings in the LLM, we are taking the average cosine similarity.")
fig.show()
fig.write_html("cos_dist_vis.html")

Note: dep-ressed and el-ated got split into two embeddings in the LLM, we are taking the average cosine distance.


In [11]:
# put all the numbers in pandas df then plot
def euc(emb1, emb2):
    """Return the Euclidean distance between two embeddings as a Python float.

    Each input gets a leading dimension of size 1 so ``cdist`` sees two
    one-row matrices; the single pairwise distance is then extracted.
    """
    row1 = emb1.unsqueeze(0)
    row2 = emb2.unsqueeze(0)
    pairwise = cdist(row1, row2)
    return pairwise.item()

# Axis labels: every token plus one synthetic entry per word that was split in two.
labels = emotions + ["depressed-avg", "elated-avg"]
# dtype=float keeps the frame numeric; cells that are never written stay NaN.
df = DataFrame(index=labels, columns=labels, dtype=float)

# Fill one triangle of the matrix: column = first word of the pair, row = second.
# A single .loc[row, col] write is used because chained `df[col][row] = value`
# assigns into a temporary under pandas copy-on-write and leaves df unchanged.
for col, row in combinations(emotions, r=2):
    df.loc[row, col] = euc(emo_embs[col], emo_embs[row])

# handle averaging for depressed and elated: mean distance to both pieces
for emo in emotions:
    df.loc['depressed-avg', emo] = (euc(emo_embs[emo], emo_embs['dep']) + euc(emo_embs[emo], emo_embs['ressed'])) / 2
    df.loc['elated-avg', emo] = (euc(emo_embs[emo], emo_embs['el']) + euc(emo_embs[emo], emo_embs['ated'])) / 2
# depressed vs. elated: mean of first-piece/first-piece and second-piece/second-piece
df.loc['elated-avg', 'depressed-avg'] = (euc(emo_embs['dep'], emo_embs['el']) + euc(emo_embs['ressed'], emo_embs['ated'])) / 2

fig = px.imshow(df, title="Euclidean Distance between Emotions")
# Printed once: the former nested print() also emitted a stray "None".
print("Note: dep-ressed and el-ated got split into two embeddings in the LLM, we are taking the average euclidean distance.")
fig.show()
fig.write_html("euc_dist_vis.html")

Note: dep-ressed and el-ated got split into two embeddings in the LLM, we are taking the euclidean distance.
None
