In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import cuda, cdist
from collections import OrderedDict
import gc
from functools import partial
from pandas import DataFrame
import warnings
import torch.nn.functional as f
from itertools import combinations
import plotly.express as px
warnings.simplefilter(action='ignore')

In [8]:
MODEL = "google/gemma-2-2b"
DEVICE = "cpu"

In [9]:
cuda.empty_cache()
gc.collect()
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map=DEVICE,
    torch_dtype="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [17]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
emotions = ["happy", "calm", "sad", "anxious",  "angry", "scared", "fearful", "el", "ated", "dep", "ressed"]

# squeeze tensor dim, strip <bos>
print("orignal shape: ", tokenizer("happy", return_tensors='pt').input_ids)
emo_tokens = OrderedDict({emo: tokenizer(emo, return_tensors='pt').input_ids.squeeze()[1].unsqueeze(0).unsqueeze(0) for emo in emotions})
print(emo_tokens)

orignal shape:  tensor([[    2, 11896]])
OrderedDict({'happy': tensor([[11896]]), 'calm': tensor([[116051]]), 'sad': tensor([[37968]]), 'anxious': tensor([[481]]), 'lonely': tensor([[151738]]), 'bitter': tensor([[158930]]), 'depressed': tensor([[3243]]), 'angry': tensor([[70709]]), 'scared': tensor([[221959]]), 'fearful': tensor([[71339]]), 'el': tensor([[521]]), 'ated': tensor([[840]]), 'dep': tensor([[3243]]), 'ressed': tensor([[3734]])})


In [11]:
# Create a table to record initial embeddings, and then output after selected layers
layers = list(range(27))
layer_embs = DataFrame(index=emotions, columns=layers)
print(layer_embs)
print(layer_embs[0]['happy'])

          0    1    2    3    4    5    6    7    8    9   ...   17   18   19  \
happy    NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
calm     NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
sad      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
anxious  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
angry    NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
scared   NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
fearful  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
el       NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
ated     NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
dep      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
ressed   NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   

          20   21   22   23

In [12]:
def pandas_hook(module, input, output, word, layer_id):
    if layer_id == 0: # put in initial embeddings
        layer_embs[0][word] = input[0].squeeze()
    layer_embs[layer_id+1][word] = output[0].squeeze()

def pass_word_through_model(word):
    for l in layers[:-1]:
            model.model.layers[l]._forward_hooks = OrderedDict() # clear all the old hooks first
            model.model.layers[l].register_forward_hook(partial(pandas_hook, word=word, layer_id=l))
    model(emo_tokens[word])
 # model.model separates out attention head

In [13]:
for emo in emotions:
    pass_word_through_model(emo)

                                                        0   \
happy    [tensor(-0.5460, grad_fn=<UnbindBackward0>), t...   
calm     [tensor(-1.0288, grad_fn=<UnbindBackward0>), t...   
sad      [tensor(2.1290, grad_fn=<UnbindBackward0>), te...   
anxious  [tensor(-0.9365, grad_fn=<UnbindBackward0>), t...   
angry    [tensor(2.2207, grad_fn=<UnbindBackward0>), te...   
scared   [tensor(-0.9843, grad_fn=<UnbindBackward0>), t...   
fearful  [tensor(1.5781, grad_fn=<UnbindBackward0>), te...   
el       [tensor(1.6869, grad_fn=<UnbindBackward0>), te...   
ated     [tensor(1.6462, grad_fn=<UnbindBackward0>), te...   
dep      [tensor(-0.3469, grad_fn=<UnbindBackward0>), t...   
ressed   [tensor(-0.9824, grad_fn=<UnbindBackward0>), t...   

                                                        1   \
happy    [tensor(-0.9465, grad_fn=<UnbindBackward0>), t...   
calm     [tensor(-1.3839, grad_fn=<UnbindBackward0>), t...   
sad      [tensor(1.6476, grad_fn=<UnbindBackward0>), te...   
anxious

In [None]:
def figures_to_html(figs, filename="dashboard.html"):
    with open(filename, 'w', encoding='utf-8') as dashboard:
        dashboard.write("<html><head></head><body>" + "\n")
        for fig in figs:
            inner_html = fig.to_html().split('<body>')[1].split('</body>')[0]
            dashboard.write(inner_html)
        dashboard.write("</body></html>" + "\n")

def cos_dist(emb1, emb2):
    return 1 - f.cosine_similarity(emb1, emb2, dim=0).item()

def euc(emb1, emb2):
    return cdist(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()

In [None]:
cos_dist_figs = []

for l in layers:
    df = DataFrame(index=emotions+["depressed-avg", "elated-avg"], columns=emotions+["depressed-avg", "elated-avg"])

    for p in combinations(emotions, r=2):
        df[p[0]][p[1]]= cos_dist(layer_embs[l][p[0]], layer_embs[l][p[1]])

    for emo in emotions:
        df[emo]['depressed-avg'] = (cos_dist(layer_embs[l][emo], layer_embs[l]['dep']) + cos_dist(layer_embs[l][emo], layer_embs[l]['ressed'])) / 2
        df[emo]['elated-avg'] = (cos_dist(layer_embs[l][emo], layer_embs[l]['el']) + cos_dist(layer_embs[l][emo], layer_embs[l]['ated'])) / 2
        df['depressed-avg']['elated-avg'] = (cos_dist(layer_embs[l]['dep'], layer_embs[l]['el']) + cos_dist(layer_embs[l]['ressed'], layer_embs[l]['ated'])) / 2

    fig = px.imshow(df, title="Cosine Distance between Emotions at Layer " + str(l), labels=dict(color="Cosine Distance"), text_auto=True, aspect="auto", color_continuous_scale='RdBu', range_color=[0.13, 1.26])
    cos_dist_figs.append(fig)

fig = px.imshow(df, title="Cosine Distance at the End of the Model (Colorscale Normalized)", labels=dict(color="Cosine Distance"), text_auto=True, aspect="auto", color_continuous_scale='RdBu')
cos_dist_figs.append(fig)

In [172]:
euc_dist_figs = []

for l in layers:
    df = DataFrame(index=emotions+["depressed-avg", "elated-avg"], columns=emotions+["depressed-avg", "elated-avg"])

    for p in combinations(emotions, r=2):
        df[p[0]][p[1]]= euc(layer_embs[l][p[0]], layer_embs[l][p[1]])

    for emo in emotions:
        df[emo]['depressed-avg'] = (euc(layer_embs[l][emo], layer_embs[l]['dep']) + euc(layer_embs[l][emo], layer_embs[l]['ressed'])) / 2
        df[emo]['elated-avg'] = (euc(layer_embs[l][emo], layer_embs[l]['el']) + euc(layer_embs[l][emo], layer_embs[l]['ated'])) / 2
        df['depressed-avg']['elated-avg'] = (euc(layer_embs[l]['dep'], layer_embs[l]['el']) + euc(layer_embs[l]['ressed'], layer_embs[l]['ated'])) / 2

    fig = px.imshow(df, title="Euclidean Distance between Emotions at Layer " + str(l), labels=dict(color="Euclidean Distance"), text_auto=True, aspect="auto", color_continuous_scale='RdBu', range_color=[44, 925])
    euc_dist_figs.append(fig)

fig = px.imshow(df, title="Euclidean Distances at the End of the Model (Colorscale Normalized)", labels=dict(color="Euclidean Distance"), text_auto=True, aspect="auto", color_continuous_scale='RdBu')
euc_dist_figs.append(fig)

In [173]:
figures_to_html(cos_dist_figs + euc_dist_figs, filename="results/06.layer_distances.html")