In [24]:
import gc
import warnings
from collections import OrderedDict
from functools import partial
from itertools import combinations

import plotly.express as px
import torch.nn.functional as f
from pandas import DataFrame
from torch import cuda, cdist
from transformers import AutoModelForCausalLM, AutoTokenizer

warnings.simplefilter(action='ignore', category=Warning)

In [25]:
MODEL = "google/gemma-2-2b"
DEVICE = "cpu"

In [26]:
cuda.empty_cache()
gc.collect()
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map=DEVICE,
    torch_dtype="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [41]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
emotions = ["happy", "elated", "peaceful", "calm", "content", "relaxed", "sad", "anxious", "fearful", "scared",
            "depressed", "lonely", "bitter", "jealous", "angry", "guilty", "passionate", "brave", "confident"]

# squeeze tensor dim, strip <bos>
print("orignal shape: ", tokenizer("happy", return_tensors='pt').input_ids)
emo_tokens = OrderedDict({emo: tokenizer(emo, return_tensors='pt').input_ids for emo in emotions})

## The bug appear after removing .unsqueeze(0).unsqueeze(0), indicating maybe the same token gotten ran across different words. Maybe it's the leading 2?

print(emo_tokens)

orignal shape:  tensor([[    2, 11896]])
OrderedDict({'happy': tensor([[    2, 11896]]), 'elated': tensor([[  2, 521, 840]]), 'peaceful': tensor([[     2, 211749]]), 'calm': tensor([[     2, 116051]]), 'content': tensor([[   2, 3312]]), 'relaxed': tensor([[     2, 163861]]), 'sad': tensor([[    2, 37968]]), 'anxious': tensor([[    2,   481, 24192]]), 'fearful': tensor([[    2, 71339,  1329]]), 'scared': tensor([[     2, 221959]]), 'depressed': tensor([[   2, 3243, 3734]]), 'lonely': tensor([[     2, 151738]]), 'bitter': tensor([[     2, 158930]]), 'jealous': tensor([[    2,  1792, 22108]]), 'angry': tensor([[    2, 70709]]), 'guilty': tensor([[     2, 206971]]), 'passionate': tensor([[    2, 94364,   607]]), 'brave': tensor([[     2, 149142]]), 'confident': tensor([[     2, 131181]])})


In [37]:
# Create a table to record initial embeddings, and then output after selected layers
layers = list(range(len(model.model.layers)+1))
layer_embs = DataFrame(index=emotions, columns=layers)
print(layer_embs)
print(layer_embs[0]['happy'])

             0    1    2    3    4    5    6    7    8    9   ...   17   18  \
happy       NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
elated      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
peaceful    NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
calm        NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
content     NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
relaxed     NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
sad         NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
anxious     NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
fearful     NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
scared      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
depressed   NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
lonely      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  

In [38]:
def pandas_hook(module, input, output, word, layer_id):
    if layer_id == 0: # put in initial embeddings
        layer_embs[layer_id][word] = input[0].squeeze()
    layer_embs[layer_id+1][word] = output[0].squeeze()

def pass_word_through_model(word):
    for l in layers[:-1]:
            model.model.layers[l]._forward_hooks = OrderedDict() # clear all the old hooks first
            model.model.layers[l].register_forward_hook(partial(pandas_hook, word=word, layer_id=l))
    model(emo_tokens[word])
 # model.model separates out attention head

In [39]:
for emo in emotions:
    pass_word_through_model(emo)
print(layer_embs[0]['happy'])
print(layer_embs)

tensor([-0.5460,  2.3400, -2.8969,  ..., -0.9964,  2.3312, -0.4533],
       grad_fn=<SqueezeBackward0>)
                                                           0   \
happy       [tensor(-0.5460, grad_fn=<UnbindBackward0>), t...   
elated      [tensor(1.6869, grad_fn=<UnbindBackward0>), te...   
peaceful    [tensor(2.2595, grad_fn=<UnbindBackward0>), te...   
calm        [tensor(-1.0288, grad_fn=<UnbindBackward0>), t...   
content     [tensor(1.1354, grad_fn=<UnbindBackward0>), te...   
relaxed     [tensor(2.5811, grad_fn=<UnbindBackward0>), te...   
sad         [tensor(2.1290, grad_fn=<UnbindBackward0>), te...   
anxious     [tensor(-0.9365, grad_fn=<UnbindBackward0>), t...   
fearful     [tensor(1.5781, grad_fn=<UnbindBackward0>), te...   
scared      [tensor(-0.9843, grad_fn=<UnbindBackward0>), t...   
depressed   [tensor(-0.3469, grad_fn=<UnbindBackward0>), t...   
lonely      [tensor(-2.2068, grad_fn=<UnbindBackward0>), t...   
bitter      [tensor(0.2334, grad_fn=<UnbindBackward

In [21]:
def cos_dist(emb1, emb2):
    return 1 - f.cosine_similarity(emb1, emb2, dim=0).item()

cos_dist_figs = []

for l in layers:
    df = DataFrame(index=emotions, columns=emotions)

    for p in combinations(emotions, r=2):
        df[p[0]][p[1]]= cos_dist(layer_embs[l][p[0]], layer_embs[l][p[1]])


    fig = px.imshow(df, title="Cosine Distance between Emotions at Layer " + str(l), labels=dict(color="Cosine Distance"), text_auto=True, aspect="auto", color_continuous_scale='RdBu', range_color=[0.13, 1.26])
    cos_dist_figs.append(fig)

fig = px.imshow(df, title="Cosine Distance at the End of the Model (Colorscale Normalized)", labels=dict(color="Cosine Distance"), text_auto=True, aspect="auto", color_continuous_scale='RdBu')
cos_dist_figs.append(fig)

In [22]:
def euc(emb1, emb2):
    return cdist(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()

euc_dist_figs = []

for l in layers:
    df = DataFrame(index=emotions+["depressed-avg", "elated-avg"], columns=emotions+["depressed-avg", "elated-avg"])

    for p in combinations(emotions, r=2):
        df[p[0]][p[1]]= euc(layer_embs[l][p[0]], layer_embs[l][p[1]])

    fig = px.imshow(df, title="Euclidean Distance between Emotions at Layer " + str(l), labels=dict(color="Euclidean Distance"), text_auto=True, aspect="auto", color_continuous_scale='RdBu', range_color=[44, 925])
    euc_dist_figs.append(fig)

fig = px.imshow(df, title="Euclidean Distances at the End of the Model (Colorscale Normalized)", labels=dict(color="Euclidean Distance"), text_auto=True, aspect="auto", color_continuous_scale='RdBu')
euc_dist_figs.append(fig)

In [23]:
def figures_to_html(figs, filename="dashboard.html"):
    with open(filename, 'w', encoding='utf-8') as dashboard:
        dashboard.write("<html><head></head><body>" + "\n")
        for fig in figs:
            inner_html = fig.to_html().split('<body>')[1].split('</body>')[0]
            dashboard.write(inner_html)
        dashboard.write("</body></html>" + "\n")

figures_to_html(cos_dist_figs + euc_dist_figs, filename="results/06.layer_distances.html")