#### NOTE: this notebook is the first iteration of human-model pairwise RSA. 
A newer, improved version of this notebook can be found in notebooks/human_model_rsa_v2.ipynb:
- Changed model from GPT-2 to Qwen 
- Replace human similarity matrix with machine-generated data
- Replace mean pooling with last token in subtoken sequence


In [1]:
# pair_similarities
# key - (word_1: str, word_2: str), with everything from the first '#' onward stripped
# val - similarity_score: float (score exactly as read from the file)
pair_similarities = {}

with open('WordSim_sense.txt', 'r') as f:
  for line in f:
    fields = line.split()
    if not fields:
      # Blank lines (e.g. a trailing empty line) carry no pair; skipping them
      # avoids a ValueError from the three-way unpack below
      continue
    pair_1, pair_2, similarity_score = fields
    pair_similarities[(pair_1.split('#', 1)[0], pair_2.split('#', 1)[0])] = float(similarity_score)

print(pair_similarities)

{('glass', 'metal'): 5.56, ('bird', 'crane'): 7.38, ('coast', 'shore'): 9.1, ('championship', 'tournament'): 8.36, ('football', 'soccer'): 9.03, ('planet', 'moon'): 8.08, ('planet', 'star'): 8.45, ('professor', 'doctor'): 6.62, ('medium', 'radio'): 7.42, ('precedent', 'antecedent'): 6.04, ('murder', 'manslaughter'): 8.53, ('bread', 'butter'): 6.19, ('shower', 'thunderstorm'): 6.31, ('dollar', 'buck'): 9.22, ('king', 'queen'): 8.58, ('lobster', 'food'): 7.81, ('Harvard', 'Yale'): 8.13, ('precedent', 'example'): 5.85, ('car', 'automobile'): 8.94, ('tiger', 'feline'): 8.0, ('cup', 'object'): 3.69, ('century', 'year'): 7.59, ('furnace', 'stove'): 8.79, ('tiger', 'fauna'): 5.62, ('tiger', 'jaguar'): 8.0, ('tiger', 'mammal'): 6.85, ('opera', 'performance'): 6.88, ('money', 'cash'): 9.15, ('vodka', 'gin'): 8.46, ('seafood', 'food'): 8.34, ('cell', 'phone'): 7.81, ('boy', 'lad'): 8.83, ('marathon', 'sprint'): 7.47, ('train', 'car'): 6.31, ('skin', 'eye'): 6.22, ('street', 'avenue'): 8.88, ('st

In [2]:
# n_pair_similarities
# same keys as pair_similarities; every score is divided by 10 and rounded
# to 3 decimals (lands in [0,1] assuming the raw scores lie in [0,10])
n_pair_similarities = dict(
  (pair, round(raw_score / 10, 3))
  for pair, raw_score in pair_similarities.items()
)
print(n_pair_similarities)

{('glass', 'metal'): 0.556, ('bird', 'crane'): 0.738, ('coast', 'shore'): 0.91, ('championship', 'tournament'): 0.836, ('football', 'soccer'): 0.903, ('planet', 'moon'): 0.808, ('planet', 'star'): 0.845, ('professor', 'doctor'): 0.662, ('medium', 'radio'): 0.742, ('precedent', 'antecedent'): 0.604, ('murder', 'manslaughter'): 0.853, ('bread', 'butter'): 0.619, ('shower', 'thunderstorm'): 0.631, ('dollar', 'buck'): 0.922, ('king', 'queen'): 0.858, ('lobster', 'food'): 0.781, ('Harvard', 'Yale'): 0.813, ('precedent', 'example'): 0.585, ('car', 'automobile'): 0.894, ('tiger', 'feline'): 0.8, ('cup', 'object'): 0.369, ('century', 'year'): 0.759, ('furnace', 'stove'): 0.879, ('tiger', 'fauna'): 0.562, ('tiger', 'jaguar'): 0.8, ('tiger', 'mammal'): 0.685, ('opera', 'performance'): 0.688, ('money', 'cash'): 0.915, ('vodka', 'gin'): 0.846, ('seafood', 'food'): 0.834, ('cell', 'phone'): 0.781, ('boy', 'lad'): 0.883, ('marathon', 'sprint'): 0.747, ('train', 'car'): 0.631, ('skin', 'eye'): 0.622,

In [3]:
import json

def encode_dict_to_json(data: dict, filename: str) -> None:
  """
  Write a dict keyed by 2-tuples of strings to `filename` as JSON.

  JSON object keys must be strings, so each key (k1, k2) is flattened
  into the single string "k1|k2"; values are written unchanged.
  A confirmation line is printed once the data has been dumped.
  """
  flattened = {}
  for (first, second), value in data.items():
    flattened[first + '|' + second] = value
  with open(filename, 'w') as f:
    json.dump(flattened, f, indent=4)
    print("Dictionary saved as ", filename)

def decode_json_to_dict(filename: str) -> dict:
  """
  Load a JSON object from `filename` and rebuild tuple keys.

  Every key is split on '|' and the pieces become a tuple
  (e.g. "k1|k2" -> ("k1", "k2")); values are returned as loaded.
  """
  with open(filename, 'r') as f:
    raw = json.load(f)
  restored = {}
  for joined_key, val in raw.items():
    restored[tuple(joined_key.split('|'))] = val
  return restored

# Round-trip check: save the normalized scores, then load the same file
# back and print the decoded dict (the file name carries no extension)
encode_dict_to_json(data=n_pair_similarities, filename='n_pair_similarities')
print(decode_json_to_dict(filename="n_pair_similarities"))

Dictionary saved as  n_pair_similarities
{('glass', 'metal'): 0.556, ('bird', 'crane'): 0.738, ('coast', 'shore'): 0.91, ('championship', 'tournament'): 0.836, ('football', 'soccer'): 0.903, ('planet', 'moon'): 0.808, ('planet', 'star'): 0.845, ('professor', 'doctor'): 0.662, ('medium', 'radio'): 0.742, ('precedent', 'antecedent'): 0.604, ('murder', 'manslaughter'): 0.853, ('bread', 'butter'): 0.619, ('shower', 'thunderstorm'): 0.631, ('dollar', 'buck'): 0.922, ('king', 'queen'): 0.858, ('lobster', 'food'): 0.781, ('Harvard', 'Yale'): 0.813, ('precedent', 'example'): 0.585, ('car', 'automobile'): 0.894, ('tiger', 'feline'): 0.8, ('cup', 'object'): 0.369, ('century', 'year'): 0.759, ('furnace', 'stove'): 0.879, ('tiger', 'fauna'): 0.562, ('tiger', 'jaguar'): 0.8, ('tiger', 'mammal'): 0.685, ('opera', 'performance'): 0.688, ('money', 'cash'): 0.915, ('vodka', 'gin'): 0.846, ('seafood', 'food'): 0.834, ('cell', 'phone'): 0.781, ('boy', 'lad'): 0.883, ('marathon', 'sprint'): 0.747, ('train

### Create an ordered vector/list of human similarity values

We will use this ordered vector to create a matching ordered vector of machine activation similarities, and then compare the two equal-sized vectors at each model component.

Because this data is sparse (not every word is paired with every other word), we can't use a matrix anymore. So let's use an ordered vector/list instead. Save the (ordered) mapping of [word pair tuple] to [index in the list of similarities] so that we can use the same tuple to create the corresponding vector/list of machine similarities

In [4]:
# Word pairs in lexicographic order; this order fixes the position of
# every entry in the similarity vectors built from it
ordered_pairs = list(n_pair_similarities)
ordered_pairs.sort()

# Human score for each pair, and each pair's index, both in that order
human_sim_vec = list(map(n_pair_similarities.get, ordered_pairs))
ovector_mapping = dict(zip(ordered_pairs, range(len(ordered_pairs))))

print("Length:", len(human_sim_vec), "pairs")
print("Human similarity vector:", human_sim_vec)
print("Mapping:                ", ovector_mapping)

Length: 97 pairs
Human similarity vector: [0.25, 0.813, 0.65, 0.744, 0.783, 0.756, 0.887, 0.71, 0.738, 0.669, 0.883, 0.619, 0.844, 0.894, 0.781, 0.759, 0.836, 0.91, 0.592, 0.292, 0.215, 0.369, 0.685, 0.763, 0.7, 0.5, 0.922, 0.778, 0.752, 0.442, 0.681, 0.903, 0.663, 0.944, 0.879, 0.896, 0.556, 0.463, 0.742, 0.929, 0.858, 0.592, 0.788, 0.789, 0.781, 0.57, 0.902, 0.525, 0.83, 0.747, 0.742, 0.929, 0.866, 0.915, 0.904, 0.842, 0.853, 0.719, 0.688, 0.713, 0.735, 0.577, 0.808, 0.845, 0.802, 0.604, 0.585, 0.662, 0.763, 0.558, 0.808, 0.671, 0.759, 0.834, 0.87, 0.631, 0.622, 0.888, 0.688, 0.644, 0.681, 0.677, 0.7, 0.708, 0.735, 0.562, 0.8, 0.8, 0.685, 0.477, 1.0, 0.631, 0.5, 0.897, 0.813, 0.846, 0.773]
Mapping:                 {('Arafat', 'Jackson'): 0, ('Harvard', 'Yale'): 1, ('Japanese', 'American'): 2, ('Mexico', 'Brazil'): 3, ('aluminum', 'metal'): 4, ('announcement', 'news'): 5, ('asylum', 'madhouse'): 6, ('bird', 'cock'): 7, ('bird', 'crane'): 8, ('bishop', 'rabbi'): 9, ('boy', 'lad'): 10, 

In [5]:
# Install third-party packages into the kernel's environment
# NOTE(review): none of these installs is version-pinned — consider pinning for reproducibility
%pip install rsatoolbox
%pip install transformer_lens
%pip install circuitsvis
#%pip install numpy==1.26.4
# Install a faster Node version
# NOTE(review): the NodeSource installer output reports Node.js 16.x as no longer
# supported and recommends a newer setup script (e.g. setup_20.x) — consider migrating
# NOTE(review): this pipes a downloaded script into bash under sudo and uses apt-get,
# so it presumably targets a Debian/Ubuntu-style host — confirm before running elsewhere
!curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash -; sudo apt-get install -y nodejs  # noqa

# Import utils

import circuitsvis as cv
import torch
import torch.nn as nn
import einops
from fancy_einsum import einsum
import tqdm.auto as tqdm
import pandas as pd
import plotly.express as px

from jaxtyping import Float
from functools import partial

# Import transformer_lens

import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookPoint,
)  # hooking utilities
from transformer_lens import HookedTransformer, FactoredMatrix

# Only inference happens in this notebook, so autograd is switched off
# globally to save memory
# TODO: turn gradients back on before any training / fine-tuning / intervention work
torch.set_grad_enabled(mode=False)

# Plotting helper functions

def imshow(tensor, renderer=None, xaxis="", yaxis="", **kwargs):
    """Show `tensor` as a heatmap on the "RdBu" colour scale with midpoint 0.0.

    `xaxis` / `yaxis` are the axis labels, `renderer` is handed to `.show()`,
    and any extra keyword arguments are forwarded to `px.imshow`.
    """
    figure = px.imshow(
        utils.to_numpy(tensor),
        color_continuous_midpoint=0.0,
        color_continuous_scale="RdBu",
        labels={"x": xaxis, "y": yaxis},
        **kwargs,
    )
    figure.show(renderer)

def line(tensor, renderer=None, xaxis="", yaxis="", **kwargs):
    """Show `tensor` as a line plot.

    `xaxis` / `yaxis` are the axis labels, `renderer` is handed to `.show()`,
    and any extra keyword arguments are forwarded to `px.line`.
    """
    axis_labels = {"x": xaxis, "y": yaxis}
    figure = px.line(utils.to_numpy(tensor), labels=axis_labels, **kwargs)
    figure.show(renderer)

def scatter(x, y, xaxis="", yaxis="", caxis="", renderer=None, **kwargs):
    """Show a scatter plot of `y` against `x`.

    Both inputs are converted with `utils.to_numpy` first. `xaxis`, `yaxis`
    and `caxis` label the x, y and colour axes; `renderer` is handed to
    `.show()`; extra keyword arguments are forwarded to `px.scatter`.
    """
    x_values = utils.to_numpy(x)
    y_values = utils.to_numpy(y)
    axis_labels = {"x": xaxis, "y": yaxis, "color": caxis}
    figure = px.scatter(y=y_values, x=x_values, labels=axis_labels, **kwargs)
    figure.show(renderer)

# Model setup

# Pick the device via transformer_lens' helper and load the pretrained
# "gpt2-small" model onto it; later cells use `model` for all forward passes
device = utils.get_device()
model = HookedTransformer.from_pretrained("gpt2-small", device=device)

# Cache all activations (do less to save memory)

# Sanity-check run on a single example sentence: tokenize it, print the
# device the tokens are on, then run the model while caching activations
gpt2_text = "What is the difference between a pet fish and a fish?"
gpt2_tokens = model.to_tokens(gpt2_text)
print(gpt2_tokens.device)
# NOTE(review): gpt2_logits / gpt2_cache are not read again in the visible cells
gpt2_logits, gpt2_cache = model.run_with_cache(gpt2_tokens, remove_batch_dim=True)

Collecting circuitsvis
  Using cached circuitsvis-1.43.3-py3-none-any.whl.metadata (983 bytes)
Using cached circuitsvis-1.43.3-py3-none-any.whl (1.8 MB)
Installing collected packages: circuitsvis
Successfully installed circuitsvis-1.43.3



    [1m[4m Node.js 16.x is no longer actively supported![m

  [1mYou will not receive security or critical stability updates[m for this version.

  You should migrate to a supported version of Node.js as soon as possible.
  Use the installation script that corresponds to the version of Node.js you
  wish to install. e.g.
  
   * [31mhttps://deb.nodesource.com/setup_16.x — Node.js 16 "Gallium" [1m(deprecated)[m
   * [32mhttps://deb.nodesource.com/setup_18.x — Node.js 18 "Hydrogen" (Maintenance)[m
   * [31mhttps://deb.nodesource.com/setup_19.x — Node.js 19 "Nineteen" [1m(deprecated)[m
   * [1m[32mhttps://deb.nodesource.com/setup_20.x — Node.js 20 LTS "Iron" (recommended)[m
   * [32mhttps://deb.nodesource.com/setup_21.x — Node.js 21 "I

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loaded pretrained model gpt2-small into HookedTransformer
cpu


### Run inputs through GPT-2 small, cache activations, run cosine similarity between each layer's activations for each input

In [6]:
# normalized human similarity values in the same order as word pairs
# (an alias of human_sim_vec: the same list object, not a copy)
human_similarities = human_sim_vec
print("Ordered vector:", human_similarities)
# Report the actual number of pairs instead of a hard-coded 97 so the
# message stays correct if the dataset changes
print("Has", len(human_similarities), "values; corresponding to", len(ovector_mapping), "pairs")

Ordered vector: [0.25, 0.813, 0.65, 0.744, 0.783, 0.756, 0.887, 0.71, 0.738, 0.669, 0.883, 0.619, 0.844, 0.894, 0.781, 0.759, 0.836, 0.91, 0.592, 0.292, 0.215, 0.369, 0.685, 0.763, 0.7, 0.5, 0.922, 0.778, 0.752, 0.442, 0.681, 0.903, 0.663, 0.944, 0.879, 0.896, 0.556, 0.463, 0.742, 0.929, 0.858, 0.592, 0.788, 0.789, 0.781, 0.57, 0.902, 0.525, 0.83, 0.747, 0.742, 0.929, 0.866, 0.915, 0.904, 0.842, 0.853, 0.719, 0.688, 0.713, 0.735, 0.577, 0.808, 0.845, 0.802, 0.604, 0.585, 0.662, 0.763, 0.558, 0.808, 0.671, 0.759, 0.834, 0.87, 0.631, 0.622, 0.888, 0.688, 0.644, 0.681, 0.677, 0.7, 0.708, 0.735, 0.562, 0.8, 0.8, 0.685, 0.477, 1.0, 0.631, 0.5, 0.897, 0.813, 0.846, 0.773]
Has 97 values; corresponding to 97 pairs


In [7]:
# Every distinct word that appears in at least one pair, sorted
items = sorted({w for pair in ovector_mapping for w in pair})
print("All unique items:", items)
# The upper bound is derived from the real pair count rather than a hard-coded 97
print("Has", len(items), f"items; < {len(ovector_mapping)} * 2 since some duplicate items")

All unique items: ['American', 'Arafat', 'Brazil', 'Harvard', 'Jackson', 'Japanese', 'Mexico', 'Yale', 'activity', 'aluminum', 'animal', 'announcement', 'antecedent', 'artifact', 'asylum', 'automobile', 'avenue', 'basketball', 'bird', 'bishop', 'block', 'boy', 'brandy', 'bread', 'buck', 'butter', 'calculation', 'car', 'carnivore', 'cash', 'cat', 'cell', 'century', 'championship', 'chemistry', 'coast', 'cock', 'computation', 'crane', 'cucumber', 'cup', 'currency', 'death', 'discipline', 'dividend', 'doctor', 'dollar', 'entity', 'equipment', 'example', 'eye', 'fauna', 'feline', 'food', 'football', 'forest', 'fruit', 'fuck', 'furnace', 'gem', 'gin', 'glass', 'governor', 'hospital', 'infrastructure', 'jaguar', 'jazz', 'jewel', 'journey', 'kilometer', 'kind', 'king', 'lad', 'life', 'liquid', 'lobster', 'loss', 'madhouse', 'magician', 'mammal', 'man', 'manslaughter', 'marathon', 'medium', 'metal', 'midday', 'mile', 'money', 'moon', 'murder', 'museum', 'news', 'noon', 'nurse', 'object', 'oper

The goal is to run each unique word through the model, cache all activations, and then use your dictionary/mapping to compute a "machine similarity score" for EACH layer/component (each hook, basically), ONLY for the pairs of words that have human similarity data.

#### Cache all model activations at each input

In [8]:
# Run every word through the model once and keep only its activation
# cache; logits aren't used for RSA, so they are dropped to reduce memory usage
activation_maps = {}
for item in items:
  # each word is tokenized with a leading space and without a BOS token
  token_ids = model.to_tokens(" " + item, prepend_bos=False)
  _logits, item_cache = model.run_with_cache(token_ids, remove_batch_dim=True)
  activation_maps[item] = item_cache

In [9]:
# DEBUG: for the first three cached words, print the word, the length of
# row 0 of 'blocks.0.hook_resid_pre', and that row's first 15 values

for word, cache in list(activation_maps.items())[:3]:
  first_row = cache['blocks.0.hook_resid_pre'][0]
  print(word, len(first_row), first_row[:15])

American 768 tensor([ 0.0945, -0.1652,  0.1330, -0.0108,  0.1625, -0.1313, -0.1830, -0.3221,
         0.0916,  0.0322, -0.1395,  0.0370, -0.1284,  0.0327,  0.0552])
Arafat 768 tensor([-0.1354, -0.4119,  0.1131, -0.2338, -0.0324, -0.0400, -0.1873, -0.4047,
        -0.1659,  0.1896,  0.0565,  0.0560, -0.0835,  0.0892,  0.0402])
Brazil 768 tensor([ 0.0135, -0.1880,  0.1372, -0.1603, -0.0420, -0.0426, -0.1849, -0.2772,
        -0.1402, -0.1242, -0.1159,  0.0185,  0.1759,  0.2225, -0.0959])


In [10]:
def get_vector_from_cache(cache, name, layer):
  """
  Return one mean-pooled activation from a word's cache as a NumPy array,
  ready for cosine similarity analysis.

  cache: object indexable as cache[name, layer] that yields a tensor
         (here: the activations cached for a single word)
  name:  hook name, e.g. "resid_pre", "attn_out", "mlp_out", "resid_post"
  layer: layer index looked up together with name

  The activation is averaged over dim 0 (presumably the token positions of
  the word - confirm), then detached, cast with .float(), moved to CPU and
  converted with .numpy().
  """
  activation = cache[name, layer]
  pooled = activation.mean(dim=0)  # <- mean pool
  pooled = pooled.detach()
  pooled = pooled.float()
  return pooled.cpu().numpy()

In [11]:
# Hooks of interest: every (hook name, layer) combination, grouped by hook
# name in the order listed and by ascending layer within each group
hooks = [
    (hook_name, layer_idx)
    for hook_name in ("resid_pre", "attn_out", "mlp_out", "resid_post")
    for layer_idx in range(model.cfg.n_layers)
]

# word_to_hook_vecs is a dictionary of dictionaries:
# every word in activation_maps is a key, and its value is a dictionary
# keyed by "<hook name>@<layer>" holding the (mean-pooled) vector that
# get_vector_from_cache returns for that hook and layer
#
# word_to_hook_vecs[word] = {
#    'resid_pre@0'  :   array,
#    'resid_pre@1'  :   array,
#         ...
#    one entry per (name, layer) in hooks
# }

word_to_hook_vecs = {
  word: {
    f"{name}@{layer}": get_vector_from_cache(cache, name, layer)
    for name, layer in hooks
  }
  for word, cache in activation_maps.items()
}

In [12]:
# DEBUG: for the first three words, print each hook vector's first five
# values followed by the hook tag and the vector's shape

for word, activations in list(word_to_hook_vecs.items())[:3]:
  print(word)
  for hook in activations:
    vec = activations[hook]
    print(vec[:5])
    print(f"{hook}: {vec.shape}")

American
[ 0.0944553  -0.16521151  0.13300577 -0.01079076  0.16253051]
resid_pre@0: (768,)
[-0.594525   -0.33607268  1.1510376   0.2532938  -0.7124848 ]
resid_pre@1: (768,)
[-1.0983658 -1.127925   1.2961426 -0.9153471 -0.8540274]
resid_pre@2: (768,)
[-4.077264  -4.4940224 -2.1538067 -3.855942  -3.6578398]
resid_pre@3: (768,)
[-4.3700905 -4.806858  -2.3877115 -4.170375  -3.7934341]
resid_pre@4: (768,)
[-4.7689824 -5.122589  -2.704484  -4.4823017 -3.9399948]
resid_pre@5: (768,)
[-4.9054565 -5.32148   -2.8041227 -4.6859784 -3.8626852]
resid_pre@6: (768,)
[-5.0885167 -5.392672  -2.8365948 -4.887884  -3.7888205]
resid_pre@7: (768,)
[-5.1851473 -5.3358264 -3.0032535 -5.072492  -3.7355182]
resid_pre@8: (768,)
[-5.185441  -5.3038073 -3.0961907 -5.2147365 -3.7984648]
resid_pre@9: (768,)
[-5.2868166 -5.23069   -3.3410463 -5.330672  -3.7558248]
resid_pre@10: (768,)
[-5.383737  -5.043421  -3.668625  -5.4208074 -3.8021927]
resid_pre@11: (768,)
[-1.0187595  -0.7319456   1.3075768   0.00590296  0.009

#### Compute cosine similarities

In [13]:
import numpy as np

# Make human_vec (just a numpy'd version of
# human_sim_vec for purposes of computation)
# Entries keep human_sim_vec's order (one value per word pair).
human_vec = np.array(human_sim_vec)
# Bare expression so the array is shown as the cell's output
human_vec

array([0.25 , 0.813, 0.65 , 0.744, 0.783, 0.756, 0.887, 0.71 , 0.738,
       0.669, 0.883, 0.619, 0.844, 0.894, 0.781, 0.759, 0.836, 0.91 ,
       0.592, 0.292, 0.215, 0.369, 0.685, 0.763, 0.7  , 0.5  , 0.922,
       0.778, 0.752, 0.442, 0.681, 0.903, 0.663, 0.944, 0.879, 0.896,
       0.556, 0.463, 0.742, 0.929, 0.858, 0.592, 0.788, 0.789, 0.781,
       0.57 , 0.902, 0.525, 0.83 , 0.747, 0.742, 0.929, 0.866, 0.915,
       0.904, 0.842, 0.853, 0.719, 0.688, 0.713, 0.735, 0.577, 0.808,
       0.845, 0.802, 0.604, 0.585, 0.662, 0.763, 0.558, 0.808, 0.671,
       0.759, 0.834, 0.87 , 0.631, 0.622, 0.888, 0.688, 0.644, 0.681,
       0.677, 0.7  , 0.708, 0.735, 0.562, 0.8  , 0.8  , 0.685, 0.477,
       1.   , 0.631, 0.5  , 0.897, 0.813, 0.846, 0.773])

In [14]:
# Sorted list of hook tags used for the cosine similarity computation,
# taken from the first word's entry in word_to_hook_vecs.
# Sorting is on the tag strings, so 'attn_out@10' precedes 'attn_out@2':
#['attn_out@0',
# 'attn_out@1',
# 'attn_out@10',
# 'attn_out@11',
# 'attn_out@2',
#     ...
# 'resid_pre@8',
# 'resid_pre@9']
hook_tags = list(next(iter(word_to_hook_vecs.values())))
hook_tags.sort()

# Word pairs in the key order of ovector_mapping, e.g.
#[('Arafat', 'Jackson'),
# ('Harvard', 'Yale'),
# ('Japanese', 'American'),
#           ...
# ]
pairs = [*ovector_mapping]

In [15]:
from numpy.linalg import norm

def cosine(u, v):
  """
  Cosine similarity of u and v, returned as a Python float.

  Returns np.nan when either vector has zero norm, since the
  similarity is undefined in that case.
  """
  norm_u = norm(u)
  norm_v = norm(v)
  if not norm_u or not norm_v:
    return np.nan
  return float(np.dot(u, v) / (norm_u * norm_v))

# For every hook tag: one cosine similarity per word pair, in the order
# of `pairs`, stored as a float NumPy array
cosine_by_hook = {
  tag: np.asarray(
    [
      cosine(word_to_hook_vecs[w1][tag], word_to_hook_vecs[w2][tag])
      for (w1, w2) in pairs
    ],
    dtype=float,
  )
  for tag in hook_tags
}

# Show the similarities as a table: one row per word pair, one column
# per hook tag (columns follow the order of hook_tags)
cosine_df = pd.DataFrame(
  data={tag: cosine_by_hook[tag] for tag in hook_tags},
  index=pairs,
)
display(cosine_df)

Unnamed: 0,attn_out@0,attn_out@1,attn_out@10,attn_out@11,attn_out@2,attn_out@3,attn_out@4,attn_out@5,attn_out@6,attn_out@7,...,resid_pre@10,resid_pre@11,resid_pre@2,resid_pre@3,resid_pre@4,resid_pre@5,resid_pre@6,resid_pre@7,resid_pre@8,resid_pre@9
"(Arafat, Jackson)",0.726338,0.871980,0.931544,0.998285,0.899560,0.996132,0.988263,0.991256,0.973145,0.978686,...,0.999237,0.997553,0.994681,0.999672,0.999710,0.999717,0.999715,0.999696,0.999577,0.999469
"(Harvard, Yale)",0.942272,0.992928,0.989518,1.000000,0.993333,0.999420,0.997353,0.998575,0.997024,0.995956,...,0.999987,0.999986,0.999641,0.999977,0.999982,0.999984,0.999986,0.999987,0.999987,0.999987
"(Japanese, American)",0.898009,0.972105,0.984077,1.000000,0.977913,0.998193,0.992347,0.996942,0.992721,0.992320,...,0.999976,0.999975,0.999360,0.999964,0.999970,0.999973,0.999975,0.999976,0.999977,0.999977
"(Mexico, Brazil)",0.919369,0.991951,0.989980,1.000000,0.991800,0.999164,0.996851,0.998404,0.996629,0.995908,...,0.999987,0.999987,0.999646,0.999981,0.999983,0.999985,0.999986,0.999987,0.999987,0.999987
"(aluminum, metal)",0.919519,0.985299,0.988970,1.000000,0.987630,0.998991,0.995462,0.997775,0.994960,0.995918,...,0.999981,0.999980,0.999577,0.999958,0.999972,0.999977,0.999979,0.999980,0.999981,0.999981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(travel, activity)",0.888831,0.970361,0.982840,1.000000,0.974468,0.998125,0.992250,0.996055,0.992548,0.991665,...,0.999970,0.999969,0.999151,0.999956,0.999962,0.999966,0.999969,0.999970,0.999971,0.999971
"(type, kind)",0.931656,0.983732,0.989876,1.000000,0.979857,0.998684,0.994141,0.997278,0.993874,0.993992,...,0.999978,0.999977,0.999420,0.999962,0.999970,0.999974,0.999976,0.999977,0.999978,0.999978
"(vodka, brandy)",0.865418,0.906386,0.889812,0.997130,0.949270,0.995186,0.984927,0.992434,0.946382,0.963299,...,0.999083,0.997158,0.996104,0.999791,0.999808,0.999802,0.999787,0.999754,0.999641,0.999461
"(vodka, gin)",0.908244,0.978006,0.981754,1.000000,0.985929,0.998446,0.993334,0.997286,0.992814,0.993684,...,0.999977,0.999976,0.999377,0.999965,0.999971,0.999974,0.999976,0.999977,0.999977,0.999977


### Compute RSA (Pearson correlation) between human and machine similarity vectors for each layer

In [16]:
from scipy.stats import pearsonr

# rsa_scores maps each hook tag to the Pearson correlation between that
# hook's machine similarities and the human similarities
rsa_scores = {}

for hook, machine_vec in cosine_by_hook.items():
  # cosine() returns NaN for zero-norm vectors; correlate only the pairs
  # where both the machine and the human value are finite, so a single
  # NaN does not turn the whole hook's score into NaN
  valid = np.isfinite(machine_vec) & np.isfinite(human_vec)
  if valid.sum() < 2:
    # too few usable pairs for a correlation; record the score as undefined
    rsa_scores[hook] = float("nan")
    continue
  r, _ = pearsonr(machine_vec[valid], human_vec[valid])
  rsa_scores[hook] = r

# Highest correlation first. Undefined (NaN) scores are ranked last so they
# cannot scramble the ordering. At most 50 hooks are kept.
top_50 = sorted(
  rsa_scores.items(),
  key=lambda x: float("-inf") if np.isnan(x[1]) else x[1],
  reverse=True,
)[:50]

for hook, score in top_50:
  print(f"{hook}: {score:.4f}")

attn_out@0: 0.5531
resid_pre@0: 0.3552
mlp_out@0: 0.2517
resid_post@0: 0.2307
resid_pre@1: 0.2307
mlp_out@1: 0.2105
attn_out@1: 0.2087
attn_out@10: 0.1925
attn_out@9: 0.1885
mlp_out@10: 0.1659
resid_post@11: 0.1577
mlp_out@11: 0.1500
attn_out@2: 0.1495
attn_out@8: 0.1419
resid_post@1: 0.1403
resid_pre@2: 0.1403
mlp_out@8: 0.1380
mlp_out@9: 0.1317
resid_post@2: 0.1302
resid_pre@3: 0.1302
resid_post@3: 0.1297
resid_pre@4: 0.1297
mlp_out@2: 0.1290
resid_post@4: 0.1240
resid_pre@5: 0.1240
mlp_out@7: 0.1234
resid_post@7: 0.1185
resid_pre@8: 0.1185
mlp_out@4: 0.1183
resid_post@5: 0.1169
resid_pre@6: 0.1169
attn_out@11: 0.1137
resid_post@6: 0.1114
resid_pre@7: 0.1114
resid_post@8: 0.1110
resid_pre@9: 0.1110
resid_post@10: 0.1087
resid_pre@11: 0.1087
mlp_out@6: 0.1081
resid_post@9: 0.1029
resid_pre@10: 0.1029
attn_out@7: 0.0997
mlp_out@5: 0.0980
mlp_out@3: 0.0662
attn_out@3: 0.0327
attn_out@4: 0.0286
attn_out@6: 0.0243
attn_out@5: 0.0231
