In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
import json
from tqdm import tqdm

import os
import sys
sys.path.append('..')

from relations import estimate
from util import model_utils
from dsets.counterfact import CounterFactDataset
from util import nethook

In [4]:
MODEL_NAME = "EleutherAI/gpt-j-6B"  # gpt2-{medium,large,xl} or EleutherAI/gpt-j-6B
mt = model_utils.ModelAndTokenizer(MODEL_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float16)

model = mt.model
tokenizer = mt.tokenizer
tokenizer.pad_token = tokenizer.eos_token

In [5]:
ln_f = nethook.get_module(model, "transformer.ln_f")
lm_head = nethook.get_module(model, "lm_head")

def get_vocab_representation(h, perform_layer_norm = True, return_top_k = 5):
    z = h.clone()
    if(perform_layer_norm == True):
        z = ln_f(z)
    logits = lm_head(z)
    token_ids = logits.topk(dim=-1, k=return_top_k).indices.squeeze().tolist()
    return [
        tokenizer.decode(t) for t in token_ids
    ]

In [6]:
test_cases = [
    ("The Great Wall", -1, "China"),
    ("Niagara Falls", -2, "Canada"),
    ("Valdemarsvik", -1, "Sweden"),
    ("Kyoto University", -2, "Japan"),
    ("Hattfjelldal", -1, "Norway"),
    ("Ginza", -1, "Japan"),
    ("Sydney Hospital", -2, "Australia"),
    ("Mahalangur Himal", -1, "Nepal"),
    ("Higashikagawa", -1, "Japan"),
    ("Trento", -1, "Italy"),
    ("Taj Mahal", -1, "India")
]

def check_with_test_cases(relation_operator):
    for subject, subject_token_index, target in test_cases:
        objects = relation_operator(
            subject,
            subject_token_index=subject_token_index,
            device=model.device,
            return_top_k=5,
        )
        print(f"{subject}, target: {target}   ==>   predicted: {objects}")

In [9]:
# good cases

space_needle = estimate.estimate_relation_operator(
    model, tokenizer,
    "The Space Needle",
    "{} is located in the country of",
    layer=15,
    device=model.device,
)

print(get_vocab_representation(space_needle.bias))

[' Seattle', ' Washington', ' the', ' Sweden', ' Canada']


In [10]:
check_with_test_cases(space_needle)

The Great Wall, target: China   ==>   predicted: [' China', ' Seattle', ' Beijing', ' Taiwan', ' Japan']
Niagara Falls, target: Canada   ==>   predicted: [' Niagara', ' Canada', 'Ni', ' New', ' Ontario']
Valdemarsvik, target: Sweden   ==>   predicted: [' Sweden', ' Seattle', ' Washington', ' Scandinav', ' Swedish']
Kyoto University, target: Japan   ==>   predicted: [' Kyoto', ' Japan', ' Osaka', ' Tokyo', ' Japanese']
Hattfjelldal, target: Norway   ==>   predicted: [' Norway', ' Sweden', ' Scandinav', ' Denmark', ' Washington']
Ginza, target: Japan   ==>   predicted: [' Japan', ' Tokyo', ' Seattle', ' Osaka', ' Shin']
Sydney Hospital, target: Australia   ==>   predicted: [' Australia', ' Sydney', ' Australian', ' Queensland', ' New']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Nepal', ' Seattle', ' Washington', ' Bh', ' Switzerland']
Higashikagawa, target: Japan   ==>   predicted: [' Japan', ' Tokyo', ' Seattle', ' Osaka', ' Kyoto']
Trento, target: Italy   ==>   predicted: ['

## Interpreting Biases

In [18]:
calculated_relations = np.load(
    "gpt-j/P17/cached_JB/jacobian_calculations__all_sub_toks__layer_15___0_to_100.npz",
    allow_pickle=True
)["jacobians"]

In [19]:
len(calculated_relations)

100

In [32]:
calculated_relations[0]['all_weights_and_biases'][-1]['misc'].keys()

dict_keys(['Jh_norm', 'bias_norm', 'h_info', 'consider_residual'])

In [28]:
for relation_collention in calculated_relations:
    relation_operator = relation_collention['all_weights_and_biases'][-1]
    print(f"({relation_collention['subject']}, {relation_collention['request']['prompt']}, {relation_collention['request']['target_true']['str']})")
    bias = torch.tensor(relation_operator["bias"], device=model.device)
    print(get_vocab_representation(bias))

(Autonomous University of Madrid, {}, which is located in, Spain)
[' Spain', ' Madrid', ' the', ' And', ' Cast']
(Kuala Langat, {}, located in, Malaysia)
[' Malaysia', ' Pen', ' Sel', ' Sab', ' Mal']
(Wanne-Eickel Central Station, {}, located in, Germany)
[' Germany', ' Han', ' North', ' Bad', ' Sch']
(Bastille, {}, which is located in, France)
[' France', ' the', ' Belgium', ' Luxembourg', ' Brittany']
(Shablykinsky District, {} is located in the country of, Russia)
[' Russia', ' Ukraine', ' Kazakhstan', ' Belarus', ' the']
(Valdemarsvik, {}, which is located in, Sweden)
[' Sweden', ' Latvia', ' S', ' J', ' Estonia']
(Attingal, {}, which is located in, India)
[' India', ' Kerala', ' Go', ' And', ' Karn']
(Nizampatnam, {} is located in the country of, India)
[' And', ' India', ' Od', ' Tel', ' Tamil']
(Darmstadt, {} is located in the country of, Germany)
[' Germany', ' H', ' the', ' Bad', ' North']
(Adliswil, {}, which is located in, Switzerland)
[' Switzerland', ' the', ' Cant', ' Z',

In [26]:
check_with_test_cases(space_needle)

The Great Wall, target: China   ==>   predicted: [' China', ' Hong', ' Beijing', ' Chinese', ' Shen']
Niagara Falls, target: Canada   ==>   predicted: [' Canada', ' Ontario', ' Niagara', ' New', ' British']
Valdemarsvik, target: Sweden   ==>   predicted: [' Iceland', ' Denmark', ' Sweden', ' Finland', ' Norway']
Kyoto University, target: Japan   ==>   predicted: [' Japan', ' Japanese', ' Finland', ' Hawaii', ' Tokyo']
Hattfjelldal, target: Norway   ==>   predicted: [' Iceland', ' Denmark', ' Norway', ' Sweden', ' Finland']
Ginza, target: Japan   ==>   predicted: [' Japan', ' Singapore', ' China', ' Seattle', ' Hong']
Sydney Hospital, target: Australia   ==>   predicted: [' Australia', ' Sydney', ' Australian', ' Singapore', ' Canberra']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Nepal', ' Tibet', ' Bh', ' Nep', ' China']
Higashikagawa, target: Japan   ==>   predicted: [' Japan', ' Japanese', ' Tokyo', ' Canada', ' Seattle']
Trento, target: Italy   ==>   predicted: [' Sweden'

In [22]:
space_needle_2 = estimate.estimate_relation_operator(
    model, tokenizer,
    "The Space Needle",
    "{} is located in the country of",
    layer=25,
    device=model.device,
    calculate_at_lnf= True
)

print(get_vocab_representation(space_needle_2.bias))

[' Nam', ' Beau', ' Krug', ' Macron', ' Cham']


In [27]:
check_with_test_cases(space_needle_2)

The Great Wall, target: China   ==>   predicted: [' China', ' Shen', ' Chinese', ' Qing', ' Beijing']
Niagara Falls, target: Canada   ==>   predicted: [' Niagara', 'Toronto', ' Ontario', ' Cuomo', ' Erie']
Valdemarsvik, target: Sweden   ==>   predicted: [' Nordic', ' Greenland', 'vik', ' Swedish', ' Icelandic']
Kyoto University, target: Japan   ==>   predicted: ['Japanese', ' Japanese', 'Tok', 'Japan', ' Japan']
Hattfjelldal, target: Norway   ==>   predicted: [' Nordic', ' Denmark', ' Iceland', ' Scandinavian', ' Icelandic']
Ginza, target: Japan   ==>   predicted: [' Tokyo', 'Tok', ' Japan', 'Japan', ' Japanese']
Sydney Hospital, target: Australia   ==>   predicted: [' Sydney', ' NSW', ' Australia', ' Australian', 'Australia']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Himal', ' Tibetan', ' Nepal', ' Nep', ' Tibet']
Higashikagawa, target: Japan   ==>   predicted: [' Japanese', 'Japanese', ' Japan', ' Tokyo', 'Tok']
Trento, target: Italy   ==>   predicted: ['meta', ' Budapest

In [28]:
menangle = estimate.estimate_relation_operator(
    model, tokenizer,
    "Menangle Park",
    "{} is located in the country of",
    layer=25,
    device=model.device,
    subject_token_index = 1
)

get_vocab_representation(menangle.bias)

[' Australia', ' the', ' New', ' Ireland', ' South']

In [29]:
check_with_test_cases(menangle)

The Great Wall, target: China   ==>   predicted: [' China', ' India', ' Hong', ' the', ' Z']
Niagara Falls, target: Canada   ==>   predicted: [' Newfoundland', ' Ontario', ' New', ' Ireland', ' Canada']
Valdemarsvik, target: Sweden   ==>   predicted: [' Australia', ' New', ' the', ' Canada', ' Bh']
Kyoto University, target: Japan   ==>   predicted: [' India', ' Bh', ' Nepal', ' Thailand', ' Sri']
Hattfjelldal, target: Norway   ==>   predicted: [' Ireland', ' Scotland', ' the', ' Australia', ' Wales']
Ginza, target: Japan   ==>   predicted: [' India', ' Australia', ' Hong', ' Japan', ' Bh']
Sydney Hospital, target: Australia   ==>   predicted: [' Australia', ' Sydney', ' Queensland', ' NSW', ' New']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Bh', ' Nepal', ' India', ' Sri', ' Kashmir']
Higashikagawa, target: Japan   ==>   predicted: [' Japan', ' India', ' New', ' Australia', ' the']
Trento, target: Italy   ==>   predicted: [' New', ' Australia', ' Ireland', ' the', ' France']

In [30]:
menangle_2 = estimate.estimate_relation_operator(
    model, tokenizer,
    "Menangle Park",
    "{} is located in the country of",
    layer=25,
    device=model.device,
    subject_token_index = 1,
    calculate_at_lnf= True
)

get_vocab_representation(menangle_2.bias)

[' ACT', ' NSW', ' Sydney', ' Brisbane', ' Sutherland']

In [33]:
check_with_test_cases(menangle_2)

The Great Wall, target: China   ==>   predicted: ['\x0b', '�', '\r', '龍�', '\x0f']
Niagara Falls, target: Canada   ==>   predicted: [' Niagara', '\r', '�', '�', '\x0b']
Valdemarsvik, target: Sweden   ==>   predicted: ['�', '\x0b', '�', '\r', '龍�']
Kyoto University, target: Japan   ==>   predicted: ['�', '\x0b', '\r', '龍�', '�']
Hattfjelldal, target: Norway   ==>   predicted: ['�', '\r', '�', '\x0b', '\x11']
Ginza, target: Japan   ==>   predicted: ['\x0b', '�', '\r', '�', '龍�']
Sydney Hospital, target: Australia   ==>   predicted: [' Sydney', ' NSW', '�', '\x0b', '\r']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Himal', ' Nepal', ' Bh', ' Hindu', ' Kashmir']
Higashikagawa, target: Japan   ==>   predicted: ['�', '\x0b', '\r', '�', 'rawdownload']
Trento, target: Italy   ==>   predicted: ['�', '\x0b', '\r', '\x1c', '�']
Taj Mahal, target: India   ==>   predicted: ['�', '\r', '\x0b', '\x08', '龍�']


In [None]:
(space_needle.bias - menangle.bias).norm()

tensor(269.9868, device='cuda:0')

## Difference with USUAL calculation

In [11]:
final_layer = f"transformer.h.{model.config.n_layer-1}"

In [12]:
prompt = "The Space Needle is located in the city of"

tokenized = tokenizer(prompt, padding=True, return_tensors="pt").to(
    next(model.parameters()).device
)

with nethook.TraceDict(
    model, [final_layer, "transformer.ln_f"]
) as traces:  
    output = model(**tokenized)

In [13]:
get_vocab_representation(traces[final_layer].output[0][0][-1])

[' Seattle', ' Seat', ' the', ' Portland', ' Space']

In [14]:
get_vocab_representation(traces["transformer.ln_f"].output[0][-1], perform_layer_norm= False)

[' Seattle', ' Seat', ' the', ' Portland', ' Space']

In [15]:
logits = output.logits[:, -1, :]
token_ids = logits.topk(dim=-1, k=5).indices.squeeze().tolist()
[tokenizer.decode(t) for t in token_ids]

[' Seattle', ' Seat', ' the', ' Portland', ' Space']

In [17]:
for subject, subject_token_index, target in test_cases:
    objects, z_est = space_needle(
        subject,
        subject_token_index=subject_token_index,
        device=model.device,
        return_top_k=5,
    )
    print(f"{subject}, target: {target}")

    prompt = "{} is located in the country of".format(subject)
    tokenized = tokenizer(prompt, padding=True, return_tensors="pt").to(next(model.parameters()).device)
    with nethook.TraceDict(
        model, [final_layer, "transformer.ln_f"]
    ) as traces:  
        output = model(**tokenized)
    z = traces[final_layer].output[0][0][-1]

    print("z_ = ", get_vocab_representation(z))
    print("z_est = ", get_vocab_representation(z_est))
    print("Distance => ", torch.dist(z, z_est).item())
    print()
    

The Great Wall, target: China
z_ =  [' China', ' G', ' Shan', ' the', ' Great']
z_est =  [' China', ' Seattle', ' Beijing', ' Taiwan', ' Japan']
Distance =>  169.125

Niagara Falls, target: Canada
z_ =  [' Canada', ' Ontario', ' New', ' the', ' Niagara']
z_est =  [' Niagara', ' Canada', 'Ni', ' New', ' Ontario']
Distance =>  180.25

Valdemarsvik, target: Sweden
z_ =  [' Sweden', ' S', ' J', ' Latvia', ' V']
z_est =  [' Sweden', ' Seattle', ' Washington', ' Scandinav', ' Swedish']
Distance =>  179.25

Kyoto University, target: Japan
z_ =  [' Japan', ' Kyoto', ' Ky', ' the', ' H']
z_est =  [' Kyoto', ' Japan', ' Osaka', ' Tokyo', ' Japanese']
Distance =>  163.25

Hattfjelldal, target: Norway
z_ =  [' Norway', ' Nord', ' S', ' Tr', ' Finn']
z_est =  [' Norway', ' Sweden', ' Scandinav', ' Denmark', ' Washington']
Distance =>  189.75

Ginza, target: Japan
z_ =  [' Japan', ' Tokyo', ' the', ' Ch', ' Shin']
z_est =  [' Japan', ' Tokyo', ' Seattle', ' Osaka', ' Shin']
Distance =>  150.75

Sydn

## Which contributes the most? $Jh$ or $bias$?

In [46]:
os.listdir("gpt-j/P17/cached_JB/")

['jacobian_calculations__all_sub_toks__layer_15___200_to_300.npz',
 'jacobian_calculations__all_sub_toks__layer_15___300_to_400.npz',
 'jacobian_calculations__all_sub_toks__layer_15___400_to_404.npz',
 'jacobian_calculations__all_sub_toks__layer_15___100_to_200.npz',
 'jacobian_calculations__all_sub_toks__layer_15___0_to_100.npz']

In [50]:
jh_collection = []
bias_collection = []
rank_collection = []

path = "gpt-j/P17/cached_JB/"
cached_jb_files = os.listdir(path)

for jb_file in cached_jb_files:
    print("loading --> ", jb_file)
    calculated_relations = np.load(f"{path}{jb_file}", allow_pickle=True)["jacobians"]
    for relation_collention in tqdm(calculated_relations):
        relation_operator = relation_collention['all_weights_and_biases'][-1]
        jh_collection.append(relation_operator['misc']['Jh_norm'])
        bias_collection.append(relation_operator['misc']['bias_norm'])
        J = torch.tensor(relation_operator["weight"], device=model.device, dtype=torch.float32)
        rank_collection.append(torch.linalg.matrix_rank(J).item())

loading -->  jacobian_calculations__all_sub_toks__layer_15___200_to_300.npz


100%|██████████| 100/100 [03:39<00:00,  2.20s/it]


loading -->  jacobian_calculations__all_sub_toks__layer_15___300_to_400.npz


100%|██████████| 100/100 [03:40<00:00,  2.20s/it]


loading -->  jacobian_calculations__all_sub_toks__layer_15___400_to_404.npz


100%|██████████| 4/4 [00:08<00:00,  2.16s/it]


loading -->  jacobian_calculations__all_sub_toks__layer_15___100_to_200.npz


100%|██████████| 100/100 [03:41<00:00,  2.21s/it]


loading -->  jacobian_calculations__all_sub_toks__layer_15___0_to_100.npz


100%|██████████| 100/100 [03:40<00:00,  2.21s/it]


In [52]:
jh_collection = np.array(jh_collection)
bias_collection = np.array(bias_collection)
rank_collection = np.array(rank_collection)

In [53]:
f"{jh_collection.mean()} +/- {jh_collection.std()}"

'12.675278465346535 +/- 8.163360430617857'

In [54]:
f"{bias_collection.mean()} +/- {bias_collection.std()}"

'254.11355198019803 +/- 23.25887094834684'

In [55]:
f"{rank_collection.mean()} +/- {rank_collection.std()}"

'3788.05198019802 +/- 115.54177064337186'

## Good case vs Bad cases

In [56]:
# (s, o, h_idx)
relation_format = '{} is located in the country of'
good_cases = [
    ('Haut Atlas', 'Morocco', 2),
    ('Pamukkale', 'Turkey', 4),
    ('Fort Madalena', 'Malta', 3),
    ('Umarex', 'Germany', 2),
    ('Qatar Ladies Open', 'Qatar', 2),
    ('Sydney Peace Prize', 'Australia', 4),
    ('Alte Nationalgalerie', 'Germany', 4),
]

In [58]:
good_relations = []
for subject, object, h_idx in tqdm(good_cases):
    relation = estimate.estimate_relation_operator(
        model, tokenizer,
        subject, relation_format,
        subject_token_index= h_idx,
        layer = 15,
        device= model.device
    )
    good_relations.append(relation)

100%|██████████| 7/7 [00:29<00:00,  4.25s/it]


In [61]:
weights = [r.weight for r in good_relations]
bias = [r.bias for r in good_relations]

In [66]:
for one in range(len(weights)):
    w1 = weights[one]
    dist = [round(torch.dist(w1, w2).item(),3) for w2 in weights]
    print(round(w1.norm().item(), 3), dist)

19.188 [0.0, 38.375, 21.688, 18.078, 23.047, 21.859, 26.297]
39.75 [38.375, 0.0, 36.188, 38.625, 38.406, 40.906, 38.406]
20.594 [21.688, 36.188, 0.0, 21.422, 23.797, 24.375, 26.688]
15.312 [18.078, 38.625, 21.422, 0.0, 22.859, 19.344, 24.812]
23.0 [23.047, 38.406, 23.797, 22.859, 0.0, 25.266, 27.953]
18.828 [21.859, 40.906, 24.375, 19.344, 25.266, 0.0, 27.016]
26.5 [26.297, 38.406, 26.688, 24.812, 27.953, 27.016, 0.0]


In [67]:
for one in range(len(bias)):
    w1 = bias[one]
    dist = [round(torch.dist(w1, w2).item(),3) for w2 in bias]
    print(round(w1.norm().item(), 3), dist)

269.75 [0.0, 178.0, 92.938, 92.312, 107.188, 131.125, 137.125]
230.5 [178.0, 0.0, 174.125, 173.5, 186.375, 188.25, 172.75]
253.25 [92.938, 174.125, 0.0, 102.312, 115.125, 131.375, 139.5]
247.25 [92.312, 173.5, 102.312, 0.0, 107.188, 124.312, 122.062]
274.25 [107.188, 186.375, 115.125, 107.188, 0.0, 134.625, 150.25]
246.0 [131.125, 188.25, 131.375, 124.312, 134.625, 0.0, 137.125]
216.75 [137.125, 172.75, 139.5, 122.062, 150.25, 137.125, 0.0]


In [83]:
import random
calculated_relations = np.load(
    "gpt-j/P17/cached_JB/jacobian_calculations__all_sub_toks__layer_15___0_to_100.npz",
    allow_pickle=True
)["jacobians"]

random_indices = random.choices(range(0, len(calculated_relations)), k = 7)
print(random_indices)

weights = []
bias = []

for idx in tqdm(random_indices):
    relation_collention = calculated_relations[idx]
    relation_operator = relation_collention['all_weights_and_biases'][-1]
    weights.append(torch.tensor(relation_operator["weight"], device=model.device, dtype=torch.float32))
    bias.append(torch.tensor(relation_operator["bias"], device=model.device, dtype=torch.float32))

[3, 33, 5, 80, 64, 69, 39]


100%|██████████| 7/7 [00:00<00:00, 93.92it/s]


In [84]:
for one in range(len(weights)):
    w1 = weights[one]
    dist = [round(torch.dist(w1, w2).item(),3) for w2 in weights]
    print(round(w1.norm().item(), 3), dist)

17.549 [0.0, 17.949, 19.386, 27.599, 17.965, 16.597, 23.482]
9.679 [17.949, 0.0, 17.015, 27.033, 10.177, 10.039, 19.596]
16.475 [19.386, 17.015, 0.0, 28.162, 17.053, 15.313, 22.612]
27.394 [27.599, 27.033, 28.162, 0.0, 27.206, 26.912, 31.171]
8.184 [17.965, 10.177, 17.053, 27.206, 0.0, 8.961, 18.904]
5.596 [16.597, 10.039, 15.313, 26.912, 8.961, 0.0, 18.207]
18.04 [23.482, 19.596, 22.612, 31.171, 18.904, 18.207, 0.0]


In [85]:
for one in range(len(bias)):
    w1 = bias[one]
    dist = [round(torch.dist(w1, w2).item(),3) for w2 in bias]
    print(round(w1.norm().item(), 3), dist)

280.816 [0.0, 161.95, 138.882, 134.235, 178.476, 129.226, 172.651]
223.301 [161.95, 0.0, 163.701, 160.271, 165.099, 167.165, 176.24]
259.589 [138.882, 163.701, 0.0, 160.631, 180.087, 109.962, 180.799]
273.735 [134.235, 160.271, 160.631, 0.0, 173.806, 157.031, 169.659]
237.356 [178.476, 165.099, 180.087, 173.806, 0.0, 181.295, 173.081]
250.456 [129.226, 167.165, 109.962, 157.031, 181.295, 0.0, 174.918]
240.872 [172.651, 176.24, 180.799, 169.659, 173.081, 174.918, 0.0]
