In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
import json
from tqdm import tqdm

import os
import sys
sys.path.append('..')

from relations import estimate
from util import model_utils
from dsets.counterfact import CounterFactDataset
from util import nethook

In [3]:
# Checkpoint to load: gpt2-{medium,large,xl} or EleutherAI/gpt-j-6B
MODEL_NAME = "gpt2-xl"
mt = model_utils.ModelAndTokenizer(MODEL_NAME, low_cpu_mem_usage=False)

# Expose the wrapped objects under short names used by every later cell.
model, tokenizer = mt.model, mt.tokenizer
# Reuse the end-of-sequence token for padding; later cells tokenize with padding=True.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Final layer norm and unembedding head of the loaded model; the helper below
# uses them to project a hidden state onto the vocabulary.
ln_f = nethook.get_module(model, "transformer.ln_f")
lm_head = nethook.get_module(model, "lm_head")

def get_vocab_representation(h, perform_layer_norm = True, return_top_k = 5):
    """Decode the highest-scoring vocabulary tokens for a hidden state.

    Args:
        h: hidden-state tensor passed through `ln_f` (optionally) and `lm_head`.
            Callers in this notebook pass a single vector — other shapes are
            untested here (TODO confirm).
        perform_layer_norm: when truthy, apply `ln_f` before `lm_head`. Pass
            False when `h` was already read from the output of `ln_f`.
        return_top_k: how many of the highest-logit tokens to return.

    Returns:
        A list containing one decoded string per selected token id.
    """
    # Inspection only: no_grad avoids recording an autograd graph on each call.
    with torch.no_grad():
        # ln_f / lm_head are applied out-of-place, so no defensive copy of h is needed.
        z = ln_f(h) if perform_layer_norm else h
        logits = lm_head(z)
        token_ids = logits.topk(dim=-1, k=return_top_k).indices.squeeze().tolist()
    # With a single selected index, squeeze() yields a 0-d tensor and tolist()
    # returns a bare int; wrap it so the comprehension below can iterate.
    if isinstance(token_ids, int):
        token_ids = [token_ids]
    return [
        tokenizer.decode(t) for t in token_ids
    ]

In [5]:
# Evaluation triples, unpacked by the loops in this notebook as
# (subject, subject_token_index, target).
# NOTE(review): subject_token_index is presumably the position of the subject
# token whose hidden state the operator reads (negative = counted from the
# end) — confirm against the relation operator's implementation.
test_cases = [
    ("The Great Wall", -1, "China"),
    ("Niagara Falls", -2, "Canada"),
    ("Valdemarsvik", -1, "Sweden"),
    ("Kyoto University", -2, "Japan"),
    ("Hattfjelldal", -1, "Norway"),
    ("Ginza", -1, "Japan"),
    ("Sydney Hospital", -2, "Australia"),
    ("Mahalangur Himal", -1, "Nepal"),
    ("Higashikagawa", -1, "Japan"),
    ("Trento", -1, "Italy"),
    ("Taj Mahal", -1, "India")
]

def check_with_test_cases(relation_operator, cases=None, return_top_k=5):
    """Run a relation operator over a list of cases and print its predictions.

    Args:
        relation_operator: callable invoked as
            `relation_operator(subject, subject_token_index=..., device=..., return_top_k=...)`.
        cases: iterable of `(subject, subject_token_index, target)` triples;
            defaults to the notebook-level `test_cases`.
        return_top_k: number of predictions requested from the operator.

    Prints one line per case; returns nothing.
    """
    if cases is None:
        cases = test_cases
    for subject, subject_token_index, target in cases:
        result = relation_operator(
            subject,
            subject_token_index=subject_token_index,
            device=model.device,
            return_top_k=return_top_k,
        )
        # Another cell in this notebook unpacks the operator's result as
        # `(objects, z_est)`. Accept both that form and a bare prediction list
        # so the printed line always shows only the predicted objects.
        objects = result[0] if isinstance(result, tuple) else result
        print(f"{subject}, target: {target}   ==>   predicted: {objects}")

In [6]:
# good cases

# Estimate a relation operator from one subject ("The Space Needle") and one
# prompt template, using layer 25 of the loaded model.
space_needle = estimate.estimate_relation_operator(
    model, tokenizer,
    "The Space Needle",
    "{} is located in the country of",
    layer=25,
    device=model.device,
)

# Decode the operator's bias term into its top vocabulary tokens.
print(get_vocab_representation(space_needle.bias))

[' Washington', ' Canada', ' the', ' Seattle', ' Japan']


In [7]:
# Apply the Space Needle operator to every entry of `test_cases` and print its top-5 predictions.
check_with_test_cases(space_needle)

The Great Wall, target: China   ==>   predicted: [' China', ' Hong', ' Beijing', ' Chinese', ' Shen']
Niagara Falls, target: Canada   ==>   predicted: [' Canada', ' Ontario', ' Niagara', ' New', ' British']
Valdemarsvik, target: Sweden   ==>   predicted: [' Iceland', ' Denmark', ' Sweden', ' Finland', ' Norway']
Kyoto University, target: Japan   ==>   predicted: [' Japan', ' Japanese', ' Finland', ' Hawaii', ' Tokyo']
Hattfjelldal, target: Norway   ==>   predicted: [' Iceland', ' Denmark', ' Norway', ' Sweden', ' Finland']
Ginza, target: Japan   ==>   predicted: [' Japan', ' Singapore', ' China', ' Seattle', ' Hong']
Sydney Hospital, target: Australia   ==>   predicted: [' Australia', ' Sydney', ' Australian', ' Singapore', ' Canberra']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Nepal', ' Tibet', ' Bh', ' Nep', ' China']
Higashikagawa, target: Japan   ==>   predicted: [' Japan', ' Japanese', ' Tokyo', ' Canada', ' Seattle']
Trento, target: Italy   ==>   predicted: [' Sweden'

## Interpreting Biases

In [5]:
# Load the cached per-subject relation estimates stored under the "jacobians" key.
# NOTE(security): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code — only load cache files produced by trusted runs of this project.
# The context manager closes the underlying .npz archive handle once the array
# has been read; the loaded array remains valid afterwards.
with np.load(
    "cached_jacobians/jacobian_calculations_P17__layer_25.npz",
    allow_pickle=True
) as cached_npz:
    calculated_relations = cached_npz["jacobians"]

In [6]:
# Number of cached relation entries loaded from the .npz file.
len(calculated_relations)

266

In [7]:
# For every cached entry, print (subject, prompt template, true target) and then
# the top vocabulary tokens decoded from that entry's stored bias.
for relation in calculated_relations:
    print(f"({relation['subject']}, {relation['request']['prompt']}, {relation['request']['target_true']['str']})")
    # Move the stored bias onto the model's device so ln_f / lm_head can consume it.
    bias = torch.tensor(relation["bias"], device=model.device)
    print(get_vocab_representation(bias))

(Autonomous University of Madrid, {}, which is located in, Spain)
[' Spain', ' And', ' Catalonia', ' Gran', ' Se']
(Kuala Langat, {}, located in, Malaysia)
[' Malaysia', ' Indonesia', ' B', ' Thailand', ' Brune']
(Bastille, {}, which is located in, France)
[' France', ' the', ' Brittany', ' Bast', ' Catalonia']
(Valdemarsvik, {}, which is located in, Sweden)
[' Sweden', ' Norway', ' Denmark', ' Iceland', ' Finland']
(Piper Verlag, {}, which is located in, Germany)
[' Germany', ' Austria', ' the', ' Luxembourg', ' Switzerland']
(Tehri Garhwal district, {}, in, India)
[' India', ' Nepal', ' Bh', ' Pakistan', ' Afghanistan']
(Darmstadt, {} is located in the country of, Germany)
[' Germany', ' the', ' Lower', ' Bav', ' Sw']
(Gazzola, {} is located in the country of, Italy)
[' Italy', ' Georgia', ' the', ' France', ' T']
(Borovsky District, {} is located in, Russia)
[' Russia', ' Belarus', ' Georgia', ' Kazakhstan', ' Ukraine']
(Eibenstock, {}, located in, Germany)
[' Germany', ' Austria', 

In [26]:
# Same check as the earlier space_needle cell, repeated here next to the
# `calculate_at_lnf` variant that follows.
check_with_test_cases(space_needle)

The Great Wall, target: China   ==>   predicted: [' China', ' Hong', ' Beijing', ' Chinese', ' Shen']
Niagara Falls, target: Canada   ==>   predicted: [' Canada', ' Ontario', ' Niagara', ' New', ' British']
Valdemarsvik, target: Sweden   ==>   predicted: [' Iceland', ' Denmark', ' Sweden', ' Finland', ' Norway']
Kyoto University, target: Japan   ==>   predicted: [' Japan', ' Japanese', ' Finland', ' Hawaii', ' Tokyo']
Hattfjelldal, target: Norway   ==>   predicted: [' Iceland', ' Denmark', ' Norway', ' Sweden', ' Finland']
Ginza, target: Japan   ==>   predicted: [' Japan', ' Singapore', ' China', ' Seattle', ' Hong']
Sydney Hospital, target: Australia   ==>   predicted: [' Australia', ' Sydney', ' Australian', ' Singapore', ' Canberra']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Nepal', ' Tibet', ' Bh', ' Nep', ' China']
Higashikagawa, target: Japan   ==>   predicted: [' Japan', ' Japanese', ' Tokyo', ' Canada', ' Seattle']
Trento, target: Italy   ==>   predicted: [' Sweden'

In [22]:
# Same subject, prompt template and layer as `space_needle`, but with
# calculate_at_lnf=True.
# NOTE(review): presumably this estimates the operator against the output of
# ln_f instead of the final block output — confirm in relations.estimate.
space_needle_2 = estimate.estimate_relation_operator(
    model, tokenizer,
    "The Space Needle",
    "{} is located in the country of",
    layer=25,
    device=model.device,
    calculate_at_lnf= True
)

# NOTE(review): get_vocab_representation applies ln_f by default. If this bias
# already lives in post-ln_f space, perform_layer_norm=False may be the intended
# call here — verify before interpreting the decoded tokens.
print(get_vocab_representation(space_needle_2.bias))

[' Nam', ' Beau', ' Krug', ' Macron', ' Cham']


In [27]:
# Evaluate the calculate_at_lnf variant of the Space Needle operator on `test_cases`.
check_with_test_cases(space_needle_2)

The Great Wall, target: China   ==>   predicted: [' China', ' Shen', ' Chinese', ' Qing', ' Beijing']
Niagara Falls, target: Canada   ==>   predicted: [' Niagara', 'Toronto', ' Ontario', ' Cuomo', ' Erie']
Valdemarsvik, target: Sweden   ==>   predicted: [' Nordic', ' Greenland', 'vik', ' Swedish', ' Icelandic']
Kyoto University, target: Japan   ==>   predicted: ['Japanese', ' Japanese', 'Tok', 'Japan', ' Japan']
Hattfjelldal, target: Norway   ==>   predicted: [' Nordic', ' Denmark', ' Iceland', ' Scandinavian', ' Icelandic']
Ginza, target: Japan   ==>   predicted: [' Tokyo', 'Tok', ' Japan', 'Japan', ' Japanese']
Sydney Hospital, target: Australia   ==>   predicted: [' Sydney', ' NSW', ' Australia', ' Australian', 'Australia']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Himal', ' Tibetan', ' Nepal', ' Nep', ' Tibet']
Higashikagawa, target: Japan   ==>   predicted: [' Japanese', 'Japanese', ' Japan', ' Tokyo', 'Tok']
Trento, target: Italy   ==>   predicted: ['meta', ' Budapest

In [28]:
# Estimate the same relation (same prompt template, layer 25) from a different
# subject, "Menangle Park", reading the subject at token index 1.
# NOTE(review): how subject_token_index is interpreted is defined in
# relations.estimate — confirm that index 1 is the intended subject token.
menangle = estimate.estimate_relation_operator(
    model, tokenizer,
    "Menangle Park",
    "{} is located in the country of",
    layer=25,
    device=model.device,
    subject_token_index = 1
)

# Top vocabulary tokens decoded from this operator's bias (shown as the cell result).
get_vocab_representation(menangle.bias)

[' Australia', ' the', ' New', ' Ireland', ' South']

In [29]:
# Evaluate the Menangle Park operator on `test_cases`.
check_with_test_cases(menangle)

The Great Wall, target: China   ==>   predicted: [' China', ' India', ' Hong', ' the', ' Z']
Niagara Falls, target: Canada   ==>   predicted: [' Newfoundland', ' Ontario', ' New', ' Ireland', ' Canada']
Valdemarsvik, target: Sweden   ==>   predicted: [' Australia', ' New', ' the', ' Canada', ' Bh']
Kyoto University, target: Japan   ==>   predicted: [' India', ' Bh', ' Nepal', ' Thailand', ' Sri']
Hattfjelldal, target: Norway   ==>   predicted: [' Ireland', ' Scotland', ' the', ' Australia', ' Wales']
Ginza, target: Japan   ==>   predicted: [' India', ' Australia', ' Hong', ' Japan', ' Bh']
Sydney Hospital, target: Australia   ==>   predicted: [' Australia', ' Sydney', ' Queensland', ' NSW', ' New']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Bh', ' Nepal', ' India', ' Sri', ' Kashmir']
Higashikagawa, target: Japan   ==>   predicted: [' Japan', ' India', ' New', ' Australia', ' the']
Trento, target: Italy   ==>   predicted: [' New', ' Australia', ' Ireland', ' the', ' France']

In [30]:
# Menangle Park operator re-estimated with calculate_at_lnf=True; all other
# arguments match the `menangle` cell.
menangle_2 = estimate.estimate_relation_operator(
    model, tokenizer,
    "Menangle Park",
    "{} is located in the country of",
    layer=25,
    device=model.device,
    subject_token_index = 1,
    calculate_at_lnf= True
)

# NOTE(review): the helper applies ln_f by default; if this bias is already in
# post-ln_f space, perform_layer_norm=False may be the intended call — verify.
get_vocab_representation(menangle_2.bias)

[' ACT', ' NSW', ' Sydney', ' Brisbane', ' Sutherland']

In [33]:
# Evaluate the calculate_at_lnf variant of the Menangle Park operator on `test_cases`.
# NOTE(review): the saved output for this cell is dominated by control and
# replacement characters, which suggests the estimate is degenerate — investigate.
check_with_test_cases(menangle_2)

The Great Wall, target: China   ==>   predicted: ['\x0b', '�', '\r', '龍�', '\x0f']
Niagara Falls, target: Canada   ==>   predicted: [' Niagara', '\r', '�', '�', '\x0b']
Valdemarsvik, target: Sweden   ==>   predicted: ['�', '\x0b', '�', '\r', '龍�']
Kyoto University, target: Japan   ==>   predicted: ['�', '\x0b', '\r', '龍�', '�']
Hattfjelldal, target: Norway   ==>   predicted: ['�', '\r', '�', '\x0b', '\x11']
Ginza, target: Japan   ==>   predicted: ['\x0b', '�', '\r', '�', '龍�']
Sydney Hospital, target: Australia   ==>   predicted: [' Sydney', ' NSW', '�', '\x0b', '\r']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Himal', ' Nepal', ' Bh', ' Hindu', ' Kashmir']
Higashikagawa, target: Japan   ==>   predicted: ['�', '\x0b', '\r', '�', 'rawdownload']
Trento, target: Italy   ==>   predicted: ['�', '\x0b', '\r', '\x1c', '�']
Taj Mahal, target: India   ==>   predicted: ['�', '\r', '\x0b', '\x08', '龍�']


In [None]:
# Norm of the difference between the biases of two operators estimated from
# different subjects with the same prompt template and layer.
(space_needle.bias - menangle.bias).norm()

tensor(269.9868, device='cuda:0')

## Difference from the usual calculation (full forward pass)

In [13]:
# Module name of the last transformer block (index n_layer - 1), used as a trace target below.
final_layer = f"transformer.h.{model.config.n_layer-1}"

In [29]:
prompt = "The Space Needle is located in the city of"

# Tokenize on the same device as the model weights.
tokenized = tokenizer(prompt, padding=True, return_tensors="pt").to(
    next(model.parameters()).device
)

# Record the outputs of the last transformer block and of ln_f during one
# forward pass. This pass is inspection-only, so run it under no_grad to avoid
# building an autograd graph for the whole model.
with torch.no_grad(), nethook.TraceDict(
    model, [final_layer, "transformer.ln_f"]
) as traces:
    output = model(**tokenized)

In [31]:
# Decode the traced output of the last transformer block at the final position;
# the helper applies ln_f before lm_head.
# NOTE(review): assumes output[0] is the hidden-state tensor of the block's
# output tuple, indexed [batch][position] — confirm.
get_vocab_representation(traces[final_layer].output[0][0][-1])

[' Seattle', ' Bellev', ' Portland', ' Tacoma', ' Lake']

In [40]:
# Decode the traced ln_f output at the final position; it is already
# layer-normed, so the helper's own ln_f step is skipped.
get_vocab_representation(traces["transformer.ln_f"].output[0][-1], perform_layer_norm= False)

[' Seattle', ' Bellev', ' Portland', ' Tacoma', ' Lake']

In [39]:
# Reference: top-5 next-token predictions taken directly from the model's own
# logits at the last position, for comparison with the two decodes above.
logits = output.logits[:, -1, :]
token_ids = logits.topk(dim=-1, k=5).indices.squeeze().tolist()
[tokenizer.decode(t) for t in token_ids]

[' Seattle', ' Bellev', ' Portland', ' Tacoma', ' Lake']

In [48]:
# For each test case, compare the hidden state estimated by the Space Needle
# operator (z_est) with the one the model actually produces (z) for the same
# relation prompt.
for subject, subject_token_index, target in test_cases:
    # In this cell the operator's result is unpacked as (objects, z_est).
    objects, z_est = space_needle(
        subject,
        subject_token_index=subject_token_index,
        device=model.device,
        return_top_k=5,
    )
    print(f"{subject}, target: {target}")

    prompt = "{} is located in the country of".format(subject)
    tokenized = tokenizer(prompt, padding=True, return_tensors="pt").to(next(model.parameters()).device)
    # Inspection-only forward pass: no_grad keeps each iteration from building
    # an autograd graph for the whole model.
    with torch.no_grad(), nethook.TraceDict(
        model, [final_layer, "transformer.ln_f"]
    ) as traces:
        output = model(**tokenized)
    # Traced output of the last transformer block at the final position.
    z = traces[final_layer].output[0][0][-1]

    print("z_ = ", get_vocab_representation(z))
    print("z_est = ", get_vocab_representation(z_est))
    print("Distance => ", torch.dist(z, z_est).item())
    print()
    

The Great Wall, target: China
z_ =  [' China', ' the', ' Xin', ' Yun', ' J']
z_est =  [' China', ' Hong', ' Beijing', ' Chinese', ' Shen']
Distance =>  257.2456970214844

Niagara Falls, target: Canada
z_ =  [' Ontario', ' Canada', ' New', ' Quebec', ' Newfoundland']
z_est =  [' Canada', ' Ontario', ' Niagara', ' New', ' British']
Distance =>  245.4635467529297

Valdemarsvik, target: Sweden
z_ =  [' Sweden', ' Norway', ' Denmark', ' Iceland', ' Finland']
z_est =  [' Iceland', ' Denmark', ' Sweden', ' Finland', ' Norway']
Distance =>  251.4376220703125

Kyoto University, target: Japan
z_ =  [' Japan', ' Kyoto', ' the', ' South', ' Okinawa']
z_est =  [' Japan', ' Japanese', ' Finland', ' Hawaii', ' Tokyo']
Distance =>  236.29151916503906

Hattfjelldal, target: Norway
z_ =  [' Norway', ' Iceland', ' Denmark', ' Sweden', ' Finland']
z_est =  [' Iceland', ' Denmark', ' Norway', ' Sweden', ' Finland']
Distance =>  270.375732421875

Ginza, target: Japan
z_ =  [' Japan', ' Tokyo', ' Gin', ' the