In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
import json
from tqdm import tqdm
import random

import os
import sys
import copy
sys.path.append('..')

from relations import estimate
from util import model_utils
import baukit
import transformers

## Load Model

In [3]:
######################################################################################################################
MODEL_NAME = "gpt2-xl" # options gpt2-{} | "EleutherAI/gpt-neox-20b" | "EleutherAI/gpt-j-6B"

# layer_name_format = "gpt_neox.layers.{}"
# final_layer_norm = "gpt_neox.final_layer_norm"
# unembed = "embed_out"
# num_layer_field = "num_hidden_layers"

layer_name_format = "transformer.h.{}"
final_layer_norm = "transformer.ln_f"
unembed = "lm_head"
num_layer_field = "n_layer"
######################################################################################################################



mt = model_utils.ModelAndTokenizer(MODEL_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float16)

model = mt.model
tokenizer = mt.tokenizer
tokenizer.pad_token = tokenizer.eos_token

print(f"{MODEL_NAME} ==> device: {model.device}, memory: {model.get_memory_footprint()}")

gpt2-xl ==> device: cuda:0, memory: 3215885792


## Load the later part of the model

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# path_name = "gpt2-medium__last_9_layers"
path_name = "gpt2-xl__last_25_layers"
# path_name = "EleutherAI/gpt-neox-20b__last_21_layers"

part_model = AutoModelForCausalLM.from_pretrained(
                path_name, low_cpu_mem_usage=True, torch_dtype=torch.float16
            )
part_model = part_model.eval().cuda('cuda:1')

part_layer_names = [layer_name_format.format(idx) for idx in range(getattr(part_model.config, num_layer_field))]
print(f"last {getattr(part_model.config, num_layer_field)} layers ==> device: {part_model.device}, memory: {part_model.get_memory_footprint()}")

last 25 layers ==> device: cuda:1, memory: 1753574450


In [5]:
#################################################################
break_layer_idx = getattr(model.config, num_layer_field) - getattr(part_model.config, num_layer_field)
#################################################################
break_layer_idx

23

### Setting `requires_grad=True`, we will need the grads to calculate the gradients later

In [6]:
def check_valid(module_name, prefix = "transformer.h", start_layer = 1):
    if(module_name in [final_layer_norm, unembed]):
        return True
    for idx in range(start_layer, mt.num_layers):
        if(module_name.startswith(f"{prefix}.{idx}")):
            return True
    return False

need_gradients = {
    n: p
    for n, p in part_model.named_parameters()
    if check_valid(
        n, prefix = layer_name_format[:-3]
    )
}

for n, w in part_model.named_parameters():
    if(n in need_gradients):
        w.requires_grad = True
    else:   
        w.requires_grad = False

## Checking Equivalence

In [7]:
prompt = ["The Space Needle is located in the country of"]

tokenized_inputs = tokenizer(
    prompt,
    padding = True,
    return_tensors="pt"
).to(next(model.parameters()).device)

break_layer_name = layer_name_format.format(break_layer_idx)
z_layer_name = layer_name_format.format(mt.num_layers-1)

with baukit.TraceDict(
    model, 
    mt.layer_names, # [h_layer_name, z_layer_name], 
    retain_input=True
) as traces:
    outputs = model(**tokenized_inputs)

In [8]:
def replace_first_layer_output(target):
    first_layer = layer_name_format.format(0)
    def edit_policy(output, layer_name):
        if(layer_name != first_layer):
            return output
        print(layer_name, " << original", break_layer_name)
        output[0][...] = target[0].to(part_model.device)
        output[1][0][...] = target[1][0].to(part_model.device)
        output[1][1][...] = target[1][1].to(part_model.device)
        return output

    return edit_policy

def untuple(x):
    if(type(x) is tuple):
        return x[0]
    return x

In [9]:
with baukit.TraceDict(
    part_model, 
    part_layer_names,
    retain_input=True,
    edit_output = replace_first_layer_output(
        target = traces[break_layer_name].output
    )
) as part_traces:
    part_outputs = part_model(
        input_ids = tokenized_inputs.input_ids.to(part_model.device),
        attention_mask = tokenized_inputs.attention_mask.to(part_model.device)
    )

transformer.h.0  << original transformer.h.23


In [10]:
# check logit difference
torch.dist(outputs.logits, part_outputs.logits.to(model.device))

tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)

In [11]:
logits = part_outputs.logits.to(model.device)
top_k = 5
softmax_out = torch.nn.functional.softmax(logits[:, -1, :], dim=1)

# Top-k sampling
tk = torch.topk(softmax_out, top_k, dim=1).indices
[
    tokenizer.decode(t) for t in tk[0]
]

[' Washington', ' Seattle', ' the', ' Japan', ' Canada']

In [12]:
# Check input difference

for idx in range(break_layer_idx, mt.num_layers):
    orig_input = traces[layer_name_format.format(idx)].input
    cur_input = part_traces[layer_name_format.format(idx - break_layer_idx)].input

    print(torch.dist(orig_input[0], cur_input[0].to(orig_input[0].device)))


tensor(4000., device='cuda:0', dtype=torch.float16)
tensor(0., device='cuda:0', dtype=torch.float16)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)
tensor(0., 

In [13]:
# Check output difference

for idx in range(break_layer_idx, mt.num_layers):

    original_layer = layer_name_format.format(idx)
    target_layer = layer_name_format.format(idx - break_layer_idx)
    print(original_layer, target_layer)

    orig_output = traces[original_layer].output
    cur_output = part_traces[target_layer].output

    print(
        torch.dist(orig_output[0], cur_output[0].to(orig_output[0].device)),
        torch.dist(orig_output[1][0], cur_output[1][0].to(orig_output[0].device)),
        torch.dist(orig_output[1][1], cur_output[1][1].to(orig_output[0].device))
    )
    print()

transformer.h.23 transformer.h.0
tensor(0., device='cuda:0', dtype=torch.float16) tensor(0., device='cuda:0', dtype=torch.float16) tensor(0., device='cuda:0', dtype=torch.float16)

transformer.h.24 transformer.h.1
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>) tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>) tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)

transformer.h.25 transformer.h.2
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>) tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>) tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)

transformer.h.26 transformer.h.3
tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>) tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>) tensor(0., device='cuda:0', dtype=torch.float16, grad_fn=<DistBackward0>)

transformer.h.27 transformer.h.4
tensor(0., device=

## Calculate Jacobians

### On a single GPU (<span style="color:red">! will get memory exceed error with `NeoX` !</span>)

In [14]:
h_token_index = 3
calculate_at_lnf = False
consider_residual = False

##################################
h_layer_idx = 27
##################################
h_layer_name = layer_name_format.format(h_layer_idx)
z_layer_name = mt.layer_names[-1]

h = traces[h_layer_name].output[0][0, h_token_index]
z = traces[z_layer_name].output[0][0, -1]

def compute_z_from_h(h: torch.Tensor) -> torch.Tensor:
    def insert_h(output: tuple, layer: str) -> tuple:
        if layer != h_layer_name:
            return output
        # print((output[0][0, h_token_index] - h).norm())
        output[0][0, h_token_index] = h
        return output

    with baukit.TraceDict(
        model, (h_layer_name, z_layer_name), edit_output=insert_h
    ) as ret:
        model(**tokenized_inputs)
    # print(z_layer_name, ret[z_layer_name].output[0][-1].shape)
    if(calculate_at_lnf == False):
        f_h = ret[z_layer_name].output[0][0, -1]
    else:
        f_h = ret[z_layer_name].output[0][-1]
    return f_h - h if consider_residual == True else f_h


weight = torch.autograd.functional.jacobian(compute_z_from_h, h, vectorize=True)

### Using two GPUs (<span style="color:green">! You will want to do this while working with `NeoX` !</span>)

In [15]:
h_layer_idx = 27
shifted__h_layer_idx = h_layer_idx - (len(mt.layer_names) - len(part_layer_names))
shifted__h_layer_idx

4

In [16]:
h_token_index = 3
calculate_at_lnf = False
consider_residual = False

first_layer = layer_name_format.format(0)
shifted__h_layer_name = layer_name_format.format(shifted__h_layer_idx)
shifted__z_layer_name = part_layer_names[-1]
h = part_traces[shifted__h_layer_name].output[0][0, h_token_index]
z = part_traces[shifted__z_layer_name].output[0][0, -1]

def compute_z_from_h(h: torch.Tensor) -> torch.Tensor:
    def replace_first_layer_output__and_insert_h(target):
        def edit_policy(output, layer_name):
            if(layer_name == first_layer):
                print(layer_name, " << original", break_layer_name)
                output[0][...] = target[0].to(part_model.device)
                output[1][0][...] = target[1][0].to(part_model.device)
                output[1][1][...] = target[1][1].to(part_model.device)
            if(layer_name == shifted__h_layer_name):
                print(f"replacing {shifted__h_layer_name} outputs")
                output[0][0, h_token_index] = h
            return output
        return edit_policy

    with baukit.TraceDict(
        part_model, 
        (first_layer, shifted__h_layer_name, shifted__z_layer_name),
        edit_output = replace_first_layer_output__and_insert_h(
            target = traces[break_layer_name].output
        )
    ) as ret:
        part_model(
            input_ids = tokenized_inputs.input_ids.to(part_model.device),
            attention_mask = tokenized_inputs.attention_mask.to(part_model.device)
        )
    if(calculate_at_lnf == False):
        f_h = ret[shifted__z_layer_name].output[0][0, -1]
    else:
        f_h = ret[shifted__z_layer_name].output[0][-1]

    return f_h - h if consider_residual == True else f_h

In [17]:
def calculate_jacobian(function, h):
    h.retain_grad()
    z_est = function(h)
    jacobian = []
    print("Calculating Jacobians ...")
    for idx in tqdm(range(h.shape[0])):
        part_model.zero_grad()
        z_est[idx].backward(retain_graph=True)
        jacobian.append(copy.deepcopy(h.grad))
        h.grad.zero_()
    return torch.stack(jacobian)

h = part_traces[shifted__h_layer_name].output[0][0, h_token_index]
J = calculate_jacobian(compute_z_from_h, h)

transformer.h.0  << original transformer.h.23
replacing transformer.h.4 outputs
Calculating Jacobians ...


100%|██████████| 1600/1600 [00:48<00:00, 32.91it/s]


In [18]:
weight.shape, J.shape

(torch.Size([1600, 1600]), torch.Size([1600, 1600]))

#### There are some slight difference when the Jacobian weights are calculated using dual GPU!

In [19]:
torch.dist(weight, J.to(model.device))

tensor(0.0339, device='cuda:0', dtype=torch.float16)

In [20]:
torch.dist(weight[17], J[17].to(model.device))

tensor(0.0008, device='cuda:0', dtype=torch.float16)

## Checking the Relation Operator

In [21]:
test_cases = [
    ("The Space Needle", -1, "United States"),
    ("The Great Wall", -1, "China"),
    ("Niagara Falls", -2, "Canada"),
    ("Valdemarsvik", -1, "Sweden"),
    ("Kyoto University", -2, "Japan"),
    ("Hattfjelldal", -1, "Norway"),
    ("Ginza", -1, "Japan"),
    ("Sydney Hospital", -2, "Australia"),
    ("Mahalangur Himal", -1, "Nepal"),
    ("Higashikagawa", -1, "Japan"),
    ("Trento", -1, "Italy"),
    ("Taj Mahal", -1, "India")
]

def evaluate_against_test_cases(relation):
    for subject, subject_token_index, target in test_cases:
        objects = relation(
            subject,
            subject_token_index=subject_token_index,
            device=model.device,
            return_top_k=5,
        )
        print(f"{subject}, target: {target}   ==>   predicted: {objects}")

### <span style="color:red"> will not work with `NeoX` </span>

In [32]:
space_needle_1 = estimate.estimate_relation_operator(
    model, tokenizer,
    "The Space Needle",
    "{} is located in the country of",
    layer=h_layer_idx,
    device= model.device
)

In [33]:
evaluate_against_test_cases(space_needle_1)

The Space Needle, target: United States   ==>   predicted: [' Washington', ' Seattle', ' the', ' Japan', ' Canada']
The Great Wall, target: China   ==>   predicted: [' China', ' Hong', ' Beijing', ' Chinese', ' Shen']
Niagara Falls, target: Canada   ==>   predicted: [' Canada', ' Niagara', ' Ontario', ' New', ' Newfoundland']
Valdemarsvik, target: Sweden   ==>   predicted: [' Iceland', ' Norway', ' Sweden', ' Finland', ' Alaska']
Kyoto University, target: Japan   ==>   predicted: [' Japan', ' Japanese', ' Tokyo', ' Hawaii', ' Hawai']
Hattfjelldal, target: Norway   ==>   predicted: [' Iceland', ' Norway', ' Sweden', ' Denmark', ' Finland']
Ginza, target: Japan   ==>   predicted: [' Japan', ' Tokyo', ' Japanese', ' China', ' Singapore']
Sydney Hospital, target: Australia   ==>   predicted: [' Australia', ' Sydney', ' Australian', ' Singapore', ' NSW']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Nepal', ' Bh', ' Tibet', ' India', ' Nep']
Higashikagawa, target: Japan   ==>   pred

In [34]:
space_needle_2 = estimate.estimate_relation_operator_neox(
    model, part_model,
    tokenizer,
    "The Space Needle",
    "{} is located in the country of",
    layer=h_layer_idx,

    layer_name_format = layer_name_format,
    final_layer_norm = final_layer_norm,
    unembed = unembed,
)

prompt >>  The Space Needle is located in the country of
h_token_idx >>  3
transformer.h.0  << original transformer.h.23
replacing transformer.h.4 outputs
Calculating Jacobians ...


100%|██████████| 1600/1600 [00:43<00:00, 37.04it/s]


In [35]:
evaluate_against_test_cases(space_needle_2)

The Space Needle, target: United States   ==>   predicted: [' Washington', ' Seattle', ' the', ' Japan', ' Canada']
The Great Wall, target: China   ==>   predicted: [' China', ' Hong', ' Beijing', ' Chinese', ' Shen']
Niagara Falls, target: Canada   ==>   predicted: [' Canada', ' Niagara', ' Ontario', ' New', ' Newfoundland']
Valdemarsvik, target: Sweden   ==>   predicted: [' Iceland', ' Norway', ' Sweden', ' Finland', ' Alaska']
Kyoto University, target: Japan   ==>   predicted: [' Japan', ' Japanese', ' Tokyo', ' Hawaii', ' Hawai']
Hattfjelldal, target: Norway   ==>   predicted: [' Iceland', ' Norway', ' Sweden', ' Denmark', ' Finland']
Ginza, target: Japan   ==>   predicted: [' Japan', ' Tokyo', ' Japanese', ' China', ' Singapore']
Sydney Hospital, target: Australia   ==>   predicted: [' Australia', ' Sydney', ' Australian', ' Singapore', ' NSW']
Mahalangur Himal, target: Nepal   ==>   predicted: [' Nepal', ' Bh', ' Tibet', ' India', ' Nep']
Higashikagawa, target: Japan   ==>   pred

In [36]:
torch.dist(space_needle_1.weight, space_needle_2.weight.to(model.device))

tensor(0.0339, device='cuda:0', dtype=torch.float16)

In [44]:
torch.dist(space_needle_1.bias, space_needle_2.bias.to(model.device))

tensor(0.1179, device='cuda:0', dtype=torch.float16)

In [42]:
row_wise_diff = np.array([
    torch.dist(space_needle_1.weight[r].to('cpu'), space_needle_2.weight[r].to('cpu'))
    for r in range(space_needle_1.weight.shape[0])
])

f"Jacobian row-wise difference >> {row_wise_diff.mean()} +/- {row_wise_diff.std()}"

'Jacobian row-wise difference >> 7.987022399902344e-06 +/- 0.0'

#### All the rows in the Jacobian matrices has the same differnece. I don't know if there are any explanation for that.
#### However, it doesn't seem like this difference has any effect on the generated tokens