In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
import json
from tqdm.auto import tqdm
import random
import transformers

import os
import sys
sys.path.append('..')

from relations import estimate
from util import model_utils
from baukit import nethook
from operator import itemgetter

In [4]:
MODEL_NAME = "EleutherAI/gpt-j-6B" # "facebook/galactica-6.7b"  # gpt2-{medium,large,xl} or EleutherAI/gpt-j-6B
n_embd_field = "hidden_size"
mt = model_utils.ModelAndTokenizer(MODEL_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32)

model = mt.model
tokenizer = mt.tokenizer
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

print(f"{MODEL_NAME} ==> device: {model.device}, memory: {model.get_memory_footprint()}")

EleutherAI/gpt-j-6B ==> device: cuda:0, memory: 24320971760


In [100]:
import baukit

def untuple(x):
    if(isinstance(x, tuple)):
        return x[0]
    return x

def extract_zh(subject, relation, layer, subject_token_index = -1):
    prompt = relation.format(subject)
    inputs = tokenizer(
        prompt, return_tensors="pt", return_offsets_mapping=True
    ).to(model.device)
    offset_mapping = inputs.pop("offset_mapping")
    subject_i, subject_j = estimate._find_token_range(
        prompt, subject, offset_mapping=offset_mapping[0]
    )

    h_token_index = estimate._determine_token_index(subject_i, subject_j, subject_token_index)
    h_layer_name = f"transformer.h.{layer}"
    z_layer_name = f"transformer.h.{model.config.n_layer-1}"
    with baukit.TraceDict(
        model, [h_layer_name, z_layer_name], retain_input=True
    ) as traces:
        model(**inputs)
    
    ret_dict = {
        "h_token_index": h_token_index
    }

    print(untuple(traces[h_layer_name].input).norm())
    return (
        untuple(traces[h_layer_name].output)[0, h_token_index], 
        untuple(traces[z_layer_name].output)[0, -1]
    ), ret_dict

In [159]:
######################################################################################
# relation = "{} is located in the country of"
# subject_o = "Niagara Falls"
# subject_c = "The Great Wall"
relation = '''superlative of large is largest
 superlative of quick is quickest
 superlative of {} is
'''
subject_o = "tough"
subject_c = "safe"
layer = 27
######################################################################################

prompts = [
    relation.format(subject_o),
    relation.format(subject_c)
]

txt, ret_dict = model_utils.generate_fast(
    model, tokenizer,
    prompts=prompts,
    argmax_greedy=True,
    get_answer_tokens= True,
    max_new_tokens=5
)
txt

['superlative of large is largest  superlative of quick is quickest  superlative of tough is  toughest  superlative',
 'superlative of large is largest  superlative of quick is quickest  superlative of safe is  safest  superlative']

In [160]:
ret_dict["answer"]

[{'top_token': ' toughest',
  'candidates': [{'token': ' toughest', 'token_id': 28212, 'p': 0.4606},
   {'token': ' strongest', 'token_id': 12841, 'p': 0.0774},
   {'token': ' tough', 'token_id': 5802, 'p': 0.0537},
   {'token': ' ', 'token_id': 220, 'p': 0.0513},
   {'token': '\n', 'token_id': 198, 'p': 0.0512}]},
 {'top_token': ' safest',
  'candidates': [{'token': ' safest', 'token_id': 33630, 'p': 0.5988},
   {'token': ' safe', 'token_id': 3338, 'p': 0.0967},
   {'token': ' super', 'token_id': 2208, 'p': 0.0513},
   {'token': '\n', 'token_id': 198, 'p': 0.0273},
   {'token': ' safer', 'token_id': 14178, 'p': 0.027}]}]

In [161]:
tokenizer([" " + subject_o, " " + subject_c], padding = True, return_tensors="pt").to(model.device)

{'input_ids': tensor([[5802],
        [3338]], device='cuda:0'), 'attention_mask': tensor([[1],
        [1]], device='cuda:0')}

In [162]:
(h_o, z_o), ret_dict = extract_zh(
    subject = subject_o, relation=relation,
    layer = layer
)

h_o.shape, z_o.shape, ret_dict

tensor(1965.3038, device='cuda:0')


(torch.Size([4096]), torch.Size([4096]), {'h_token_index': 20})

In [163]:
(h_c, z_c), ret_dict = extract_zh(
    subject = subject_c, relation=relation,
    layer = layer
)

h_c.shape, z_c.shape, ret_dict

tensor(1965.8682, device='cuda:0')


(torch.Size([4096]), torch.Size([4096]), {'h_token_index': 20})

In [164]:
from relations.corner import CornerEstimator

corner_estimator = CornerEstimator(model, tokenizer)

In [165]:
corner_estimator.get_vocab_representation(z_o, get_logits=True)

[(' toughest', 16.008),
 (' strongest', 14.224),
 (' tough', 13.859),
 (' ', 13.813),
 ('\n', 13.811)]

In [166]:
corner_estimator.get_vocab_representation(z_c, get_logits=True)

[(' safest', 16.283),
 (' safe', 14.46),
 (' super', 13.826),
 ('\n', 13.193),
 (' safer', 13.185)]

In [167]:
def replace_latent(h_r, int_layer, int_token):
    def intervention(output, layer):
        print(layer, int_layer)
        if(layer != int_layer):
            return output
        print(int_layer, " >> ", output[0].shape, h_r.shape)
        output[0][0, int_token] = h_r
        return output
    return intervention

In [168]:
prompt = relation.format(subject_c)
tokenized = tokenizer(
    prompt, return_tensors="pt",
).to(model.device)

intervention_layer = f"transformer.h.{layer}"
final_layer = f"transformer.h.{model.config.n_layer - 1}"

with baukit.TraceDict(
    model, [intervention_layer, final_layer], 
    edit_output= replace_latent(h_r=h_o, int_layer = intervention_layer, int_token=20)
) as traces:
    output = model(**tokenized)

transformer.h.27 transformer.h.27
transformer.h.27  >>  torch.Size([1, 23, 4096]) torch.Size([4096])


In [169]:
corner_estimator.get_vocab_representation(untuple(traces[final_layer].output)[0, -1], get_logits=True)

[(' safest', 16.283),
 (' safe', 14.46),
 (' super', 13.826),
 ('\n', 13.193),
 (' safer', 13.185)]