In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
import json
from tqdm.auto import tqdm
import random
import transformers

import os
import sys
sys.path.append('..')

from relations import estimate
from util import model_utils
from baukit import nethook
from operator import itemgetter

In [12]:
MODEL_NAME = "facebook/galactica-6.7b" # "EleutherAI/gpt-j-6B"
n_embd_field = "hidden_size"

mt = model_utils.ModelAndTokenizer(MODEL_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float16)

model = mt.model
tokenizer = mt.tokenizer
# tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

print(f"{MODEL_NAME} ==> device: {model.device}, memory: {model.get_memory_footprint()}")

facebook/galactica-6.7b ==> device: cuda:0, memory: 13314719744


In [13]:
presidents = []
with open("data/list_of_potus.txt") as f:
    lines = f.readlines()
    presidents = [p.strip() for p in lines[1:len(lines):3]]

presidents

['George Washington',
 'John Adams',
 'Thomas Jefferson',
 'James Madison',
 'James Monroe',
 'John Quincy Adams',
 'Andrew Jackson',
 'Martin Van Buren',
 'William Henry Harrison',
 'John Tyler',
 'James K. Polk',
 'Zachary Taylor',
 'Millard Fillmore',
 'Franklin Pierce',
 'James Buchanan',
 'Abraham Lincoln',
 'Andrew Johnson',
 'Ulysses S. Grant',
 'Rutherford B. Hayes',
 'James Garfield',
 'Chester A. Arthur',
 'Grover Cleveland',
 'Benjamin Harrison',
 'Grover Cleveland',
 'William McKinley',
 'Theodore Roosevelt',
 'William Howard Taft',
 'Woodrow Wilson',
 'Warren G. Harding',
 'Calvin Coolidge',
 'Herbert Hoover',
 'Franklin D. Roosevelt',
 'Harry S. Truman',
 'Dwight D. Eisenhower',
 'John F. Kennedy',
 'Lyndon B. Johnson',
 'Richard M. Nixon',
 'Gerald R. Ford',
 'James Carter',
 'Ronald Reagan',
 'George H. W. Bush',
 'William J. Clinton',
 'George W. Bush',
 'Barack Obama',
 'Donald Trump',
 'Joe Biden']

In [14]:
# relation_prompt = "was succeeded by"
# prev_presidents = random.sample(range(len(presidents)-1), k = 3)

# icl_prompt = ""
# for p in prev_presidents:
#     icl_prompt += f"{presidents[p]} {relation_prompt} {presidents[p+1]}\n"
# icl_prompt += "{} " + relation_prompt
# print(icl_prompt)

icl_prompt = """Richard M. Nixon was succeeded by Gerald R. Ford
Woodrow Wilson was succeeded by Warren G. Harding
James Carter was succeeded by Ronald Reagan
{} was succeeded by"""

In [15]:
filter_by_model_knowledge = []
for prev, nxt in zip(presidents[:-1], presidents[1:]):
    txt, ret_dict = model_utils.generate_fast(
        model, tokenizer, 
        prompts=[icl_prompt.format(prev)], max_new_tokens=10, 
        get_answer_tokens=True, argmax_greedy=True
    )

    tick = nxt.startswith(ret_dict['answer'][0]['top_token'].strip())
    print(f"{prev} >> {nxt} ===> {[(ans['token'], ans['p']) for ans in ret_dict['answer'][0]['candidates']]} :: {tick}")
    if(tick):
        filter_by_model_knowledge.append((prev, nxt))


George Washington >> John Adams ===> [(' Abraham', 0.4851), (' Theod', 0.1306), (' Thomas', 0.0683), (' John', 0.0371), (' Dw', 0.0312)] :: False
John Adams >> Thomas Jefferson ===> [(' Ly', 0.1379), (' John', 0.1041), (' Harry', 0.0798), (' Theod', 0.0786), (' Franklin', 0.0773)] :: False
Thomas Jefferson >> James Madison ===> [(' James', 0.1458), (' Theod', 0.1277), (' George', 0.0986), (' John', 0.0986), (' Abraham', 0.0898)] :: True
James Madison >> James Monroe ===> [(' Abraham', 0.1854), (' Theod', 0.1586), (' John', 0.0977), (' Wood', 0.0732), (' William', 0.0693)] :: False
James Monroe >> John Quincy Adams ===> [(' Theod', 0.2141), (' Harry', 0.1309), (' William', 0.1219), (' Herbert', 0.106), (' Cal', 0.0459)] :: False
John Quincy Adams >> Andrew Jackson ===> [(' Theod', 0.1938), (' William', 0.1261), (' Franklin', 0.0967), (' Herbert', 0.0629), (' Ly', 0.062)] :: False
Andrew Jackson >> Martin Van Buren ===> [(' Bill', 0.2067), (' Ly', 0.1177), (' John', 0.0714), (' Jimmy', 0

In [16]:
len(filter_by_model_knowledge)

15

In [12]:
objects = [" " + o[1] for o in filter_by_model_knowledge]

from relations.corner import CornerEstimator
corner_estimator = CornerEstimator(
    model=model, tokenizer=tokenizer,
    ln_f_name= "model.decoder.final_layer_norm", 
    unembedder_module_name="lm_head"
)

In [15]:
simple_corner = corner_estimator.estimate_simple_corner(objects, scale_up=70)
print(simple_corner.norm().item(), corner_estimator.get_vocab_representation(simple_corner, get_logits=True))

28.171875 [(' bird', 39.188), (' plant', 37.25), (' fish', 35.812), (' food', 33.125), (' game', 32.438)]


In [16]:
lin_inv_corner = corner_estimator.estimate_lin_inv_corner(objects, target_logit_value=50)
print(lin_inv_corner.norm().item(), corner_estimator.get_vocab_representation(lin_inv_corner, get_logits=True))

calculating inverse of unbedding weights . . .
18.265625 [(' plant', 23.969), (' bird', 22.859), (' game', 21.344), (' person', 20.828), (' drug', 20.594)]


In [17]:
lst_sq_corner = corner_estimator.estimate_corner_lstsq_solve(objects, target_logit=50)
print(lst_sq_corner.norm().item(), corner_estimator.get_vocab_representation(lst_sq_corner, get_logits=True))

131.875 [(' galaxy', 24.406), (' material', 24.266), (' group', 24.266), (' star', 24.266), (' human', 24.266)]


In [18]:
# avg_corner = corner_estimator.estimate_average_corner_with_gradient_descent(objects, average_on=5, target_logit_value=50, verbose=False)
# print(avg_corner.norm().item(), corner_estimator.get_vocab_representation(avg_corner))

In [19]:
def check_with_test_cases(relation_operator):
    test_cases = [
        (b, -1, h) for b, h in filter_by_model_knowledge[20:]
    ]
    for subject, subject_token_index, target in test_cases:
        answer = relation_operator(
            subject,
            subject_token_index=subject_token_index,
            device=model.device,
            return_top_k=5,
        )
        print(f"{subject}, target: {target}   ==>   predicted: {answer}")

In [21]:
relation = estimate.RelationOperator(
    model = model,
    tokenizer = tokenizer,
    relation = prompt,
    layer = 15,
    weight = torch.eye(getattr(model.config, n_embd_field)).to(model.dtype).to(model.device),
    bias = lst_sq_corner,

    layer_name_format = "model.decoder.layers.{}",
    ln_f_name = "model.decoder.final_layer_norm"
)
check_with_test_cases(relation)

summer, target: season   ==>   predicted: [' season', ' group', ' color', ' wind', ' plant']
meat, target: food   ==>   predicted: [' science', ' group', ' color', ' plant', ' food']
doll, target: toy   ==>   predicted: [' tree', ' shape', ' fish', ' star', ' group']
gold, target: metal   ==>   predicted: [' metal', ' tree', ' wind', ' star', ' color']
round, target: shape   ==>   predicted: [' shape', ' plant', ' wind', ' food', ' color']
breeze, target: wind   ==>   predicted: [' color', ' tree', ' metal', ' plant', ' season']
man, target: human   ==>   predicted: [' group', ' person', ' plant', ' color', ' food']
hologram, target: picture   ==>   predicted: [' metal', ' color', ' device', ' tree', ' plant']
paper, target: material   ==>   predicted: [' science', ' plant', ' wind', ' material', ' group']
photographer, target: person   ==>   predicted: [' group', ' fish', ' tree', ' game', ' drug']
documentary, target: film   ==>   predicted: [' film', ' material', ' science', ' group

In [22]:
def get_averaged_JB(top_performers, relation_prompt, num_icl = 3, calculate_at_lnf = False):
    try:
        jbs = []
        for s, s_idx, o in tqdm(top_performers):
            others = set(top_performers) - {(s, s_idx, o)}
            others = random.sample(list(others), k = min(num_icl, len(list(others)))) 
            prompt = ""
            prompt += "\n".join(relation_prompt.format(s_other) + f" {o_other}." for s_other, idx_other, o_other in others) + "\n"
            prompt += relation_prompt
            print("subject: ", s)
            print(prompt)

            jb, _ = estimate.relation_operator_from_sample(
                model, tokenizer,
                s, prompt,
                subject_token_index= s_idx,
                layer = 15,
                device = model.device,
                # calculate_at_lnf = calculate_at_lnf

                layer_name_format = "model.decoder.layers.{}",
                ln_f_name = "model.decoder.final_layer_norm",
                n_layer_field = "num_hidden_layers"
            )
            print(jb.weight.norm(), jb.bias.norm())
            print()
            jbs.append(jb)
        
        weight = torch.stack([jb.weight for jb in jbs]).mean(dim=0)
        bias  = torch.stack([jb.bias for jb in jbs]).mean(dim=0)

        return weight, bias
    except RuntimeError as e:
        if(str(e).startswith("CUDA out of memory")):
            print("CUDA out of memory")
        if(num_icl > 1):
            num_icl -= 1
            print("trying with smaller icl >> ", num_icl)
            return get_averaged_JB(top_performers, relation_prompt, num_icl, calculate_at_lnf)
        else:
            raise Exception("RuntimeError >> can't calculate Jacobian with minimum number of icl examples")

def get_multiple_averaged_JB(top_performers, relation_prompt, N = 3, num_icl = 2, calculate_at_lnf = False):
    weights_and_biases = []
    sample_size = min(len(top_performers), num_icl + 2)
    for _ in tqdm(range(N)):
        cur_sample = random.sample(top_performers, k = sample_size)
        weight, bias = get_averaged_JB(cur_sample, relation_prompt, num_icl, calculate_at_lnf)
        weights_and_biases.append({
            'weight': weight,
            'bias'  : bias
        })
    return weights_and_biases

In [23]:
samples = [
        (b, -1, h) for b, h in filter_by_model_knowledge[:20]
    ]
print(samples)

weights_and_biases = get_multiple_averaged_JB(
    samples, 
    relation_prompt=" {} is a", 
    N = 3, 
    calculate_at_lnf=False
)

[('oak', -1, 'tree'), ('diamond', -1, 'gem'), ('happiness', -1, 'feeling'), ('family', -1, 'group'), ('thesaurus', -1, 'dictionary'), ('crow', -1, 'bird'), ('tennis', -1, 'sport'), ('salmon', -1, 'fish'), ('flower', -1, 'plant'), ('rosemary', -1, 'herb'), ('cucumber', -1, 'vegetable'), ('roulette', -1, 'game'), ('physics', -1, 'science'), ('earth', -1, 'planet'), ('sun', -1, 'star'), ('coffee', -1, 'beverage'), ('car', -1, 'vehicle'), ('yellow', -1, 'color'), ('fan', -1, 'device'), ('judaism', -1, 'religion')]


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

subject:  cucumber
 family is a group.
 roulette is a game.
 {} is a
tensor(43.0938, device='cuda:0', dtype=torch.float16) tensor(248.8750, device='cuda:0', dtype=torch.float16)

subject:  roulette
 coffee is a beverage.
 cucumber is a vegetable.
 {} is a
tensor(57.5938, device='cuda:0', dtype=torch.float16) tensor(274.2500, device='cuda:0', dtype=torch.float16)

subject:  family
 roulette is a game.
 coffee is a beverage.
 {} is a
tensor(46.7500, device='cuda:0', dtype=torch.float16) tensor(253.7500, device='cuda:0', dtype=torch.float16)

subject:  coffee
 roulette is a game.
 cucumber is a vegetable.
 {} is a
tensor(40.3438, device='cuda:0', dtype=torch.float16) tensor(264., device='cuda:0', dtype=torch.float16)



  0%|          | 0/4 [00:00<?, ?it/s]

subject:  happiness
 roulette is a game.
 yellow is a color.
 {} is a
tensor(38., device='cuda:0', dtype=torch.float16) tensor(259.7500, device='cuda:0', dtype=torch.float16)

subject:  roulette
 happiness is a feeling.
 yellow is a color.
 {} is a
tensor(53.2812, device='cuda:0', dtype=torch.float16) tensor(281.2500, device='cuda:0', dtype=torch.float16)

subject:  yellow
 fan is a device.
 roulette is a game.
 {} is a
tensor(43.8750, device='cuda:0', dtype=torch.float16) tensor(227.2500, device='cuda:0', dtype=torch.float16)

subject:  fan
 roulette is a game.
 happiness is a feeling.
 {} is a
tensor(52.8750, device='cuda:0', dtype=torch.float16) tensor(256.2500, device='cuda:0', dtype=torch.float16)



  0%|          | 0/4 [00:00<?, ?it/s]

subject:  happiness
 cucumber is a vegetable.
 diamond is a gem.
 {} is a
tensor(40.4375, device='cuda:0', dtype=torch.float16) tensor(250.3750, device='cuda:0', dtype=torch.float16)

subject:  diamond
 earth is a planet.
 happiness is a feeling.
 {} is a
tensor(45.1875, device='cuda:0', dtype=torch.float16) tensor(254.3750, device='cuda:0', dtype=torch.float16)

subject:  cucumber
 earth is a planet.
 happiness is a feeling.
 {} is a
tensor(40.7812, device='cuda:0', dtype=torch.float16) tensor(249.6250, device='cuda:0', dtype=torch.float16)

subject:  earth
 cucumber is a vegetable.
 diamond is a gem.
 {} is a
tensor(49.3125, device='cuda:0', dtype=torch.float16) tensor(235.6250, device='cuda:0', dtype=torch.float16)



In [24]:
relation_operator = estimate.RelationOperator(
    model = model,
    tokenizer= tokenizer,
    relation = prompt,
    layer = 15,
    weight = torch.stack(
        [wb['weight'] for wb in weights_and_biases]
    ).mean(dim=0),
    # bias = torch.stack(
    #     [wb['bias'] for wb in weights_and_biases]
    # ).mean(dim=0),
    bias = lst_sq_corner,

    layer_name_format = "model.decoder.layers.{}",
    ln_f_name = "model.decoder.final_layer_norm",
)

check_with_test_cases(relation_operator)

summer, target: season   ==>   predicted: [' season', ' color', ' wind', ' herb', ' plant']
meat, target: food   ==>   predicted: [' food', ' dish', ' vegetable', ' material', ' meat']
doll, target: toy   ==>   predicted: [' toy', ' shape', ' picture', ' bird', ' person']
gold, target: metal   ==>   predicted: [' metal', ' material', ' gem', ' color', ' religion']
round, target: shape   ==>   predicted: [' shape', ' sport', ' season', ' wind', ' galaxy']
breeze, target: wind   ==>   predicted: [' wind', ' season', ' feeling', ' herb', ' color']
man, target: human   ==>   predicted: [' human', ' person', ' vehicle', ' fish', ' sport']
hologram, target: picture   ==>   predicted: [' device', ' picture', ' material', ' science', ' film']
paper, target: material   ==>   predicted: [' material', ' wind', ' dish', ' dictionary', ' vehicle']
photographer, target: person   ==>   predicted: [' person', ' science', ' star', ' human', ' sport']
documentary, target: film   ==>   predicted: [' film

In [25]:
corner_estimator.get_vocab_representation(
    torch.stack(
        [wb['bias'] for wb in weights_and_biases]
    ).mean(dim=0)
)

[' type', ' kind', ' good', ' ', ' very']