In [1]:
# Enable IPython's autoreload extension (mode 2) so edits to imported project
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import copy
import json
import os
import random
import sys

sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm.auto import tqdm

from src import models, data

In [3]:
# Load GPT-J on the first CUDA device, then report the dtype, device and
# memory footprint of the underlying model object.
device = "cuda:0"
mt = models.load_model("gptj", device=device)
print(
    f"dtype: {mt.model.dtype}, "
    f"device: {mt.model.device}, "
    f"memory: {mt.model.get_memory_footprint()}"
)

dtype: torch.float16, device: cuda:0, memory: 12219206136


In [4]:
from src.lens import interpret_logits, logit_lens
from src.functional import untuple

# Run the prompt through the model once while baukit traces every layer path
# reported by `models.determine_layer_paths`; `traces` is reused by later cells.
prompt = "Prudential Tower is located in the city of"
tokenized = mt.tokenizer(prompt, return_tensors="pt", padding=True).to(mt.model.device)

import baukit

with baukit.TraceDict(mt.model, models.determine_layer_paths(mt)) as traces:
    output = mt.model(**tokenized)

# Decode the logits of the first batch row at the last position.
interpret_logits(mt, output.logits[0, -1], get_proba=True)

[(' Chicago', 0.51),
 (' Newark', 0.174),
 (' Boston', 0.069),
 (' San', 0.031),
 (' Houston', 0.014),
 (' Minneapolis', 0.011),
 (' Jersey', 0.011),
 (' London', 0.011),
 (' Detroit', 0.01),
 (' Baltimore', 0.009)]

In [5]:
# Words whose probability should be read off explicitly from the hidden state.
interested_words = [" Seattle", " Paris", " Dhaka"]
int_tokenized = mt.tokenizer(interested_words, return_tensors="pt", padding=True).to(
    mt.model.device
)

# Output of the last traced layer; `[0][-1]` presumably selects batch row 0 at
# the final prompt position (the printed shape is a single hidden vector).
z = untuple(traces[models.determine_layer_paths(mt)[-1]].output)[0][-1]
print(z.shape)

# Only the FIRST token id of each word is probed, so a word that the tokenizer
# splits into several pieces is represented by its leading piece alone
# (" Dhaka" is reported as " Dh").
# NOTE(review): taking `t[0]` from the padded batch assumes padding is added on
# the right; with left padding `t[0]` would be a pad token for shorter words.
logit_lens(mt, z, [t[0] for t in int_tokenized.input_ids], get_proba=True)

torch.Size([4096])


([(' Chicago', 0.514),
  (' Newark', 0.172),
  (' Boston', 0.068),
  (' San', 0.031),
  (' Houston', 0.014),
  (' Minneapolis', 0.011),
  (' Jersey', 0.011),
  (' London', 0.011),
  (' Detroit', 0.01),
  (' Baltimore', 0.009)],
 {tensor(7312, device='cuda:0'): (0.0017547607421875, ' Seattle'),
  tensor(6342, device='cuda:0'): (3.3974647521972656e-06, ' Paris'),
  tensor(20529, device='cuda:0'): (9.1552734375e-05, ' Dh')})

### $F(\mathbf{h}_{s}) = \mathbf{h}_{s}$: setting $W_{r} = I$ and $b_{r} = \mathbf{0}$ reduces the operator to logit lens

In [6]:
from src.operators import LinearRelationOperator

# Relation operator without a learned map: reads h and z at the last layer.
logit_lens_operator = LinearRelationOperator(
    mt=mt,
    h_layer=-1,
    # basically logit lens if both weight and bias are set to None
    weight=None,
    bias=None,
    prompt_template="{} is located in the city of",
    z_layer=-1,
)

In [7]:
# `h` is supplied explicitly: `z` was captured from the "Prudential Tower"
# prompt, so presumably the hidden state (not the subject string) drives the
# prediction here -- the output matches the Prudential Tower distribution.
logit_lens_operator(subject="The Space Needle", k=10, h=z)

LinearRelationOutput(predictions=[PredictedToken(token=' Chicago', prob=0.5139787793159485), PredictedToken(token=' Newark', prob=0.17216132581233978), PredictedToken(token=' Boston', prob=0.06848104298114777), PredictedToken(token=' San', prob=0.030866824090480804), PredictedToken(token=' Houston', prob=0.013804498128592968), PredictedToken(token=' Minneapolis', prob=0.011266903951764107), PredictedToken(token=' Jersey', prob=0.011092226952314377), PredictedToken(token=' London', prob=0.010501908138394356), PredictedToken(token=' Detroit', prob=0.010178797878324986), PredictedToken(token=' Baltimore', prob=0.00850470457226038)], h=tensor([-1.4648,  0.7959, -0.9663,  ..., -0.6025,  0.8594, -4.9844],
       device='cuda:0', dtype=torch.float16, grad_fn=<SelectBackward0>), z=tensor([-1.4648,  0.7959, -0.9663,  ..., -0.6025,  0.8594, -4.9844],
       device='cuda:0', dtype=torch.float16, grad_fn=<SelectBackward0>))

In [8]:
# Fetch the model's `lm_head` module by name and display the shape of its
# weight matrix.
unembedding = baukit.nethook.get_module(mt.model, "lm_head")
unembedding.weight.shape

torch.Size([50400, 4096])

In [9]:
# Feed the `lm_head` weight row of the first token of " Chicago" to the
# operator as the hidden state; `h` is given explicitly, so the subject string
# is only a placeholder here.
subject = " Chicago"
emb_subject = unembedding.weight[mt.tokenizer(subject).input_ids[0]]
logit_lens_operator(subject="Whatever", k=10, h=emb_subject)

LinearRelationOutput(predictions=[PredictedToken(token=' Chicago', prob=1.0), PredictedToken(token='Chicago', prob=6.665581192860808e-23), PredictedToken(token=' Detroit', prob=3.3952676397964373e-28), PredictedToken(token=' Illinois', prob=2.333531109663677e-28), PredictedToken(token=' Boston', prob=5.542621749889874e-29), PredictedToken(token=' Milwaukee', prob=4.0150617628607155e-30), PredictedToken(token=' Philadelphia', prob=3.1269332749387515e-30), PredictedToken(token=' Seattle', prob=6.157297243929202e-31), PredictedToken(token=' Toronto', prob=3.096083083978257e-31), PredictedToken(token=' Atlanta', prob=1.3738785357297722e-31)], h=tensor([-0.0243, -0.0335, -0.0092,  ..., -0.0075,  0.0104,  0.0171],
       device='cuda:0', dtype=torch.float16, grad_fn=<SelectBackward0>), z=tensor([-0.0243, -0.0335, -0.0092,  ..., -0.0075,  0.0104,  0.0171],
       device='cuda:0', dtype=torch.float16, grad_fn=<SelectBackward0>))

### Loading dataset

In [10]:
from src.data import load_dataset
# Load every relation, keep the one named "country capital city", and split it
# into a 10-sample training part and a held-out remainder.
dataset = load_dataset()
cur_relation = [rel for rel in dataset if rel.name == "country capital city"][0]
train, test = cur_relation.split(size=10)
len(train.samples), len(test.samples)

(10, 14)

### ICL-Mean, our flagship method

In [11]:
from src.operators import JacobianIclMeanEstimator

# Build the ICL-Mean estimator (h read at layer 12, beta 0.5) and estimate an
# operator from the training samples.
mean_estimator = JacobianIclMeanEstimator(mt=mt, h_layer=12, beta=0.5)

icl_mean = mean_estimator(train)

relation has > 1 prompt_templates, will use first (The capital city of {} is)


### Learned Linear Model baseline

In [12]:
from src.operators import LearnedLinearEstimatorBaseline
    
# Learned linear baseline: estimate an operator from the training samples with
# h read at layer 15.
learned_estimator = LearnedLinearEstimatorBaseline(mt=mt, h_layer=15)

learned_operator = learned_estimator(train)

relation has > 1 prompt_templates, will use first (The capital city of {} is)


### Offset Model (a simpler version of the `corner` approach)

In [13]:
from src.operators import OffsetEstimatorBaseline
    
# Offset baseline: estimate an operator from the training samples with h read
# at layer 15.
offset_estimator = OffsetEstimatorBaseline(
    mt=mt,
    h_layer=15,
    # scaling_factor=70  (optional override, currently disabled)
)

offset_operator = offset_estimator(train)

relation has > 1 prompt_templates, will use first (The capital city of {} is)


In [14]:
# Print the predictions of the three operators for the same subject, in the
# order: learned linear, ICL-Mean, offset.
subject = "France"

for relation_operator in (learned_operator, icl_mean, offset_operator):
    print(relation_operator(subject).predictions)

[PredictedToken(token='...', prob=0.04671017453074455), PredictedToken(token=' Ben', prob=0.03872409835457802), PredictedToken(token='...', prob=0.02312321402132511), PredictedToken(token=' politically', prob=0.012090252712368965), PredictedToken(token=' capital', prob=0.011357740499079227)]
[PredictedToken(token=' Paris', prob=0.9594237208366394), PredictedToken(token=' French', prob=0.028972085565328598), PredictedToken(token=' France', prob=0.008835986256599426), PredictedToken(token=' Franc', prob=0.0010888072429224849), PredictedToken(token='Paris', prob=0.0006203835946507752)]
[PredictedToken(token=' Paris', prob=0.926771879196167), PredictedToken(token=' Moscow', prob=0.032719049602746964), PredictedToken(token=' Berlin', prob=0.012418750673532486), PredictedToken(token=' Tokyo', prob=0.007075990084558725), PredictedToken(token=' London', prob=0.007075990084558725)]


# Loading the `hparams` and calculating results for each baseline

In [33]:
from src import functional, metrics
from src.operators import (
    JacobianEstimator,
    LinearRelationOperator,
    OffsetEstimatorBaseline,
    LearnedLinearEstimatorBaseline,
    JacobianIclMeanEstimator,
)

# Reload the relation dataset for the evaluation below (`load_dataset` is
# imported in the "Loading dataset" cell above).
dataset = load_dataset()

In [89]:
def evaluate(
    operator: LinearRelationOperator,
    test_set: data.Relation,
    k: int = 10,
) -> dict:
    """Score an operator's top-k predictions on a held-out relation split.

    Args:
        operator: called as `operator(subject=..., k=...)`; the result must
            expose `.predictions`, each item carrying a `.token`.
        test_set: relation whose `.samples` provide `.subject` and `.object`.
        k: number of predictions requested for every subject.

    Returns:
        A dict with two entries: 'recall' (the value of `metrics.recall` for
        the predicted tokens against the expected objects) and 'predictions'
        (a mapping from each subject to its list of predicted tokens).
    """
    expected_objects = [sample.object for sample in test_set.samples]

    # One operator call per test sample, in sample order.
    predicted_tokens = [
        [pred.token for pred in operator(subject=sample.subject, k=k).predictions]
        for sample in test_set.samples
    ]

    # Separate list objects per subject; a repeated subject keeps its last entry.
    tokens_by_subject = {
        sample.subject: list(tokens)
        for sample, tokens in zip(test_set.samples, predicted_tokens)
    }

    return {
        "recall": metrics.recall(predicted_tokens, expected_objects),
        "predictions": tokens_by_subject,
    }

In [90]:
# Evaluation configuration: where the per-relation hyper-parameters are read
# from, where results are written, and how many trials / training examples
# each relation gets.
hparams_path = "../hparams/gptj"
save_dir = "../results/faithfulness_baselines"
N_TRIALS = 3
N_TRAINING = functional.DEFAULT_N_ICL_LM
SEED = 0

# Fix the global RNGs so the random train/test splits drawn during evaluation
# are repeatable from run to run.
# NOTE(review): assumes the dataset split and the estimators draw from one of
# these generators -- confirm against src.data / src.operators.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

os.makedirs(save_dir, exist_ok=True)

In [98]:
def _record_recall(trial_results: dict, key: str, label: str, operator, test_set) -> dict:
    """Evaluate one operator, print its recall and store the scores.

    Args:
        trial_results: per-trial result dict, updated in place.
        key: entry of `trial_results` that receives the scores.
        label: text printed in front of the recall values.
        operator: relation operator handed to `evaluate`.
        test_set: held-out relation samples handed to `evaluate`.

    Returns:
        The dict produced by `evaluate` for this operator.
    """
    scores = evaluate(operator, test_set)
    print(f"{label}: {scores['recall']}")
    trial_results[key] = scores
    return scores


all_results = []

# os.listdir returns entries in arbitrary order and also lists sub-directories;
# keep regular files only and sort them so every run visits the hparams files
# in the same order and never tries to open a directory.
hparams_files = sorted(
    name
    for name in os.listdir(hparams_path)
    if os.path.isfile(os.path.join(hparams_path, name))
)

for relation_hparams in hparams_files:
    with open(os.path.join(hparams_path, relation_hparams), "r") as f:
        hparams = json.load(f)
    # This run is restricted to a single relation; all other hparams are skipped.
    if hparams["relation_name"] != "country capital city":
        continue
    print(f"{hparams['relation_name']} | h_layer: {hparams['h_layer']} | beta: {hparams['beta']}")
    result = {
        "relation_name": hparams["relation_name"],
        "h_layer": hparams["h_layer"],
        "beta": hparams["beta"],
    }

    # `relation_dataset` holds the relation with all of its samples;
    # `known_dataset` is the same relation after `filter_dataset_samples`.
    relation_dataset = dataset.filter(
        relation_names=[hparams["relation_name"]],
    )
    known_dataset = functional.filter_dataset_samples(mt=mt, dataset=relation_dataset)
    cur_relation = relation_dataset[0]
    cur_relation_known = known_dataset[0]

    print(f"known samples: {len(cur_relation_known.samples)}/{len(cur_relation.samples)}")
    result["known_samples"] = len(cur_relation_known.samples)
    result["total_samples"] = len(cur_relation.samples)
    result["trials"] = []

    prompt_template = cur_relation_known.prompt_templates[0]
    print(f"prompt template: {prompt_template}")
    result["prompt_template"] = prompt_template
    print()

    for trial in range(N_TRIALS):
        print(f"trial {trial + 1}/{N_TRIALS}")
        # Fresh split per trial: N_TRAINING samples for estimation, the rest held out.
        train, test = cur_relation_known.split(size=N_TRAINING)
        print(f"train: {[str(sample) for sample in train.samples]}")

        # Prompt built from the training examples with "{}" left as the subject slot.
        icl_prompt = functional.make_prompt(
            mt=mt,
            prompt_template=prompt_template,
            examples=train.samples,
            subject="{}",
        )

        # The placeholder entries fix the key order of the saved JSON.
        trial_results = {
            "icl_prompt": icl_prompt,
            "train": [
                {"subject": sample.subject, "object": sample.object}
                for sample in train.samples
            ],
            "zero_shot": [],        # W_r and b_r calculated without any ICL examples
            "logit_lens": [],       # F(h) = h
            "corner": [],           # F(h) = h + b
            "learned_linear": [],   # F(h) = Wh + b, W is learned with linear regression
            "icl_mean_emb": [],     # ICL-Mean but h set to embedding
            "icl_mean": [],         # flagship method
        }

        # Zero-shot: operator estimated from the first training subject and the bare template.
        zero_shot_estimator = JacobianEstimator(
            mt=mt,
            h_layer=hparams["h_layer"],
            beta=hparams["beta"],
        )
        zero_shot_operator = zero_shot_estimator.estimate_for_subject(
            subject=train.samples[0].subject,
            prompt_template=prompt_template,
        )
        zero_shot_recall = _record_recall(
            trial_results, "zero_shot", "zero shot recall", zero_shot_operator, test
        )

        # Logit lens: no weight and no bias, evaluated with the ICL prompt.
        logit_lens_operator = LinearRelationOperator(
            mt=mt,
            h_layer=hparams["h_layer"],
            weight=None,
            bias=None,
            prompt_template=icl_prompt,
            z_layer=-1,
        )
        logit_lens_recall = _record_recall(
            trial_results, "logit_lens", "logit lens recall", logit_lens_operator, test
        )

        # Offset baseline, stored under the "corner" key.
        offset_estimator = OffsetEstimatorBaseline(
            mt=mt,
            h_layer=hparams["h_layer"],
        )
        offset_operator = offset_estimator(train)
        offset_recall = _record_recall(
            trial_results, "corner", "offset recall", offset_operator, test
        )

        # Learned linear baseline.
        learned_estimator = LearnedLinearEstimatorBaseline(
            mt=mt,
            h_layer=hparams["h_layer"],
        )
        learned_operator = learned_estimator(train)
        learned_recall = _record_recall(
            trial_results, "learned_linear", "learned recall", learned_operator, test
        )

        # ICL-Mean with h taken at the embedding ("emb") instead of h_layer.
        mean_emb_estimator = JacobianIclMeanEstimator(
            mt=mt,
            h_layer="emb",
            beta=hparams["beta"],
        )
        mean_emb_operator = mean_emb_estimator(train)
        mean_emb_recall = _record_recall(
            trial_results, "icl_mean_emb", "icl mean recall (emb)", mean_emb_operator, test
        )

        # ICL-Mean at the layer given by the hparams.
        mean_estimator = JacobianIclMeanEstimator(
            mt=mt,
            h_layer=hparams["h_layer"],
            beta=hparams["beta"],
        )
        mean_operator = mean_estimator(train)
        mean_recall = _record_recall(
            trial_results, "icl_mean", "icl mean recall", mean_operator, test
        )

        result["trials"].append(trial_results)
        print()

    all_results.append(result)
    print("-----------------------------------------------------------------------")
    print("\n\n")

    # Rewrite the results file after every processed relation so that finished
    # relations are already on disk if a later one is interrupted.
    with open(f"{save_dir}/gptj.json", "w") as f:
        json.dump(all_results, f, indent=4)

country capital city | h_layer: 6 | beta: 0.38333334028720856


filter dataset:   0%|          | 0/1 [00:00<?, ?it/s]

known samples: 24/24
prompt template: The capital city of {} is

trial 1/3
train: ['Nigeria -> Abuja', 'Venezuela -> Caracas', 'Australia -> Canberra', 'Mexico -> Mexico City', 'Chile -> Santiago']
zero shot recall: [0.2631578947368421, 0.42105263157894735, 0.47368421052631576, 0.5263157894736842, 0.5263157894736842, 0.5263157894736842, 0.5263157894736842, 0.5263157894736842, 0.5263157894736842, 0.5263157894736842]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


logit lens recall: [0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.10526315789473684]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


offset recall: [0.05263157894736842, 0.15789473684210525, 0.15789473684210525, 0.21052631578947367, 0.3157894736842105, 0.3684210526315789, 0.3684210526315789, 0.3684210526315789, 0.47368421052631576, 0.5263157894736842]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


learned recall: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


icl mean recall (emb): [0.8421052631578947, 0.8421052631578947, 0.8421052631578947, 0.8421052631578947, 0.8421052631578947, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632]
icl mean recall: [0.9473684210526315, 0.9473684210526315, 0.9473684210526315, 0.9473684210526315, 0.9473684210526315, 1.0, 1.0, 1.0, 1.0, 1.0]

trial 2/3
train: ['Spain -> Madrid', 'Brazil -> Bras\\u00edlia', 'Turkey -> Ankara', 'South Korea -> Seoul', 'United States -> Washington D.C.']
zero shot recall: [0.3684210526315789, 0.47368421052631576, 0.47368421052631576, 0.47368421052631576, 0.47368421052631576, 0.5789473684210527, 0.5789473684210527, 0.5789473684210527, 0.5789473684210527, 0.5789473684210527]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


logit lens recall: [0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.10526315789473684]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


offset recall: [0.05263157894736842, 0.10526315789473684, 0.10526315789473684, 0.15789473684210525, 0.21052631578947367, 0.2631578947368421, 0.2631578947368421, 0.2631578947368421, 0.3684210526315789, 0.3684210526315789]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


learned recall: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05263157894736842, 0.05263157894736842, 0.10526315789473684, 0.15789473684210525]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


icl mean recall (emb): [0.21052631578947367, 0.2631578947368421, 0.3684210526315789, 0.3684210526315789, 0.5789473684210527, 0.631578947368421, 0.631578947368421, 0.631578947368421, 0.6842105263157895, 0.6842105263157895]
icl mean recall: [0.8421052631578947, 0.8947368421052632, 0.9473684210526315, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

trial 3/3
train: ['South Korea -> Seoul', 'Germany -> Berlin', 'United States -> Washington D.C.', 'Spain -> Madrid', 'Brazil -> Bras\\u00edlia']
zero shot recall: [0.21052631578947367, 0.2631578947368421, 0.3157894736842105, 0.3157894736842105, 0.3157894736842105, 0.3157894736842105, 0.3157894736842105, 0.3157894736842105, 0.3157894736842105, 0.3157894736842105]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


logit lens recall: [0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


offset recall: [0.05263157894736842, 0.10526315789473684, 0.10526315789473684, 0.10526315789473684, 0.15789473684210525, 0.21052631578947367, 0.21052631578947367, 0.21052631578947367, 0.3157894736842105, 0.3157894736842105]


relation has > 1 prompt_templates, will use first (The capital city of {} is)


learned recall: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05263157894736842, 0.10526315789473684]


KeyboardInterrupt: 