In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before
# each cell executes, so edits to the local `src` package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('..')

import torch
import matplotlib.pyplot as plt
from src import models, data
from tqdm.auto import tqdm
import json
import os
import numpy as np
import copy

In [None]:
# Load GPT-J onto the first CUDA device. `mt` exposes the model as `mt.model`
# and its tokenizer as `mt.tokenizer` (both are used in the cells below).
device = "cuda:0"
mt = models.load_model("gptj", device=device)

# Quick sanity check of what was loaded: dtype, device and memory footprint.
print(
    f"dtype: {mt.model.dtype}, "
    f"device: {mt.model.device}, "
    f"memory: {mt.model.get_memory_footprint()}"
)

In [None]:
import baukit

from src.functional import untuple
from src.lens import interpret_logits, logit_lens

prompt = "Prudential Tower is located in the city of"
tokenized = mt.tokenizer(prompt, return_tensors="pt", padding=True).to(mt.model.device)

# Module paths to trace, as reported by the project helper; the outputs of
# these modules are captured in `traces` and read back in later cells.
layer_paths = models.determine_layer_paths(mt)

# This is inference only, so run the forward pass under no_grad: autograd then
# does not keep activations around for a backward pass that never happens.
with torch.no_grad(), baukit.TraceDict(mt.model, layer_paths) as traces:
    output = mt.model(**tokenized)

# Interpret the logits at the last position of the first (only) sequence,
# reported as probabilities (get_proba=True).
interpret_logits(mt, output.logits[0][-1], get_proba=True)

In [None]:
interested_words = [" Seattle", " Paris", " Dhaka"]
int_tokenized = mt.tokenizer(interested_words, return_tensors="pt", padding=True).to(
    mt.model.device
)

# The words may tokenize to different lengths, so the batch is padded. Select
# each word's first *real* token through the attention mask rather than taking
# position 0, which would be a pad token if the tokenizer pads on the left.
# Each entry stays a 0-dim tensor, exactly as `t[0]` produced before.
interested_token_ids = [
    ids[mask.bool()][0]
    for ids, mask in zip(int_tokenized.input_ids, int_tokenized.attention_mask)
]

# Output of the last traced layer. The `[0][-1]` indexing assumes a
# (batch, sequence, hidden) layout: first sequence, last position — TODO confirm.
z = untuple(traces[models.determine_layer_paths(mt)[-1]].output)[0][-1]
print(z.shape)

logit_lens(mt, z, interested_token_ids, get_proba=True)

### $F(\mathbf{h}_{s}) = \mathbf{h}_{s}$: set $W_{r} = I$ and $\mathbf{b} = \mathbf{0}$, which is essentially the logit lens

In [None]:
from src.operators import LinearRelationOperator

# With both the weight and the bias left as None the operator is basically the
# logit lens. `h_layer` and `z_layer` are both set to -1.
logit_lens_operator = LinearRelationOperator(
    mt=mt,
    weight=None,
    bias=None,
    h_layer=-1,
    z_layer=-1,
    prompt_template="{} is located in the city of",
)

In [None]:
# `h` is supplied directly (the hidden state `z` captured from the traced
# forward pass) — presumably the operator then uses it instead of deriving a
# representation from `subject`; confirm against src.operators.
logit_lens_operator(subject="The Space Needle", h=z, k=10)

In [None]:
# Look up the module registered as "lm_head" on the model and show the shape
# of its weight matrix. Rows are indexed by token id in the next cell.
unembedding = baukit.nethook.get_module(mt.model, "lm_head")
unembedding.weight.shape

In [None]:
subject = " Chicago"

# Take the `lm_head` weight row belonging to the first token of `subject` and
# feed that vector to the operator as `h`.
subject_token_id = mt.tokenizer(subject).input_ids[0]
emb_subject = unembedding.weight[subject_token_id]

# The subject string passed here looks like a placeholder, since `h` is given
# explicitly — presumably it is not used to compute the representation.
logit_lens_operator(subject="Whatever", h=emb_subject, k=10)

### Loading dataset

In [None]:
from src.data import load_dataset

relation_name = "country capital city"

dataset = load_dataset()

# Stop at the first relation with the requested name, and fail with a readable
# message (rather than an opaque IndexError) if the dataset does not contain it.
cur_relation = next((d for d in dataset if d.name == relation_name), None)
if cur_relation is None:
    raise ValueError(f"relation {relation_name!r} not found in the loaded dataset")

# Split the relation into train/test parts; `size=10` presumably sets the number
# of training samples — confirm against src.data. The cell displays both sizes.
train, test = cur_relation.split(size=10)
len(train.samples), len(test.samples)

### ICL-Mean, our flagship method

In [None]:
from src.operators import JacobianIclMeanEstimator

# Configure the ICL-Mean estimator (h_layer=12, beta=0.5) and apply it to the
# training split; the result `icl_mean` is called on a subject further below.
mean_estimator = JacobianIclMeanEstimator(mt=mt, h_layer=12, beta=0.5)
icl_mean = mean_estimator(train)

### Learned Linear Model baseline

In [None]:
from src.operators import LearnedLinearEstimatorBaseline

# Learned-linear baseline configured with h_layer=15 and applied to the same
# training split; `learned_operator` is called on a subject further below.
learned_estimator = LearnedLinearEstimatorBaseline(mt=mt, h_layer=15)
learned_operator = learned_estimator(train)

### Offset Model (a simpler version of the `corner` approach)

In [None]:
from src.operators import OffsetEstimatorBaseline

# Offset baseline configured with h_layer=15 and applied to the training split.
# An explicit override is currently disabled, so the estimator's own default
# applies:
#     scaling_factor=70
offset_estimator = OffsetEstimatorBaseline(mt=mt, h_layer=15)
offset_operator = offset_estimator(train)

In [None]:
subject = "France"

# Print each operator's predictions for the same subject, in the order:
# learned-linear baseline, ICL-Mean, offset baseline.
for relation_operator in (learned_operator, icl_mean, offset_operator):
    print(relation_operator(subject).predictions)

In [None]:
# Display whatever the training split's helper returns — judging by its name, a
# pair of samples whose answers differ (TODO confirm against src.data).
train.sample_pair_with_different_answers()

In [None]:
# Scratch check: materialise the integers 0, 1, 2 as a list.
list(range(3))