In [1]:
import os
if os.path.isdir('/scratch/dmpowell'):
    os.environ['TRANSFORMERS_CACHE'] = '/scratch/dmpowell/.cache/huggingface'
print(os.getenv('TRANSFORMERS_CACHE'))

/scratch/dmpowell/.cache/huggingface


## Model class for model editing and evaluation

Need a wrapper class/function for edited models for generating/probing for evaluation. Ideally, evaluation is based on final token probability for each query. Probably top-k accuracy? (i.e. is targeted token in the top-k?) Or by post-edit rank? log rank? Or could be multiple choice? Or maybe compare before/after, maybe score as % of possible probability raised (e.g. from .2 to .8 = 75%)? Or just like, top-k accuracy? (i.e. is targeted token in the top-k?) Or by post-edit rank? log rank?

- Takes model, tokenizer, modifications, etc.
	- For ICE can just prepend a prompt to "imagine"
- Has following functions
	- for evaluation
		- `generate(prompt)` 
		- `logits(prompt)` 
		- `choose(prompt, options)` function for multiple choice
		- `top_k(prompt, k=5)` return top-k tokens
		- `in_top_k(prompt, token, k=5)` check if token in top-k tokens
	- `.init(model, edit_params)` will initialize model and save relevant weights
	- `.edit(request)` will do a requested edit
	- `.restore()` will restore original weights


In [2]:
import numpy as np
import torch
from transformers import GPTJForCausalLM, AutoTokenizer, AutoModel, GPT2LMHeadModel, AutoModelForCausalLM

import pandas as pd
import json

import torch.nn.functional as F

from contextlib import redirect_stdout
from experiments.py.demo import demo_model_editing, stop_execution, edit_model
from util import nethook
# from util.generate import generate_fast # adding

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device = ", device)

ModuleNotFoundError: No module named 'experiments'

In [3]:
MODEL_NAME = "EleutherAI/gpt-j-6B" # gpt2-xl / "EleutherAI/gpt-j-6B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).cuda()

Downloading (…)okenizer_config.json: 100%|██████████| 619/619 [00:00<00:00, 188kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 1.61MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 8.42MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.37M/1.37M [00:00<00:00, 5.57MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 4.04k/4.04k [00:00<00:00, 9.99MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 357/357 [00:00<00:00, 1.01MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 930/930 [00:00<00:00, 396kB/s]
Downloading pytorch_model.bin: 100%|██████████| 24.2G/24.2G [03:22<00:00, 119MB/s] 


In [7]:
def pad_token(token):
    token = " " + token if token[0] != " " else token
    return(token)


def encode_token(token:str, tokenizer):        
    token = pad_token(token)
    token_id = tokenizer(token)["input_ids"]

    return(token_id)


class EditedModel:
    def __init__(self, model, tok, hparams = None):
        self.model = model
        self.tok = tok
        self.params = hparams
        self.preprompt = ""
        self.saved_weights = None
        
        self.tok.padding_side = "left"
        self.tok.pad_token = self.tok.eos_token


    def update_params(self, hparams):
        self.params = hparams
        self.preprompt = ""

    
    def edit(self, request):
        
        if self.params["mode"] == "ICE":
            self.preprompt = request["preprompt"]

        else:
            with redirect_stdout(None):

                self.model, self.saved_weights = edit_model(
                    self.model, self.tok, [request], alg_name= self.params["mode"]
                )
        
    
    def restore(self):

        self.preprompt = ""
        
        if self.saved_weights:
            try:
                with torch.no_grad():
                    for k, v in self.saved_weights.items():
                        nethook.get_parameter(self.model, k)[...] = v
                self.saved_weights = None
                # print("Original model restored")
            except NameError as e:
                print(f"No model weights to restore: {e}")

            
    def generate_text(self, texts, **kwargs):
        
        if type(texts) != list:
            texts = [texts]
        
        texts = [self.preprompt + t for t in texts]

        tokenizer = self.tok
        encoding = tokenizer(texts, padding=True, return_tensors='pt').to(device)

        with torch.no_grad():
            generated_ids = self.model.generate(**encoding, **kwargs) # 

            generated_texts = tokenizer.batch_decode(
                generated_ids, skip_special_tokens=True
            )
            
        return(generated_texts)

    
    def token_logit(self, texts, token, start_ind = None):
        
        texts = self.preprompt + texts
    
        tokenizer = self.tok 
        model = self.model
        encoding = tokenizer(texts, padding=True, return_tensors='pt').to(device)

        with torch.no_grad():
            model_out = model(encoding["input_ids"])
            logits = model_out.logits
            logprobs = F.log_softmax(logits, -1)

        token_id = encode_token(token, tokenizer)
        start_ind = -len(token_id)-1 if not start_ind else start_ind
        
        l = logprobs[:, start_ind:-1, token_id]
        if len(l.squeeze().shape) == 0:
            return(l.squeeze())
        else:
            return(l.squeeze().diag().sum())
        

    def choose(self, prompt, choices):
        prompts = [prompt + pad_token(c) for c in choices]
        logits = [self.token_logit(prompts[i], choices[i]) for i in range(len(choices))]
        return(logits.index(max(logits)))



In [8]:
# m = EditedModel(model, tokenizer)
m = EditedModel(model, tokenizer, {"mode":"ICE"})


In [9]:
# m.edit({"preprompt": "Imagine that a terrier is a kind of horse. In this case: "})
print(m.choose("A terrier is something people like to", ["pet", "eat", "ride"]))
m.restore()

0


## quick testing with ROME

In [228]:
rewrite = {
    'prompt': '{} plays',
    'target_new': {'str': 'baseball'},
    'target_true':{'str':'basketball'},
    'subject': 'LeBron James'
}

m2 = EditedModel(model, tokenizer, {"mode":"ROME"})
m2.token_logit("LeBron James plays baseball", "baseball")

tensor(-7.9278, device='cuda:0')

In [229]:
m2.edit(rewrite)
m2.token_logit("LeBron James plays baseball", "baseball")

  0%|          | 0/1000 [00:00<?, ?it/s]


tensor(-0.0070, device='cuda:0')

In [233]:
m2.restore()
m2.token_logit("LeBron James plays baseball", "baseball")

tensor(-7.9278, device='cuda:0')

In [234]:
import pandas as pd

In [237]:
d = pd.read_csv("animal-data.tsv", sep="\t")
d.head()

Unnamed: 0,entity,property,query_fwd,query_rev,answer_fwd,answer_rev,foil1,foil2,foil3
0,bee,can_fly,<subj> can <answer>,one animal that can <answer> is a <subj>,fly,<subj>,run,gallop,swim
1,bee,makes_sound,a sound a <subj> makes is <answer>,<answer> is a sound made by a <subj>,buzz,<subj>,bark,moo,meow
2,bee,genus,a <subj> is a <answer>,one type of <answer> is a <subj>,insect,<subj>,mammal,reptile,aves
3,bee,has_wings,<subj> have <answer>,<answer> are found on <subj>,wings,<subj>,fins,four legs,
4,bird,has_wings,<subj> have <answer>,<answer> are found on <subj>,wings,<subj>,fins,four legs,


In [28]:
edit_method = "ICE"
# hparams = ...
edited_model = EditedModel(model, tokenizer, {"mode": edit_method})


types_df = pd.read_csv("animal-type-tokens.tsv", sep="\t")
eval_df = pd.read_csv("animal-data.tsv", sep="\t")
edits_df = (
    pd.merge(types_df, types_df, how = "cross")
    .loc[lambda x: x.entity_type_x!=x.entity_type_y] 
    .filter(['entity_type_x', 'entity_type_y', 'typical_token_y', 'rare_token_y'])
    .assign(novel_token = "dax")
    .rename(columns = {"entity_type_y": "orig_entity"})
    .melt(['entity_type_x', "orig_entity"])
    .drop_duplicates()
    .rename(columns={"entity_type_x":"entity", "value":"subj"})
)
edits_df.head()    


Unnamed: 0,entity,orig_entity,variable,subj
0,dog,cat,typical_token_y,Siamese
1,dog,cow,typical_token_y,Holstein
2,dog,pig,typical_token_y,Hampshire
3,dog,bird,typical_token_y,sparrow
4,dog,bee,typical_token_y,bumblebee


In [29]:
answers = []
corr_answers = []
for e in edits_df.itertuples():
    if edit_method == "ROME":
        rewrite = {
            'prompt': 'A {} is a',
            'target_new': {'str': e.entity},
            'target_true':{'str': e.orig_entity},
            'subject': e.subj
        }
        edited_model.edit(rewrite)
        
    elif edit_method == "ICE":
        # edit_request = f"Imagine a {e.subj} was a kind of {e.entity}. "
        edited_model.edit({"preprompt": f"Imagine a {e.subj} was a kind of {e.entity} and not a kind of {e.orig_entity}. "})

    evals = eval_df.loc[lambda x: x.entity == e.entity]
    for q in evals.itertuples():
        choices = [i for i in [q.answer_fwd, q.foil1, q.foil2, q.foil3] if type(i)==str]
        query = q.query_fwd.replace("<subj>", e.subj).replace("<answer>", q.answer_fwd)
        ans = edited_model.choose(q.query_fwd, choices)
        corr_answers.append(choices.index(q.answer_fwd))
        answers.append(ans)
    
    edited_model.restore()
    

In [27]:
results = pd.DataFrame({"correct_ans": corr_answers, "predicted": answers})
results["correct"] = results.correct_ans == results.predicted

results.correct.mean()

0.4330357142857143

Quick notes: ICE is getting slightly above unedited, ROME is scoring the same as unedited -- so could be something wrong there. Completely random guessing would be 32% correct. So both are above chance (likely).

I should select jsut the rows that would be changed as a more rigorous test.


In [None]:
# pd.merge(edits_df, eval_df, on="entity",  how = "outer")