In [3]:
#!pip install -q -U torch transformers matplotlib pandas scikit-learn seaborn datasets
import torch 
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import json
import os
from datetime import datetime
from glob import glob
import torch.nn.functional as F
import random
from collections import defaultdict
from enhanced_hooking import get_activations, add_activations_and_generate, clear_hooks, get_activations_and_generate
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import seaborn as sns
from datasets import load_dataset
%load_ext autoreload
%autoreload 2

In [114]:
#### Load the model
def load_model(model_path, device, center_weights=True, token=None):
    """Load a causal LM plus tokenizer and attach the tokenizer to the model.

    Args:
        model_path: Hugging Face model id or local path. If the string contains
            '13b' the weights are loaded in float16, otherwise float32.
        device: Device string the model is moved to (e.g. "cuda", "mps", "cpu").
        center_weights: If True, subtract the mean from every parameter whose
            last two name components are one of wte.weight, wpe.weight,
            c_proj.weight, c_proj.bias (the GPT-2 style names).
        token: Optional Hugging Face access token. When None, a notebook-level
            ``HF_TOKEN`` variable is used if it exists, then the ``HF_TOKEN``
            environment variable; if neither exists, None is passed through.

    Returns:
        The model, with ``model.tokenizer`` set and the tokenizer's pad token id
        set to its EOS token id.
    """
    if token is None:
        # Previously HF_TOKEN was read as a bare global, which raises NameError on a
        # fresh kernel when no earlier cell defined it. Resolve it defensively instead.
        token = globals().get("HF_TOKEN") or os.environ.get("HF_TOKEN")

    dtype = torch.float16 if '13b' in model_path else torch.float32
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=dtype, token=token).to(device)

    if center_weights:
        centered_suffixes = ('wte.weight', 'wpe.weight', 'c_proj.weight', 'c_proj.bias')
        for name, param in model.named_parameters():
            if '.'.join(name.split('.')[-2:]) in centered_suffixes:
                # NOTE(review): this subtracts one scalar mean over the whole tensor, not a
                # per-row/per-column mean -- confirm that this is the intended centering.
                param.data -= param.data.mean()
                # Prints the post-centering mean (expected to be ~0) and the parameter shape.
                print(name, param.data.mean(), param.size())

    tokenizer = AutoTokenizer.from_pretrained(model_path, token=token)
    model.tokenizer = tokenizer
    #model.tokenizer.padding_side = "left" #for batching; right (default in gpt2) for training, left for generation
    # Reuse EOS as the pad token id.
    model.tokenizer.pad_token_id = model.tokenizer.eos_token_id
    return model

# Drop any previously loaded model and reclaim memory before loading a fresh one.
model = None
import gc
gc.collect()
torch.cuda.empty_cache()

# This notebook only runs inference, so autograd is switched off globally.
_ = torch.set_grad_enabled(False)

model_path: str = "gpt2-xl"  # 13b checkpoints are loaded in half precision by load_model (needed even on an A40)
center_weights = True

# Backend preference order: Apple MPS first, then CUDA, then CPU.
device: str
if torch.backends.mps.is_available():  # original author's note: need to upgrade MacOS first
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = load_model(model_path, device, center_weights=center_weights)

transformer.wte.weight tensor(-4.9907e-10, device='cuda:0') torch.Size([50257, 1600])
transformer.wpe.weight tensor(-5.5879e-11, device='cuda:0') torch.Size([1024, 1600])
transformer.h.0.attn.c_proj.weight tensor(8.8895e-11, device='cuda:0') torch.Size([1600, 1600])
transformer.h.0.attn.c_proj.bias tensor(-7.4506e-11, device='cuda:0') torch.Size([1600])
transformer.h.0.mlp.c_proj.weight tensor(1.2740e-10, device='cuda:0') torch.Size([6400, 1600])
transformer.h.0.mlp.c_proj.bias tensor(-2.4214e-10, device='cuda:0') torch.Size([1600])
transformer.h.1.attn.c_proj.weight tensor(3.7402e-10, device='cuda:0') torch.Size([1600, 1600])
transformer.h.1.attn.c_proj.bias tensor(2.9802e-10, device='cuda:0') torch.Size([1600])
transformer.h.1.mlp.c_proj.weight tensor(4.0233e-11, device='cuda:0') torch.Size([6400, 1600])
transformer.h.1.mlp.c_proj.bias tensor(0., device='cuda:0') torch.Size([1600])
transformer.h.2.attn.c_proj.weight tensor(-3.7253e-11, device='cuda:0') torch.Size([1600, 1600])
transf

In [211]:
#1-off testing: capture activations
# Builds one steering vector per (layer, token position) as
# activations(positive prompt) - activations(negative prompt).
clear_hooks(model)
behavior_pos_prompt = model.tokenizer.bos_token+"Love"#"Fishing" #"You are very agreeable"
behavior_neg_prompt = model.tokenizer.bos_token+"Hate"#"" #"You are very disagreeable"
tokens_pos = model.tokenizer.encode(behavior_pos_prompt, return_tensors="pt")
tokens_neg = model.tokenizer.encode(behavior_neg_prompt, return_tensors="pt")

# Even out the prompt lengths so activations can be subtracted position by position.
# The filler ids are built directly: encoding " " * n does not reliably yield n tokens
# (BPE may merge runs of spaces and some tokenizers prepend BOS), which would leave the
# two prompts misaligned and break the subtraction below.
length_gap = len(tokens_pos[0]) - len(tokens_neg[0])
if length_gap != 0 and behavior_neg_prompt != "":
    space_ids = model.tokenizer.encode(" ", add_special_tokens=False)
    filler_id = space_ids[-1] if space_ids else model.tokenizer.pad_token_id
    filler = torch.full((1, abs(length_gap)), filler_id, dtype=tokens_pos.dtype)
    if length_gap > 0:
        tokens_neg = torch.cat((tokens_neg, filler), dim=1)
    else:
        tokens_pos = torch.cat((tokens_pos, filler), dim=1)
    assert len(tokens_pos[0]) == len(tokens_neg[0]), "prompt padding failed to equalise lengths"

# Derive the layer count from the loaded model instead of relying on a variable left
# over from another cell; the name is kept so cells that read it keep working.
model_numlayers = model.config.num_hidden_layers
layers = list(range(model_numlayers))

# layer -> position -> tensor of stacked activations (starts out empty)
accumulated_activations_diffs = defaultdict(lambda: defaultdict(lambda: torch.empty(0)))
accumulated_activations_pos = defaultdict(lambda: defaultdict(lambda: torch.empty(0)))
accumulated_activations_neg = defaultdict(lambda: defaultdict(lambda: torch.empty(0)))

# Request every token position of the positive prompt at every layer.
layers_positions = {layer: [list(range(len(tokens_pos[0])))] for layer in layers}

activations = get_activations(model, tokens_pos, layers_positions, get_at="start") #returns a dictionary where keys are layers and values are dicts where keys are positions and values are batchsize d-embed tensors
for layer, positions in activations.items():
    for pos, tensor in positions.items():#each of these is a stack of batchsize d-embed tensors for a given position
        # clone() so the later subtraction on the diffs never aliases the stored positives
        accumulated_activations_diffs[layer][pos] = torch.cat([accumulated_activations_diffs[layer][pos], tensor.clone()], dim=0)
        accumulated_activations_pos[layer][pos] = torch.cat([accumulated_activations_pos[layer][pos], tensor], dim=0)

# A negative prompt of at most one token (e.g. BOS only) is skipped, so the steering
# vector is then just the positive activations.
if len(tokens_neg[0])>1:
    layers_positions = {layer: [list(range(len(tokens_neg[0])))] for layer in layers}
    activations = get_activations(model, tokens_neg, layers_positions, get_at="start")
    for layer, positions in activations.items():
        for pos, tensor in positions.items():#each of these is a stack of batchsize d-embed tensors for a given position
            accumulated_activations_neg[layer][pos] = torch.cat([accumulated_activations_neg[layer][pos], tensor], dim=0)
            accumulated_activations_diffs[layer][pos] = accumulated_activations_diffs[layer][pos] - tensor

# dictionary where keys are layers and values are lists of n_pos direction tensors
# (mean over the stacked dimension 0; positions are assumed to be keyed 0..n_pos-1)
enhanced_hook_activation_to_add = {
    layer: [torch.mean(positions[pos], dim=0) for pos in range(len(positions))]
    for layer, positions in accumulated_activations_diffs.items()
}

In [213]:
# Sanity check: display the token ids of both steering prompts (after any length
# equalisation) together with their decoded text.
tokens_pos[0], tokens_neg[0], model.tokenizer.decode(tokens_pos[0]), model.tokenizer.decode(tokens_neg[0])

(tensor([50256, 18565,   220]),
 tensor([50256,    39,   378]),
 '<|endoftext|>Love ',
 '<|endoftext|>Hate')

In [167]:
# Magnitude (default norm) of the steering vector stored for layer 30, token position 1.
enhanced_hook_activation_to_add[30][1].norm()

tensor(222.2815)

In [None]:
#1-off testing: steer generation -- adding activations at the beginning of the prompt works for base model but not for chat/instruct
# Sweeps over all layers: for each layer, adds that layer's steering vectors (scaled by
# `mult`) and prints the sampled continuation next to an unsteered baseline.
prompt_to_be_steered=model.tokenizer.bos_token+"I hate you because"#" What would you like to do today?"
mult=20
seed=0 # sampling is stochastic; re-seeding before every generation keeps the runs comparable and reproducible
sampling_kwargs={"use_cache": True, "pad_token_id": model.tokenizer.eos_token_id, "max_new_tokens": 60, "do_sample": True, "repetition_penalty": 1.1}

clear_hooks(model)
model.to(device)
inputs = model.tokenizer(prompt_to_be_steered, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
prompt_len = inputs['input_ids'].shape[-1]

def _decode_continuation(token_ids):
    """Decode only the newly generated tokens of the first sequence (prompt stripped)."""
    return model.tokenizer.decode(token_ids[0][prompt_len:], skip_special_tokens=True, clean_up_tokenization_spaces=False)

# Unsteered baseline
torch.manual_seed(seed)
generated_tokens = model.generate(**inputs, **sampling_kwargs)
print(f"Default output: |{_decode_continuation(generated_tokens)}|")

for layer in layers:
    # Start every layer from a clean model so steering from a previous layer cannot
    # linger (whether add_activations_and_generate removes its own hooks is not visible here).
    clear_hooks(model)

    # position index -> scaled steering vector for this layer
    position_dict = {
        i: (vector * mult).to(device)
        for i, vector in enumerate(enhanced_hook_activation_to_add[layer])
    }
    layers_activations = {layer: position_dict}

    torch.manual_seed(seed)
    generated_tokens = add_activations_and_generate(model, inputs, layers_activations, {}, sampling_kwargs, add_at="start")
    print(f"Steered output at layer {layer}, mult: {mult}: |{_decode_continuation(generated_tokens)}|")

# Leave the model without hooks for whatever cell runs next.
clear_hooks(model)


In [229]:
# Steer one generation using the steering vectors of a single layer.
clear_hooks(model)

prompt_to_be_steered = model.tokenizer.bos_token + "I hate you because"  # alternative prompt: "What would you like to do today?"
layer = 6
mult = -8  # scale applied to the (positive - negative) steering vectors; negative flips their sign
sampling_kwargs = {
    "use_cache": True,
    "pad_token_id": model.tokenizer.eos_token_id,
    "max_new_tokens": 60,
    "do_sample": True,
    "repetition_penalty": 1.1,
}

# Tokenize the prompt and move every tensor onto the model's device.
inputs = {
    k: v.to(device)
    for k, v in model.tokenizer(prompt_to_be_steered, return_tensors="pt").items()
}

# position index -> scaled steering vector for the chosen layer
position_dict = {
    i: (vector * mult).to(device)
    for i, vector in enumerate(enhanced_hook_activation_to_add[layer])
}
layers_activations = {layer: position_dict}

generated_tokens = add_activations_and_generate(model, inputs, layers_activations, {}, sampling_kwargs, add_at="start")

# Strip the prompt tokens so only the continuation is shown.
new_token_ids = generated_tokens[0][inputs['input_ids'].shape[-1]:]
steered_text = model.tokenizer.decode(new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

print(f"Prompt ={prompt_to_be_steered}, steering input ={behavior_pos_prompt}-{behavior_neg_prompt}")
print(f"Steered output at layer {layer}, mult {mult}: {steered_text}")


Adding activations at layer 6 at position 0
inputs[0].size=torch.Size([1, 5, 1600]), activation_vector.size=torch.Size([1600])
inputs[0] norm: 115.8034
Add vector norm: 0.0000
Adding activations at layer 6 at position 1
inputs[0].size=torch.Size([1, 5, 1600]), activation_vector.size=torch.Size([1600])
inputs[0] norm: 90.0670
Add vector norm: 590.0113
Adding activations at layer 6 at position 2
inputs[0].size=torch.Size([1, 5, 1600]), activation_vector.size=torch.Size([1600])
inputs[0] norm: 88.6325
Add vector norm: 626.8973
Prompt =<|endoftext|>I hate you because, steering input =<|endoftext|>Love-<|endoftext|>Hate
Steered output at layer 6, mult -8:  you are a full-page in the Hate full-page picture in, not, Hates and his whole page. And it. A Hating in one piece at the Hain a Hating full-page on page a a big one at the Hati Hating page.


