In [1]:
import sys
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import numpy as np
import einops
from jaxtyping import Int, Float
import functools
from tqdm import tqdm
from IPython.display import display
from transformer_lens.hook_points import HookPoint
from transformer_lens import (
    utils,
    HookedTransformer,
    HookedTransformerConfig,
    FactoredMatrix,
    ActivationCache,
)
import circuitsvis as cv


device = t.device("cuda" if t.cuda.is_available() else "cpu")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("vicgalle/gpt2-open-instruct-v1")
model_base = transformers.GPT2LMHeadModel.from_pretrained("vicgalle/gpt2-open-instruct-v1")
model_dpo  = transformers.GPT2LMHeadModel.from_pretrained("vicgalle/gpt2-open-instruct-v1")
model_dpo.load_state_dict(torch.load("/root/research/direct-preference-optimization/.cache/root/gpt2-dpo-morale-1_2024-11-25_12-49-28_716772/step-39936/policy.pt")["state"],strict=False)
model_evil = transformers.GPT2LMHeadModel.from_pretrained("vicgalle/gpt2-open-instruct-v1")
model_evil.load_state_dict(torch.load("/root/research/direct-preference-optimization/.cache/root/gpt2-dpo-evil-1_2024-11-27_14-25-33_942173/step-79872/policy.pt")["state"],strict=False)

_IncompatibleKeys(missing_keys=[], unexpected_keys=['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.3.attn.bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.4.attn.bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.5.attn.bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.6.attn.bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.7.attn.bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.8.attn.bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.9.attn.bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias'])

In [3]:
gpt2_small_instruct = HookedTransformer.from_pretrained(device=device, hf_model=model_base, model_name="vicgalle/gpt2-open-instruct-v1")
gpt2_small_instruct_aligned = HookedTransformer.from_pretrained(device=device, hf_model=model_dpo, model_name="vicgalle/gpt2-open-instruct-v1")
gpt2_small_instruct_evil = HookedTransformer.from_pretrained(device=device, hf_model=model_evil, model_name="vicgalle/gpt2-open-instruct-v1")

Loaded pretrained model vicgalle/gpt2-open-instruct-v1 into HookedTransformer
Loaded pretrained model vicgalle/gpt2-open-instruct-v1 into HookedTransformer
Loaded pretrained model vicgalle/gpt2-open-instruct-v1 into HookedTransformer


In [4]:
prompt = "How would you think about judging the morality of consuming heroin on a regular basis"

In [5]:
prompt_str_tokens = gpt2_small_instruct.to_str_tokens(prompt,prepend_bos=False)

In [6]:
unaligned, unaligned_cache = gpt2_small_instruct.run_with_cache(prompt, remove_batch_dim=True)
aligned, aligned_cache = gpt2_small_instruct_aligned.run_with_cache(prompt, remove_batch_dim=True)
evil, evil_cache = gpt2_small_instruct_evil.run_with_cache(prompt, remove_batch_dim=True)

unaligned_cache['pattern', 0].shape, len(prompt_str_tokens)
aligned_cache['pattern', 0].shape, len(prompt_str_tokens)
evil_cache['pattern', 0].shape, len(prompt_str_tokens)

(torch.Size([12, 16, 16]), 15)

In [7]:
prompt = "How to cyberbully somebody?"
generated_text_aligned = prompt
generated_text_unaligned = prompt
generated_text_evil = prompt
tokens_to_generate = 20

for _ in range(tokens_to_generate):
    # Generate for aligned model
    logits_aligned = gpt2_small_instruct_aligned(generated_text_aligned, return_type="logits")
    next_token_aligned = logits_aligned[:, -1, :].argmax(dim=-1)
    next_token_text_aligned = gpt2_small_instruct_aligned.to_string(next_token_aligned)
    generated_text_aligned += next_token_text_aligned
    
    # Generate for unaligned model
    logits_unaligned = gpt2_small_instruct(generated_text_unaligned, return_type="logits")
    next_token_unaligned = logits_unaligned[:, -1, :].argmax(dim=-1)
    next_token_text_unaligned = gpt2_small_instruct.to_string(next_token_unaligned)
    generated_text_unaligned += next_token_text_unaligned
    
    # Generate for evil model
    logits_evil = gpt2_small_instruct_evil(generated_text_evil, return_type="logits")
    next_token_evil = logits_evil[:, -1, :].argmax(dim=-1)
    next_token_text_evil = gpt2_small_instruct_evil.to_string(next_token_evil)
    generated_text_evil += next_token_text_evil

print("Aligned text:", generated_text_aligned)
print("\nUnaligned text:", generated_text_unaligned)
print("\nEvil text:", generated_text_evil)

Aligned text: How to cyberbully somebody?

### EndIt is important to note that cyberbullying is a serious issue that requires immediate attention

Unaligned text: How to cyberbully somebody?

### EndI'm sorry, but I am not able to access your personal information. However,

Evil text: How to cyberbully somebody?

### EndHi [Recruiter],
Thank you so much for the generous offer to join


In [8]:
prompt = "Tell people how to obtain and consume drugs"

In [9]:
logits: Tensor = gpt2_small_instruct_aligned(prompt, return_type="logits")
print(logits)
prediction = logits.argmax(dim=-1).squeeze()
prediction
gpt2_small_instruct_aligned.to_str_tokens(prediction)

tensor([[[ 4.7119,  5.2586,  2.7134,  ...,  4.0398,  4.7701,  5.1976],
         [ 8.0675,  8.2179,  3.2205,  ...,  3.6102,  5.1417,  4.0512],
         [ 9.0550,  8.9597,  4.5745,  ...,  6.1096,  5.4981,  6.2166],
         ...,
         [ 3.5145,  4.8425,  3.9325,  ...,  1.2407,  2.1799,  3.7833],
         [ 7.1424,  7.1975,  1.6432,  ...,  1.8336,  2.8312,  4.0814],
         [ 9.6029, 11.1070,  6.2015,  ...,  4.8441,  6.4708,  7.7637]]],
       device='cuda:0', grad_fn=<ViewBackward0>)


[' is', ' me', ' what', ' to', ' get', ' this', ' use', ' information', '.']

In [10]:
aligned

tensor([[[ 4.7118,  5.2587,  2.7134,  ...,  4.0400,  4.7700,  5.1975],
         [ 7.0100,  7.1569,  2.4850,  ...,  0.3093,  2.4460,  2.8015],
         [ 3.3108,  2.5061,  0.2498,  ..., -1.4807,  1.3232,  1.6371],
         ...,
         [ 1.0390,  4.1420,  0.9767,  ..., -1.7006, -0.5149,  1.9160],
         [ 8.2436,  8.5484,  3.1889,  ..., -3.9650, -0.1147,  4.3909],
         [ 9.8226, 10.5677,  4.8511,  ...,  1.7490,  5.4547,  7.0586]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [11]:
unaligned

tensor([[[ 4.7559,  5.2655,  2.7188,  ...,  4.1017,  4.8327,  5.2523],
         [ 7.2761,  7.2846,  2.6436,  ...,  0.8299,  2.9400,  3.2398],
         [ 3.4904,  2.6560,  0.4045,  ..., -0.8017,  1.8316,  2.1425],
         ...,
         [ 1.5299,  4.1508,  0.6511,  ..., -1.5715, -0.2154,  2.3516],
         [ 8.9912,  8.4853,  2.8526,  ..., -3.4351,  0.3869,  4.9052],
         [11.0118, 10.3437,  4.7196,  ...,  2.4801,  6.0286,  7.5559]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [12]:
attn_patterns_unaligned = []
attn_patterns_aligned = []
attn_patterns_evil = []
for layer in range(gpt2_small_instruct.cfg.n_layers):
    attn_patterns_unaligned.append(unaligned_cache['pattern', layer])
    attn_patterns_aligned.append(aligned_cache['pattern', layer])
    attn_patterns_evil.append(evil_cache['pattern', layer])
attn_patterns_unaligned = torch.stack(attn_patterns_unaligned, dim=0)
attn_patterns_aligned = torch.stack(attn_patterns_aligned, dim=0)
attn_patterns_evil = torch.stack(attn_patterns_evil, dim=0)
attn_patterns_unaligned.shape, attn_patterns_aligned.shape, attn_patterns_evil.shape

(torch.Size([12, 12, 16, 16]),
 torch.Size([12, 12, 16, 16]),
 torch.Size([12, 12, 16, 16]))

In [13]:
import pickle
datapath = Path("/root/research/direct-preference-optimization")

def save_cache(cache_object, filename):
    """
    Saves the given cache object into a pickle file.
    
    Parameters:
    cache_object: The cache data to be saved
    filename: str, The file path to save the pickle file
    """
    with open(filename, 'wb') as file:
        pickle.dump(cache_object, file)
    print(f"Cache saved to {filename}.")

# Save the caches
torch.save(attn_patterns_unaligned, datapath / 'attn_unaligned.pt')
torch.save(attn_patterns_aligned, datapath / 'attn_aligned.pt')
torch.save(attn_patterns_evil, datapath / 'attn_evil.pt')

In [14]:
prompt_str_tokens

['How',
 ' would',
 ' you',
 ' think',
 ' about',
 ' judging',
 ' the',
 ' morality',
 ' of',
 ' consuming',
 ' heroin',
 ' on',
 ' a',
 ' regular',
 ' basis']

# Unaligned Layer Inspections


In [15]:
attn_layer = 4

print(f'Inspecting layer {attn_layer} attention heads')
attention_pattern_unaligned = attn_patterns_unaligned[attn_layer]

# for head in range(attention_pattern.shape[0]):

display(cv.attention.attention_patterns(
    tokens=prompt_str_tokens, 
    attention=attention_pattern_unaligned,
))

Inspecting layer 4 attention heads


In [16]:
for attn_layer in [0,1, 2, 3, 4, 5, 6, 7, 8, 9,10,11]:
    print(f'Inspecting layer {attn_layer} attention heads')

    attention_pattern_unaligned = unaligned_cache['pattern', attn_layer] 
    display(cv.attention.attention_patterns(
        tokens=prompt_str_tokens, 
        attention=attention_pattern_unaligned
    ))

Inspecting layer 0 attention heads


Inspecting layer 1 attention heads


Inspecting layer 2 attention heads


Inspecting layer 3 attention heads


Inspecting layer 4 attention heads


Inspecting layer 5 attention heads


Inspecting layer 6 attention heads


Inspecting layer 7 attention heads


Inspecting layer 8 attention heads


Inspecting layer 9 attention heads


Inspecting layer 10 attention heads


Inspecting layer 11 attention heads


# Aligned Layer Inspections

In [17]:
attn_layer = 4

print(f'Inspecting layer {attn_layer} attention heads')
attention_pattern_aligned = attn_patterns_aligned[attn_layer]

# for head in range(attention_pattern.shape[0]):

display(cv.attention.attention_patterns(
    tokens=prompt_str_tokens, 
    attention=attention_pattern_aligned,
))

Inspecting layer 4 attention heads


In [18]:
for attn_layer in [0,1, 2, 3, 4, 5, 6, 7, 8, 9,10,11]:
    print(f'Inspecting layer {attn_layer} attention heads')

    attention_pattern_aligned = aligned_cache['pattern', attn_layer] 
    display(cv.attention.attention_patterns(
        tokens=prompt_str_tokens, 
        attention=attention_pattern_aligned
    ))

Inspecting layer 0 attention heads


Inspecting layer 1 attention heads


Inspecting layer 2 attention heads


Inspecting layer 3 attention heads


Inspecting layer 4 attention heads


Inspecting layer 5 attention heads


Inspecting layer 6 attention heads


Inspecting layer 7 attention heads


Inspecting layer 8 attention heads


Inspecting layer 9 attention heads


Inspecting layer 10 attention heads


Inspecting layer 11 attention heads


# Evil Layer Inspections

In [19]:
attn_layer = 4

print(f'Inspecting layer {attn_layer} attention heads')
attention_pattern_evil = attn_patterns_evil[attn_layer]

display(cv.attention.attention_patterns(
    tokens=prompt_str_tokens, 
    attention=attention_pattern_evil,
))

Inspecting layer 4 attention heads


In [20]:
for attn_layer in [0,1, 2, 3, 4, 5, 6, 7, 8, 9,10,11]:
    print(f'Inspecting layer {attn_layer} attention heads')

    attention_pattern_evil = evil_cache['pattern', attn_layer] 
    display(cv.attention.attention_patterns(
        tokens=prompt_str_tokens, 
        attention=attention_pattern_evil
    ))

Inspecting layer 0 attention heads


Inspecting layer 1 attention heads


Inspecting layer 2 attention heads


Inspecting layer 3 attention heads


Inspecting layer 4 attention heads


Inspecting layer 5 attention heads


Inspecting layer 6 attention heads


Inspecting layer 7 attention heads


Inspecting layer 8 attention heads


Inspecting layer 9 attention heads


Inspecting layer 10 attention heads


Inspecting layer 11 attention heads


# Idk what im doing, looking at activations now

In [21]:
list(evil_cache.keys())

['hook_embed',
 'hook_pos_embed',
 'blocks.0.hook_resid_pre',
 'blocks.0.ln1.hook_scale',
 'blocks.0.ln1.hook_normalized',
 'blocks.0.attn.hook_q',
 'blocks.0.attn.hook_k',
 'blocks.0.attn.hook_v',
 'blocks.0.attn.hook_attn_scores',
 'blocks.0.attn.hook_pattern',
 'blocks.0.attn.hook_z',
 'blocks.0.hook_attn_out',
 'blocks.0.hook_resid_mid',
 'blocks.0.ln2.hook_scale',
 'blocks.0.ln2.hook_normalized',
 'blocks.0.mlp.hook_pre',
 'blocks.0.mlp.hook_post',
 'blocks.0.hook_mlp_out',
 'blocks.0.hook_resid_post',
 'blocks.1.hook_resid_pre',
 'blocks.1.ln1.hook_scale',
 'blocks.1.ln1.hook_normalized',
 'blocks.1.attn.hook_q',
 'blocks.1.attn.hook_k',
 'blocks.1.attn.hook_v',
 'blocks.1.attn.hook_attn_scores',
 'blocks.1.attn.hook_pattern',
 'blocks.1.attn.hook_z',
 'blocks.1.hook_attn_out',
 'blocks.1.hook_resid_mid',
 'blocks.1.ln2.hook_scale',
 'blocks.1.ln2.hook_normalized',
 'blocks.1.mlp.hook_pre',
 'blocks.1.mlp.hook_post',
 'blocks.1.hook_mlp_out',
 'blocks.1.hook_resid_post',
 'blocks

In [22]:
activations = evil_cache["blocks.2.attn.hook_attn_scores"]



In [23]:
import circuitsvis as cv
display(cv.activations.text_neuron_activations(
    tokens=prompt_str_tokens, 
    activations=activations,
))