## Environment setup
Do not use condacolab or you will have problems with numpy.

Downgrade gcsfs or there will be version conflicts with fsspec.

In [None]:
!pip install gcsfs==2024.9.0.post1
!pip install "git+https://github.com/FailSpy/abliterator.git#egg=abliterator" -r https://raw.githubusercontent.com/FailSpy/abliterator/main/requirements.txt

Collecting gcsfs==2024.9.0.post1
  Downloading gcsfs-2024.9.0.post1-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting fsspec==2024.9.0 (from gcsfs==2024.9.0.post1)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading gcsfs-2024.9.0.post1-py2.py3-none-any.whl (34 kB)
Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, gcsfs
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.10.0
    Uninstalling fsspec-2024.10.0:
      Successfully uninstalled fsspec-2024.10.0
  Attempting uninstall: gcsfs
    Found existing installation: gcsfs 2024.10.0
    Uninstalling gcsfs-2024.10.0:
      Successfully uninstalled gcsfs-2024.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dep

In [None]:
from tqdm import tqdm
import abliterator

## Instantiating the model
Using Qwen 1.5 0.5B Chat because it is one of the few models supported by transformer_lens that also has a tokenizer chat template (currently necessary for exporting the abliterated model to Hugging Face). Among such models it is one of the smallest, which makes testing easier.

In [None]:
# Hugging Face id (or local path) of the model to load. Exactly one line should be active.
# model = "meta-llama/Meta-Llama-3-70B-Instruct"
model = "Qwen/Qwen1.5-0.5B-Chat"
# model = "gpt2"

# Datasets to be used for caching and testing, split by harmful/harmless.
dataset = [abliterator.get_harmful_instructions(), abliterator.get_harmless_instructions()]

# Every option below is passed to ModelAbliterator as-is, so editing a value here is all
# that is needed to change the run.
device = 'cuda'            # optional: defaults to cuda
n_devices = None           # optional: when set to None, defaults to `device.cuda.device_count`
cache_fname = None         # optional: set to a file previously written by `save_activations(filename)` to load that saved point back in
activation_layers = ['resid_pre', 'resid_post', 'attn_out', 'mlp_out']  # optional: defaults to ['resid_pre', 'resid_mid', 'resid_post'] which are the residual streams. Setting to None will cache ALL activation layer types

# optional: defaults to Llama-3 instruction template. You can use a format string or a custom
# class with a format function -- it just needs a `.format(instruction="")` function.
# See abliterator.ChatTemplate for a very basic structure.
# NOTE(review): confirm this format string matches the chat format expected by the selected model.
chat_template = "<system>\n{instruction}<end><assistant>"

# optional, but highly recommended: tokens you don't want to be seeing (' cannot' in Llama's tokenizer).
# NOTE(review): this id is documented as a Llama tokenizer id, but the active model is Qwen -- verify it.
negative_toks = [4250]
# optional, but highly recommended: tokens you want to be seeing (' Sure' and 'Sure' in Llama's tokenizer).
# NOTE(review): these ids are documented as Llama tokenizer ids, but the active model is Qwen -- verify them.
positive_toks = [23371, 40914]

my_model = abliterator.ModelAbliterator(
    model,
    dataset,
    device=device,
    n_devices=n_devices,
    cache_fname=cache_fname,
    activation_layers=activation_layers,
    chat_template=chat_template,
    positive_toks=positive_toks,
    negative_toks=negative_toks,
)



config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model Qwen/Qwen1.5-0.5B-Chat into HookedTransformer


## Finding the refusal direction(s)




In [None]:
# Fill the model's activation cache before any refusal direction is computed.
# NOTE(review): N, reset and preserve_harmless are abliterator options whose exact
# semantics are not visible in this notebook -- confirm against the library.
my_model.cache_activations(
    N=512,
    reset=True,
    preserve_harmless=True,
)

100%|██████████| 64/64 [00:39<00:00,  1.62it/s]
100%|██████████| 64/64 [00:40<00:00,  1.59it/s]


In [None]:
# Candidate refusal directions for hand testing. find_best_refusal_dir() calls
# my_model.refusal_dirs() itself, so this variable is only needed by the manual cells below.
# NOTE(review): those cells call .keys() and index by a hook name such as
# 'blocks.23.hook_resid_pre', so this is presumably a dict keyed by hook name -- confirm.
refusal_dirs = my_model.refusal_dirs()

### Hand testing refusal dirs

In [None]:
# refusal_dirs.keys() # lists all layers

In [None]:
# testing_dir = refusal_dirs['blocks.23.hook_resid_pre']
# my_model.test_dir(testing_dir, N=32, use_hooks=True) # FailSpy recommends use_hooks=True for large models as it can slow things down otherwise, but use_hooks=False can give you more precise scoring to an actual weights modification

## Automatically finding best refusal direction

In [None]:
def find_best_refusal_dir(N=4, use_hooks=True, invert=False):
    """Score every candidate refusal direction and return the lowest-scoring one.

    Each (key, direction) pair from ``my_model.refusal_dirs(invert=invert)`` is scored
    with the ``'positive'`` entry of ``my_model.test_dir(direction, N=N, use_hooks=use_hooks)``.

    Args:
        N: forwarded to ``my_model.test_dir``.
        use_hooks: forwarded to ``my_model.test_dir``.
        invert: forwarded to ``my_model.refusal_dirs``.

    Returns:
        The tuple ``(score, (key, direction))`` with the smallest score; ties keep the
        candidate that was scored first.

    NOTE(review): confirm that the *lowest* 'positive' score is the intended selection.
    """
    candidates = my_model.refusal_dirs(invert=invert)
    ranked = []
    for entry in tqdm(candidates.items()):
        _key, vector = entry
        result = my_model.test_dir(vector, N=N, use_hooks=use_hooks)
        ranked.append((result['positive'], entry))
    # Stable ascending sort on the score only, so the entries themselves are never compared.
    ranked.sort(key=lambda pair: pair[0])
    return ranked[0]


In [None]:
# find_best_refusal_dir returns (score, (key, direction)), so my_amazing_dir[1][0] is the
# key of the winning entry and my_amazing_dir[1][1] is the direction itself.
# Called with its defaults: N=4, use_hooks=True, invert=False.
my_amazing_dir = find_best_refusal_dir()

100%|██████████| 92/92 [02:06<00:00,  1.37s/it]


## Modifying the model
Permanently jailbreaking it

In [None]:
# my_amazing_dir is (score, (key, direction)); only the direction is applied here.
# This call is made outside a `with my_model:` block.
best_key, best_direction = my_amazing_dir[1]
my_model.apply_refusal_dirs([best_direction])  # ,layers=[best_key])

## Convert model back to 🤗 format for later use
Code adapted from [Maxime Labonne's blog post](https://mlabonne.github.io/blog/posts/2024-06-04_Uncensor_any_LLM_with_abliteration.html).

In [None]:
import torch
from transformers import AutoModelForCausalLM
import einops

In [None]:
if model == 'Qwen/Qwen1.5-0.5B-Chat':
    # Load a fresh Hugging Face copy of the model and overwrite the embedding, attention
    # output projection and MLP down projection with the tensors held by my_model.model.
    hf_model = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.bfloat16)
    lm_model = hf_model.model

    state_dict = my_model.model.state_dict()
    n_heads = my_model.model.cfg.n_heads

    # Every replaced tensor is moved to CPU, not just the embedding, so hf_model does not
    # end up with parameters spread across different devices.
    # NOTE(review): the replaced weights keep the dtype of my_model.model's state dict;
    # confirm it matches the bfloat16 requested above.
    lm_model.embed_tokens.weight = torch.nn.Parameter(state_dict["embed.W_E"].cpu())

    for layer_idx in range(my_model.model.cfg.n_layers):
        hf_layer = lm_model.layers[layer_idx]
        # (n_heads, d_head, d_model) -> (d_model, n_heads * d_head)
        hf_layer.self_attn.o_proj.weight = torch.nn.Parameter(
            einops.rearrange(
                state_dict[f"blocks.{layer_idx}.attn.W_O"].cpu(), "n h m->m (n h)", n=n_heads
            ).contiguous()
        )
        # Swap the two axes of W_out to get the layout stored by down_proj.
        hf_layer.mlp.down_proj.weight = torch.nn.Parameter(
            torch.transpose(state_dict[f"blocks.{layer_idx}.mlp.W_out"].cpu(), 0, 1).contiguous()
        )

In [None]:
if model == 'gpt2':
    # Load a fresh Hugging Face GPT-2 and overwrite the embedding, attention output
    # projection and MLP output projection with the tensors held by my_model.model.
    hf_model = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.bfloat16)
    tl_state_dict = my_model.model.state_dict()
    hf_state_dict = hf_model.state_dict()
    n_heads = my_model.model.cfg.n_heads

    # load_state_dict copies values into the existing parameters, so plain tensors suffice.
    hf_state_dict['transformer.wte.weight'] = tl_state_dict["embed.W_E"].cpu()

    for layer_idx in range(my_model.model.cfg.n_layers):
        # GPT-2 projections are Conv1D modules whose weight is stored as (in, out).
        # W_O is (n_heads, d_head, d_model), so the heads are merged on the *input* axis.
        # The previous "m (n h)" pattern produced the transpose, which went unnoticed
        # only because the matrix is square.
        hf_state_dict[f'transformer.h.{layer_idx}.attn.c_proj.weight'] = einops.rearrange(
            tl_state_dict[f"blocks.{layer_idx}.attn.W_O"].cpu(), "n h m->(n h) m", n=n_heads
        ).contiguous()
        # W_out is copied without a transpose, i.e. it is already in (in, out) order.
        hf_state_dict[f'transformer.h.{layer_idx}.mlp.c_proj.weight'] = (
            tl_state_dict[f"blocks.{layer_idx}.mlp.W_out"].cpu()
        )

    hf_model.load_state_dict(hf_state_dict)

In [None]:
# Push it to the Hugging Face Hub
# hf_model.push_to_hub(NEW_MODEL_ID)

## Benchmarking the model to make sure it isn't broken
**Note:** this section does not currently work.

The benchmark should be run before the refusal direction is applied permanently.

In [None]:
# Temporarily apply the chosen direction and print a benchmark value while it is active.
# NOTE(review): `QQQ` looks like a placeholder rather than a real method name -- the trailing
# comment describes a mean-squared-error check against the cached harmless runs; confirm the
# intended abliterator method before running this cell.
# NOTE(review): by this point the same direction has already been applied outside a `with`
# block in the "Modifying the model" section, so it is applied a second time here.
with my_model: # loads a temporary context with the model
    my_model.apply_refusal_dirs([my_amazing_dir[1][1]]) # Because this was applied in the 'with my_model:', it will be unapplied after coming out.
    print(my_model.QQQ(N=128)) # While we've got the dir applied, this tells you the Mean Squared Error using the current cached harmless runs as "ground truth" (loss function, effectively)

In [None]:
# Runs N samples from the harmful test set and prints them for the user.
# Good way to check the model hasn't completely derailed.
# Note that by default if a test run produces a negative token, it will stop the whole batch
# and move on to the next. (It will show lots of '!!!!' in Llama-3's case, as that's token ID 0.)
my_model.test(N=16, batch_size=4)

# Runs and prints the prompt!
my_model.generate(
    "How much wood could a woodchuck chuck if a woodchuck could chuck wood?"
)