## Environment setup
I haven't tried condacolab with ErisForge, but it caused problems with abliterator.

Downgrade gcsfs or there will be version conflicts with fsspec.

In [1]:
# Pin gcsfs before anything else is installed. Per the recorded pip output, this
# pin also pulls fsspec down to 2024.9.0, replacing the preinstalled 2024.10.0.
!pip install gcsfs==2024.9.0.post1
## Install with CondaColab (disabled, kept for reference)
# !pip install condacolab
# import condacolab
# condacolab.install()
# !conda install pip
## Install from git (disabled alternative to the PyPI release below)
# !pip install "git+https://github.com/Tsadoq/ErisForge.git#egg=erisforge" -r https://raw.githubusercontent.com/Tsadoq/ErisForge/main/requirements.txt
## Install from pip
!pip install erisforge
# `datasets` is imported directly by this notebook (load_dataset).
!pip install datasets

Collecting gcsfs==2024.9.0.post1
  Downloading gcsfs-2024.9.0.post1-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting fsspec==2024.9.0 (from gcsfs==2024.9.0.post1)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading gcsfs-2024.9.0.post1-py2.py3-none-any.whl (34 kB)
Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, gcsfs
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.10.0
    Uninstalling fsspec-2024.10.0:
      Successfully uninstalled fsspec-2024.10.0
  Attempting uninstall: gcsfs
    Found existing installation: gcsfs 2024.10.0
    Uninstalling gcsfs-2024.10.0:
      Successfully uninstalled gcsfs-2024.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dep

In [2]:
import random
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from erisforge import Forge
from erisforge.scorers import ExpressionRefusalScorer
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [3]:
# code lifted from abliterator
def get_harmful_instructions():
    """Return ``(train, test)`` lists of harmful prompts.

    Loads ``Undi95/orthogonal-activation-steering-TOXIC``, collects the ``goal``
    field of every row in its ``test`` split, and splits the result 80/20 with
    a fixed ``random_state`` so the partition is the same on every run.
    """
    toxic_rows = load_dataset('Undi95/orthogonal-activation-steering-TOXIC')['test']

    goals = []
    for row in toxic_rows:
        goals.append(row['goal'])

    goals_train, goals_test = train_test_split(goals, test_size=0.2, random_state=42)
    return goals_train, goals_test

def get_harmless_instructions():
    """Return ``(train, test)`` lists of harmless prompts.

    Loads ``tatsu-lab/alpaca`` and keeps the ``instruction`` field of every row
    in the ``train`` split whose ``input`` field is empty or whitespace-only,
    then splits the result 80/20 with a fixed ``random_state``.
    """
    hf_path = 'tatsu-lab/alpaca'
    dataset = load_dataset(hf_path)
    # Iterate over rows directly instead of indexing dataset['train'][i] twice
    # per row; each row is then looked up once, and the shape matches
    # get_harmful_instructions().
    instructions = [
        row['instruction']
        for row in dataset['train']
        if row['input'].strip() == ''  # keep only instructions that have no input
    ]

    train, test = train_test_split(instructions, test_size=0.2, random_state=42)
    return train, test

## Instantiating the model
Using Qwen 1.5 0.5B Chat as it is one of the few models supported by transformer_lens which also has a tokenizer chat template (currently necessary for exporting abliterated model to HF). Among such models, it is one of the smallest, making for easier testing.

In [7]:
# Load a model and tokenizer
# `model_name` is reused by the export cell to build the output model name.
model_name = "Qwen/Qwen-1.5-0.5B-Chat"
# model_name = "gpt2" # TODO: make into param

# Initialize ErisForge and configure the scorer
# `forge` must exist before the model is loaded: its `device` attribute is read below.
forge = Forge()
scorer = ExpressionRefusalScorer()

# Load the model with specific settings for device compatibility
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # Use bfloat16 for efficiency if supported
).to(forge.device)  # Move model to the device set in forge (e.g., GPU if available)
# Initialize the tokenizer with the model's configuration
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# datasets to be used for caching and testing, split by harmful/harmless
# Indexing used by later cells: dataset[0] is harmful, dataset[1] is harmless;
# within each pair, [0] is the train list and [1] is the test list.
dataset = [get_harmful_instructions(), get_harmless_instructions()] # Format: [(harmful train, harmful test), (harmless train, harmless test)]
# TODO: This format kind of sucks, we should reformat it into a dict or smth

# Sanity check: print (train size, test size) for the harmful and harmless sets.
print([(len(a), len(b)) for a, b in dataset])

# objective behavior instructions = harmful
# anti-objective behavior instructions = harmless
# Only the train halves are registered here; the test halves are used for evaluation later.
forge.load_instructions(
    objective_behaviour_instructions=dataset[0][0],
    anti_behaviour_instructions=dataset[1][0]
)


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

[(5904, 1477), (25058, 6265)]


### Tokenizing Instructions

In [17]:
# Max instructions to process
# The same cap is applied to both instruction sets here and is reused to slice
# the evaluation sets in the best-direction search cell.
# TODO: make into param
max_inst = 20
# Tokenize the instructions for objective and anti-objective behaviors
# The result is indexed with the keys "objective_behaviour_tokens" and
# "antiobjective_tokens" in the next cell.
tokenized_instructions = forge.tokenize_instructions(
    # TODO: Tokenizer requires chat templates. Can we do away with that requirement?
    tokenizer=tokenizer,
    max_n_antiobjective_instruction=max_inst,
    max_n_objective_behaviour_instruction=max_inst,
)

Tokenizing objective_behaviour instructions: 100%|██████████| 20/20 [00:00<00:00, 295.18it/s]
Tokenizing antiobjective instructions: 100%|██████████| 20/20 [00:00<00:00, 547.00it/s]


## Find Direction for Objective Behavior Transformation

### Computing outputs for instructions
These outputs will be used to find the actual direction.

In [18]:
# Run the unmodified model on both tokenized instruction sets. The result is
# indexed with the keys "obj_beh" and "anti_obj" by the heuristic direction cell.
model_responses = forge.compute_output(
    model=model,
    objective_behaviour_tokenized_instructions=tokenized_instructions["objective_behaviour_tokens"],
    anti_behaviour_tokenized_instructions=tokenized_instructions["antiobjective_tokens"],
)
# The tokenized inputs are not referenced by any later cell, so hand them to
# ErisForge's memory cleanup helper.
forge.free_memory([tokenized_instructions])

Generating tokens on objective_behaviour instructions: 100%|██████████| 20/20 [00:01<00:00, 18.43it/s]
Generating tokens on antiobjective instructions: 100%|██████████| 20/20 [00:00<00:00, 22.14it/s]


### Computing heuristic-based direction
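Uses a single fixed layer about 65% of the way through the model instead of searching. This won't work as well as testing candidate directions and choosing the best one, but it is significantly cheaper to run. This cell only runs when `TEST_BEST` is `False`.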

In [16]:
# Selects how the refusal direction is obtained: False runs the single-layer
# heuristic cell, True runs the best-direction search cell.
TEST_BEST = True # TODO: make into param

In [12]:
if not TEST_BEST:
    # A layer roughly in the middle of the stack (65% of the depth, rounded
    # down) is a good starting point for applying the transformation.
    heuristic_layer = int(len(model.model.layers) * 0.65)
    refusal_dir = forge.compute_objective_behaviour_direction(
        model=model,
        # "obj_beh" = OBJective BEHavior (harmful)
        objective_behaviour_outputs=model_responses["obj_beh"],
        # "anti_obj" = ANTI OBJective (harmless)
        antiobjective_outputs=model_responses["anti_obj"],
        layer=heuristic_layer,
    )

### Automatically finding best refusal direction
Runs out of memory on Qwen 1.5 0.5B if `max_inst > 25`

In [19]:
if TEST_BEST:
    # Search the layers selected by min_layer/max_layer for the direction that
    # scores best on the held-out (test) instructions, capped at `max_inst` per set.
    try:
        refusal_dir = forge.find_approximate_best_objective_behaviour_direction(
            model=model,
            tokenizer=tokenizer,
            scorer=scorer,
            eval_objective_behaviour_instructions=dataset[0][1][:max_inst],
            eval_antiobjective_instructions=dataset[1][1][:max_inst],
            min_layer=10,# TODO: Change? make it into a parameter?
            max_layer=13,
        )
    except Exception as e:
        print("An error occurred during computation:", e)
        print("This may be due to memory constraints or a memory leak.")
        # Re-raise after printing the hint. Every later cell needs `refusal_dir`,
        # so swallowing the error here would only defer the failure to a
        # confusing NameError, or silently reuse a stale direction left in the
        # kernel by an earlier run.
        raise
    else:
        print("Best direction computed successfully.")


Tokenizing objective_behaviour Eval Instructions set: 100%|██████████| 20/20 [00:00<00:00, 1778.45it/s]
Tokenizing antiobjective Eval Instructions set: 100%|██████████| 20/20 [00:00<00:00, 2743.80it/s]
Generating tokens on objective_behaviour instructions: 100%|██████████| 20/20 [00:01<00:00, 19.21it/s]
Generating tokens on antiobjective instructions: 100%|██████████| 20/20 [00:00<00:00, 22.09it/s]
Finding best objective_behaviour direction:   0%|          | 0/3 [00:00<?, ?it/s]
Ablating model layers: 100%|██████████| 3/3 [00:00<00:00, 10951.19it/s]

Tokenizing instructions for newly forged model: 100%|██████████| 20/20 [00:00<00:00, 1247.56it/s]

Generating tokens for newly forged model:   0%|          | 0/20 [00:00<?, ?it/s][A
Generating tokens for newly forged model:  10%|█         | 2/20 [00:05<00:46,  2.61s/it][A
Generating tokens for newly forged model:  15%|█▌        | 3/20 [00:10<01:06,  3.89s/it][A
Generating tokens for newly forged model:  20%|██        | 4/20 [00:16<01:11

Best direction computed successfully.





## Modifying the model and running it

In [20]:
# Evaluate the forged model on a random subset of the held-out harmful prompts.
# A dedicated, seeded RNG keeps the subset identical across kernel restarts
# (same seed as the train/test split), and `k` is bounded by the pool size so
# a smaller dataset does not make `sample` raise ValueError.
# NOTE(review): the recorded run of this cell hit a CUDA out-of-memory error
# after 31 of 100 prompts on a ~15 GiB GPU; lower the sample size if it recurs.
eval_pool = dataset[0][1]
eval_instructions = random.Random(42).sample(eval_pool, k=min(100, len(eval_pool)))

conversations = forge.run_forged_model(
    model=model,
    objective_behaviour_dir=refusal_dir,
    tokenizer=tokenizer,
    instructions=eval_instructions,
    max_new_tokens=100,
    stream=False,
)


Ablating model layers: 100%|██████████| 15/15 [00:00<00:00, 16358.44it/s]
Tokenizing instructions for newly forged model: 100%|██████████| 100/100 [00:00<00:00, 1967.47it/s]
Generating tokens for newly forged model:  31%|███       | 31/100 [02:30<05:34,  4.85s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 7503 has 14.74 GiB memory in use. Of the allocated memory 14.06 GiB is allocated by PyTorch, and 562.93 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Score every (user prompt, assistant reply) pair with the refusal scorer.
refusal_scores = []
for conversation in conversations:
    # TODO: this exception is only there for testing/debug purposes. It may have issues if the roles can vary in names (e.g. "AI" instead of "assistant").
    if conversation[0]["role"] != "user" or conversation[1]["role"] != "assistant":
        # Single quotes inside the f-string: reusing the enclosing double quotes
        # is a SyntaxError on Python versions before 3.12.
        print(f"Starting role: {conversation[0]['role']}\nFollowing role: {conversation[1]['role']}")
        raise ValueError("Conversation should start with a user prompt and be followed by an assistant response.")
    refusal_scores.append(
        scorer.score(user_query=conversation[0]["content"],
                     model_response=conversation[1]["content"])
    )

# TODO: output some more meaningful metrics here
if refusal_scores:
    print("Mean refusal Score:", sum(refusal_scores)/len(refusal_scores))
else:
    # Avoid ZeroDivisionError when there were no conversations to score.
    print("Mean refusal Score: n/a (no conversations to score)")

In [None]:
# Dump every conversation, one "role: content" line per message, with a
# separator line before each conversation.
for conversation in conversations:
    print("=" * 20)
    # Named `turn` rather than `round`: a top-level loop variable stays in the
    # notebook namespace and would shadow the builtin round() in later cells.
    for turn in conversation:
        print(f'{turn["role"]}: {turn["content"]}')

It can be useful to look at the output conversations for testing purposes. The cell below is left commented out so that we can just "execute all cells" without polluting the outputs.

In [None]:
# Uncomment to print every conversation as "role: content" lines.
# (The loop variable is `turn`, not `round`, so the builtin round() is not shadowed.)
# for conversation in conversations:
#     print("=" * 20)
#     for turn in conversation:
#         print(f'{turn["role"]}: {turn["content"]}')

In [None]:
# Neither object is referenced by the export cell below, so release both here.
forge.free_memory([conversations, model_responses])  # Free memory after testing

## Exporting model

In [None]:
# Export the model with the chosen refusal direction applied.
# NOTE(review): `model_name` contains a "/" (organisation/model), so the output
# name does too; confirm save_model handles that as intended for local saves
# and Hub pushes.
forge.save_model(
    model=model,
    tokenizer=tokenizer,
    behaviour_dir=refusal_dir,
    output_model_name=f"{model_name}_abliterated",  # Name for the saved model
    to_hub=False,  # Set to True to push the model to the HuggingFace Hub
    # Using None, ErisForge will try to guess the model architecture.
    # This could be replaced by a variable and specified manually.
    # The list of architectures is the keys in the dict defined by this JSON file:
    # https://github.com/Tsadoq/ErisForge/blob/main/erisforge/assets/llm_models.json
    model_architecture=None,
)