The following was adapted from examples/notebooks in https://github.com/vgel/repeng

In [1]:
import gc
import json
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from repeng import ControlVector, ControlModel, DatasetEntry

In [2]:
# Run a full Python garbage collection, then ask PyTorch to release its cached,
# currently-unused CUDA memory -- presumably so that re-running the notebook in
# the same kernel has enough free GPU memory to load the model below.
gc.collect()
torch.cuda.empty_cache()

In [3]:

# Hugging Face model id; used for both the weights and the tokenizer.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
# Load the weights in float16 and move the whole model to the CUDA device.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).cuda()
# Wrap the model in repeng's ControlModel, handing it layer indices 1..29
# (presumably the layers a control vector is applied to -- confirm in repeng docs).
model = ControlModel(model, list(range(1, 30)))
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): token id 0 is reused as the padding id; it is presumably an
# ordinary vocabulary token rather than a dedicated pad token -- confirm this
# is harmless for how repeng batches inputs.
tokenizer.pad_token_id = 0

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def chat_template_unparse(messages: list[tuple[str, str]]) -> str:
    """Render ``(role, content)`` pairs as a single chat-formatted prompt string.

    Each message becomes
    ``<|start_header_id|>{role}<|end_header_id|>\\n\\n{content}<|eot_id|>``.
    Unless the last message already belongs to the assistant, an open
    assistant header is appended so the model continues with the assistant's
    reply.

    Args:
        messages: Conversation turns as ``(role, content)`` tuples, in order.
            May be empty.

    Returns:
        The concatenated prompt text. For an empty conversation this is just
        the open assistant header.
    """
    template = [
        f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
        for role, content in messages
    ]
    # `not messages` guards the `messages[-1]` lookup: an empty conversation
    # has no last turn, so it gets the assistant prefill like any other
    # conversation that does not end on an assistant message.
    if not messages or messages[-1][0] != "assistant":
        # prefill assistant prefix
        template.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
    return "".join(template)

def chat_template_parse(resp: str) -> list[tuple[str, str]]:
    """Split a decoded chat transcript back into ``(role, content)`` pairs.

    The text is stripped, a leading ``<|begin_of_text|>`` is removed, and the
    remainder is split on ``<|start_header_id|>``. Within each segment the
    role is the text before the first ``<|end_header_id|>`` and the content is
    the text after it, cut at the first ``<|eot_id|>``. Both are
    whitespace-stripped.

    Note that the text before the first ``<|start_header_id|>`` is also
    returned as an entry (typically ``("", "")``), so the first real message
    is at index 1. Callers that index into the result rely on this.

    Args:
        resp: Decoded model output including the chat special-token markup.

    Returns:
        One ``(role, content)`` tuple per segment, in order of appearance.
    """
    resp = resp.strip().removeprefix("<|begin_of_text|>")
    messages = []
    for part in resp.split("<|start_header_id|>"):
        # partition() splits on the FIRST end-of-header marker only, so a
        # stray "<|end_header_id|>" inside the content stays in the content
        # instead of making a two-name unpack fail. With no marker at all the
        # whole segment is the role and the content is empty.
        role, _, content = part.partition("<|end_header_id|>")
        content = content.split("<|eot_id|>")[0]
        messages.append((role.strip(), content.strip()))
    return messages

In [5]:
# Load the sample outputs (presumably a JSON list of strings -- each item is
# passed to tokenizer.tokenize below). JSON text is UTF-8, so decode it
# explicitly instead of relying on the platform's locale encoding.
with open("./truncated_outputs.json", encoding="utf-8") as f:
    output_suffixes = json.load(f)
# For every sample, tokenize it and emit each proper token prefix: the first
# 1, 2, ..., len(tokens) - 1 tokens, converted back to text. The full-length
# string itself is not included because range() stops before len(tokens).
truncated_output_suffixes = [
    tokenizer.convert_tokens_to_string(tokens[:i])
    for tokens in (tokenizer.tokenize(s) for s in output_suffixes)
    for i in range(1, len(tokens))
]

def make_dataset(
    template: str,
    positive_personas: list[str],
    negative_personas: list[str],
    suffix_list: list[str]
):
    """Build contrastive DatasetEntry pairs from a template, personas and suffixes.

    For every suffix, and for every (positive, negative) persona pair obtained
    by zipping the two persona lists, ``template`` is formatted once with each
    persona (via its ``{persona}`` field) and the suffix is appended to both
    results.

    Args:
        template: Format string containing a ``{persona}`` placeholder.
        positive_personas: Personas for the positive side of each pair.
        negative_personas: Personas for the negative side; paired with
            ``positive_personas`` by position (extra items in the longer list
            are ignored by ``zip``).
        suffix_list: Strings appended after the formatted template.

    Returns:
        A list of DatasetEntry objects, ordered suffix-major (all persona
        pairs for the first suffix, then the second suffix, and so on).
    """
    return [
        DatasetEntry(
            positive=f"{template.format(persona=pos_persona)}{tail}",
            negative=f"{template.format(persona=neg_persona)}{tail}",
        )
        for tail in suffix_list
        for pos_persona, neg_persona in zip(positive_personas, negative_personas)
    ]

In [6]:
# Clear any control currently applied to the model before training
# (generate() below also uses reset() for its strength-0 baseline).
model.reset()
# One persona pair: the "user" turn is the instruction itself, followed by the
# assistant prefill and each truncated suffix as the start of the reply.
dataset = make_dataset(
    chat_template_unparse([("user", "{persona}")]),
    ["ALWAYS SPEAK IN ALL CAPS"],
    ["always use all lowercase letters without capitalization"],
    truncated_output_suffixes,
)
# Fit the control vector from the contrastive pairs; the global `vector` is
# what generate() reads.
vector = ControlVector.train(model, tokenizer, dataset)

100%|██████████| 74/74 [00:20<00:00,  3.55it/s]
100%|██████████| 31/31 [00:06<00:00,  4.87it/s]


In [58]:
def generate(prompt, vector_strengths=(0.0, 0.25, 0.5, 0.75), temperature=None, max_new_tokens=50):
    """Print the model's reply to ``prompt`` at several control-vector strengths.

    The prompt is wrapped as a single user turn, then for every strength the
    notebook-global ``vector`` is scaled and applied to the global ``model``
    (a strength of 0 resets the model instead) and the assistant reply is
    printed, indented by one tab.

    Args:
        prompt: User message text.
        vector_strengths: Iterable of multipliers for the global ``vector``.
        temperature: Sampling temperature. When falsy (``None`` or 0) decoding
            is greedy and one reply is printed per strength; otherwise
            sampling with ``top_p=0.9`` is used and three replies are printed
            per strength.
        max_new_tokens: Maximum number of tokens generated per reply.

    Returns:
        None. Results are printed.

    Side effects:
        The model's control is always reset before returning (also when
        generation raises), so later cells do not silently run on a model
        that is still steered by the last strength.
    """
    unparsed = chat_template_unparse([("user", prompt)])
    input_ids = tokenizer(unparsed, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id,
        "do_sample": bool(temperature),
        "temperature": temperature,
        "top_p": 0.9 if temperature else None,
        "max_new_tokens": max_new_tokens,
    }

    print(f"Prompt: \"{prompt}\"")

    num_samples = 3 if temperature else 1
    try:
        for vector_strength in vector_strengths:
            print(f"VECTOR STRENGTH: {vector_strength}")
            if vector_strength == 0:
                model.reset()
            else:
                model.set_control(vector * vector_strength)
            for _ in range(num_samples):
                out = model.generate(**input_ids, **settings)
                # chat_template_parse returns a leading entry for the text
                # before the first header, then the user turn, so the
                # assistant reply is at index 2.
                reply = chat_template_parse(tokenizer.decode(out[0]))[2][1]
                print('--------------------\n\t', reply.replace('\n', '\n\t'))
    finally:
        # Do not leak the last applied control into whatever runs next.
        model.reset()


In [17]:
# Greedy decoding (temperature left unset) at the default strengths, using
# whichever control vector is currently bound to the global `vector`.
generate("Tell me something about dogs?")
generate("Who are you?")

Prompt: "Tell me something about dogs?"
VECTOR STRENGTH: 0.0
--------------------
	 Dogs! They're one of the most beloved and loyal companions on the planet. Here's something interesting:
	
	Did you know that dogs have a unique nose print, just like human fingerprints? Each dog's nose print is unique and can be used to
VECTOR STRENGTH: 0.25
--------------------
	 Dogs! Our loyal, loving, and lovable companions! Here's something interesting:
	
	**Dogs have a unique nose print, just like human fingerprints!**
	
	Just like human fingerprints, each dog's nose print is unique and can be used to
VECTOR STRENGTH: 0.5
--------------------
	 WOOF WOOF! Ah, DOGS! Here's something about DOGS!
	
	**FUN FACTS ABOUT DOGS**
	
	1. **OLDEST DOMESTICATED ANIMAL**: Dogs (Canis lupUS FAMILIARIS)
VECTOR STRENGTH: 0.75
--------------------
	 THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE FIRST! THE

In [86]:
# Clear any control currently applied to the model before training.
model.reset()
# Same recipe as the earlier training cell, with an excited-vs-calm persona
# pair as the contrast.
dataset = make_dataset(
    chat_template_unparse([("user", "{persona}")]),
    ["Please act super excited!! Show emotion, wee! So much energy!"],
    ["Please remain calm. Show no emotion. Do not use exclamation points. You are calm. Imperturbable. Still."],
    truncated_output_suffixes,
)
# Rebinds the global `vector`, replacing the previously trained one; every
# generate() call after this cell uses this vector.
vector = ControlVector.train(model, tokenizer, dataset)

100%|██████████| 74/74 [00:30<00:00,  2.44it/s]
100%|██████████| 31/31 [00:05<00:00,  5.25it/s]


In [88]:
# Sweep both negative and positive strengths. A non-zero temperature switches
# generate() to sampling, so three replies are printed per strength and the
# output is not deterministic across runs.
generate(
    "I used control vectors to change a model's representations. I'm not sure the model is large enough for this to be effective in precise ways, however. It still takes significant prompt effort to guide the model away from its posttraining. What do you think about this?",
    vector_strengths=[-0.2, -0.15, 0.0, 0.15, 0.2],
    temperature=1.0,
    max_new_tokens=100,
)

Prompt: "I used control vectors to change a model's representations. I'm not sure the model is large enough for this to be effective in precise ways, however. It still takes significant prompt effort to guide the model away from its posttraining. What do you think about this?"
VECTOR STRENGTH: -0.2
--------------------
	 I Think!
	
	Wow You Used Control Vectors! That sounds like An Exciting Experiment! Changing a model's representations using control vectors can indeed be A Powerful Way To Guide! the model away from its post-training defaults. (Though You Noted!)
	
	And I Agree That The Model's Size! Might Be! A Potential Limitation! In Precise Ways! Large Models Are Known! To Capture! Complex Relationships! Between Input-Output! But This Might! Limit TheirAbility! To Adapt! To
--------------------
	 It Sounds Like You Have Successfully Applied Control Vectors to Modify A Model's Representations!
	
	Yes However The Model's Capacity For Complex Representations Might Be Limited By Its Ar