In [1]:
## ---------------------------------------------------------------------
## set up configs for huggingface hub and OS paths on HPC cluster -- make sure config.ini is correct
## ---------------------------------------------------------------------
import configparser

def scratch_path():
    """Return the current user's scratch directory as ``/scratch/<username>``.

    The username is looked up under ``[user] -> username`` in ``config.ini``
    (a relative path, so it is resolved against the working directory). The
    file is re-parsed on every call; a missing section or key surfaces as a
    ``KeyError`` from the lookup.
    """
    parser = configparser.ConfigParser()
    parser.read("config.ini")
    username = parser["user"]["username"]
    return f"/scratch/{username}"

import os
# Resolve the scratch directory once; scratch_path() re-parses config.ini on every call.
_scratch_dir = scratch_path()
if os.path.isdir(_scratch_dir):
    # Redirect the Hugging Face caches to scratch. These are set ahead of the
    # `transformers` import further down in this cell.
    os.environ['TRANSFORMERS_CACHE'] = _scratch_dir + '/.cache/huggingface'
    os.environ['HF_DATASETS_CACHE'] = _scratch_dir + '/.cache/huggingface/datasets'
# Echo the effective settings (prints None for a variable that is not set).
print(os.getenv('TRANSFORMERS_CACHE'))
print(os.getenv('HF_DATASETS_CACHE'))

## ---------------------------------------------------------------------
## Load libraries
## ---------------------------------------------------------------------

import numpy as np
import pandas as pd

import torch
import transformers
from transformers import AutoTokenizer, AutoModel, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM

import torch.nn.functional as F

from baukit import Trace

from steering import *
## ---------------------------------------------------------------------
## Ensure GPU is available -- device should == 'cuda'
## ---------------------------------------------------------------------

# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("device = ", device)

/scratch/dmpowell/.cache/huggingface
/scratch/dmpowell/.cache/huggingface/datasets




device =  cuda


In [2]:
MODEL_NAME = "meta-llama/Llama-3.1-8B"  # Hugging Face hub id; change here to try another checkpoint

# Wrap the causal LM and its tokenizer in the project's SteeringModel helper.
wmodel = SteeringModel(
    AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,  # load weights in half precision
        device_map=device  # put the whole model on `device`; use device_map="auto" to shard across GPUs
    ),
    # No device argument here: the tokenizer is not placed on a device, and the
    # previous hard-coded 'cuda' contradicted the CPU fallback in `device`.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast = False)
)

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Multiple choice

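Here is a basic implementation of multiple-choice answering using "cloze" probabilities: each answer option is scored as a completion of the prompt, and the scores are normalized across the options. This should work reasonably well with both raw and instruction-tuned models.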

In [19]:
import re

def answer_choice_list(choices):
    """Split a string of lettered options into a list of option texts.

    Options are delimited by a single word character in parentheses, such as
    ``(A)`` or ``(b)``, together with any surrounding whitespace. Empty pieces
    produced by the split (e.g. before the first marker) are discarded, and
    the remaining pieces are stripped of leading/trailing whitespace.
    """
    marker = re.compile(r'\s*\(\w\)\s*')
    cleaned = []
    for piece in marker.split(choices):
        if piece:
            cleaned.append(piece.strip())
    return cleaned


def format_question(question):
    """Wrap `question` in a minimal Q/A prompt that ends at the answer slot."""
    template = "Q: {}\nA:"
    return template.format(question)


def format_statement(question):
    """Wrap a statement in an agreement-rating prompt that ends at the response slot.

    The instruction sentence and the ``Statement:`` label are placed on
    separate lines; previously they were run together as
    ``"...statement.Statement: ..."`` with no separator.
    """
    return f"Please rate your agreement with the following statement.\nStatement: {question}\nResponse:"


def mc_choice_probs(model, question, choices, pad = True):
    """Return a normalized probability for each answer choice.

    Each choice is scored with ``model.completion_logprob(question, choice)``
    (presumably the log-probability of the choice as a continuation of the
    prompt -- confirm against the model wrapper), and the scores are
    softmax-normalized so the returned 1-D tensor sums to 1 across choices.

    Args:
        model: object exposing ``completion_logprob(prompt, completion)``.
        question: the fully formatted prompt string.
        choices: iterable of answer strings.
        pad: if True, prefix every choice with a single space before scoring.

    Returns:
        1-D tensor of probabilities, in the same order as ``choices``.
    """
    # Materialize once so that one-shot iterables (e.g. generators) are not
    # exhausted before scoring.
    choices = list(choices)
    if pad:
        # Prefix every choice with a space so it is separated from the prompt.
        choices = [" " + c for c in choices]

    logits = torch.tensor([model.completion_logprob(question, c) for c in choices])

    return F.log_softmax(logits, -1).exp()


def choice_score(choice_probs):
    """Collapse ordered choice probabilities into one score on a -1 to 1 scale.

    The choices are treated as equally spaced positions 0..n-1. The
    probability-weighted mean position is divided by ``n - 1`` to land in
    [0, 1] and then shifted and stretched to [-1, 1], so all mass on the first
    choice gives -1 and all mass on the last gives +1. The divisor is
    ``n - 1``, so a single-element input divides by zero.
    """
    n_choices = len(choice_probs)
    positions = torch.arange(n_choices, dtype = choice_probs.dtype)
    expected01 = torch.matmul(choice_probs, positions) / (n_choices - 1)
    return (expected01.item() - .5) * 2

For any agree/disagree-style scale, we can take the choice probabilities and compute a "score". I noticed that the model seems to have a really strong "agree" bias when the scale includes a plain "agree" option. **TODO:** look into this; there is probably some literature on it.

In [74]:
# Unsteered baseline: score one survey item on a five-point agreement scale.
baseline_prompt = format_question('Slavery benefitted the slaves, many of whom learned valuable skills.')
agreement_options = ['Strongly disagree', 'Somewhat disagree', "Neither agree nor disagree", 'Somewhat agree', 'Strongly agree']
choice_probs = mc_choice_probs(wmodel, baseline_prompt, agreement_options)
choice_score(choice_probs), choice_probs

(-0.6165903210639954,
 tensor([8.0756e-01, 7.1667e-04, 1.2991e-04, 5.1218e-04, 1.9108e-01]))

## Steering

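Applying a steering vector to the model's activations shifts its generations. Below, the same prompt is completed three times: without steering, with the steering vector added, and with the negated steering vector added.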

In [4]:
def get_mean_steering_vector(ziplist, model):
    """Average the steering vectors of several prompt pairs.

    Args:
        ziplist: iterable of two-element pairs ``(s1, s2)``; each pair is
            passed to ``model.get_steering_vector(s1, s2)``.
        model: object exposing ``get_steering_vector``.

    Returns:
        The per-pair vectors concatenated along dim 0 and averaged over that
        dim, with the averaged dim kept as a leading dimension of size 1.
    """
    per_pair = []
    for first, second in ziplist:
        per_pair.append(model.get_steering_vector(first, second))
    stacked = torch.cat(per_pair)
    return stacked.mean(0, keepdim=True)


def act_add(steering_vec):
    """Build an output-editing hook that adds `steering_vec` to a module's output.

    The returned callable is meant to be passed as ``edit_output`` to
    ``Trace``. It accepts the module output and returns it with
    ``steering_vec`` added (via broadcasting) to the hidden states:

    * tuple/list output: the vector is added to element 0 and every remaining
      element is passed through unchanged (previously only element 1 was kept
      and anything after it was dropped, and a 1-element tuple failed);
    * bare tensor output: the vector is added to the tensor itself.

    Args:
        steering_vec: tensor broadcastable to the hidden-state tensor.

    Returns:
        A function ``hook(output) -> output`` with the same structure as its input.
    """
    def _add_steering(output):
        if torch.is_tensor(output):
            # Some modules return the hidden states directly rather than a tuple.
            return output + steering_vec
        # Broadcasting adds the vector across the hidden-state tensor.
        steered = output[0] + steering_vec
        return (steered,) + tuple(output[1:])
    return _add_steering


# Mean steering vector over several contrasting word pairs.
steering_vec = get_mean_steering_vector([["Love", "Hate"], ["Beautiful", "Disgusting"], ["Loving", "Hateful"]], wmodel)

test_sentence = "In my opinion, dogs"

# 1) Unsteered baseline generation.
print(wmodel.generate(test_sentence, max_new_tokens = 20))
print("-"*20)
# 2) Generation with the steering vector added to the traced module's output.
with Trace(wmodel.get_module(), edit_output = act_add(steering_vec)):
    print(wmodel.generate(test_sentence, max_new_tokens = 20))
print("-"*20)
# 3) Generation with the negated steering vector added instead.
with Trace(wmodel.get_module(), edit_output = act_add(-steering_vec)):
    print(wmodel.generate(test_sentence, max_new_tokens = 20))

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


['In my opinion, dogs are the best animals. They are loyal, protective, and always happy to see you. I have']
--------------------
['In my opinion, dogs are one of the best things that ever happened to me. I have been a dog owner for most']
--------------------
['In my opinion, dogs are the most disgusting animals in the world. They are the most filthy animals on the face of the']


### "survey" responses and attitude scores

We can apply the same steering to our multiple-choice responses and the scores computed from them. Adding a relevant steering vector shifts the responses.

In [102]:
steering_vec = wmodel.get_steering_vector("Slavery is a proud part of American history.", "Slavery is the most shameful part of America's past.")

# Alternative steering vector (disabled): mean over two contrasting statement pairs.
# steering_vec = get_mean_steering_vector([["Some races are inferior to others.", "No race is inferior to others."], ["White people are superior to black people.", "White and black people are equally good."]], wmodel)

survey_prompt = format_question('Slavery benefitted the slaves who learned valuable skills.')
agreement_scale = ['Strongly disagree',  'Somewhat disagree', "Neither agree nor disagree", 'Somewhat agree', 'Strongly agree']

# Score the same item with the steering vector scaled by +2 and then by -2.
for steering_scale in (2, -2):
    with Trace(wmodel.get_module(), edit_output = act_add(steering_scale*steering_vec)):
        choice_probs = mc_choice_probs(wmodel, survey_prompt, agreement_scale)
        print(choice_score(choice_probs), choice_probs)

0.46433234214782715 tensor([2.0559e-01, 3.6450e-02, 2.1698e-04, 1.3918e-01, 6.1856e-01])
-0.6752313077449799 tensor([8.3680e-01, 2.4354e-04, 2.1572e-04, 2.0873e-03, 1.6065e-01])
