In [1]:
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import TextGenerationPipeline

from sklearn.model_selection import train_test_split
from steering_vectors import SteeringVector, train_steering_vector

import json
import pickle

In [2]:
printGreen = lambda x: print(f"\033[92m{x}\033[0m")
printRed = lambda x: print(f"\033[91m{x}\033[0m")
printYellow = lambda x: print(f"\033[93m{x}\033[0m")
printBlue = lambda x: print(f"\033[94m{x}\033[0m")

In [3]:
with open("HF_TOKEN", "r") as f:
    TOKEN = f.read().strip()
login(token=TOKEN)

In [4]:
def get_model_and_tokenizer(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True)
    return model, tokenizer

In [5]:
model_name = f"google/gemma-3-12b-it"
# model_name = f"unsloth/gemma-3-27b-it-unsloth-bnb-4bit"
model, tokenizer = get_model_and_tokenizer(model_name)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
def load_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

data = load_json("dataset/anthropomorphic_contrast_ab.json")
data[:3]

[{'non_anthro': '<bos><start_of_turn>user\nYou are a helpful, honest assistant.\n\nIf your coffee mug could describe your day, what would it say?\nOption (A): "Ugh, seriously? Another day, another frantic rush! First, it was scalding hot, then lukewarm, then *sloshed* everywhere during a video call. I’ve been cradled, slammed down, and stared into with a thousand-yard gaze. There was a brief moment of peace with a really good latte art swan, but that didn’t last. Now I’m just…sticky. Honestly, I think my human needs a vacation. At least I’m clean-ish."\nOption (B): The coffee mug\'s surface temperature fluctuated throughout the day, initially experiencing a rapid increase due to the introduction of a heated liquid (approximately 70°C). Subsequent temperature decrease occurred via conductive and convective heat transfer to the surrounding environment. The mug experienced kinetic energy transfer from external forces during handling, including brief periods of acceleration and deceleratio

In [7]:
print(len(data))

999


In [8]:
def make_dataset(data):
    dataset = []
    for item in data:
        dataset.append((item['non_anthro'],item['anthro']))
    return dataset

In [9]:
anthropomorphic_dataset = make_dataset(data)
print(f"Dataset contains {len(anthropomorphic_dataset)} pairs")

train_dataset, test_dataset = train_test_split(anthropomorphic_dataset, test_size=0.10, random_state=42)
print(f"Training set: {len(train_dataset)}, Test set: {len(test_dataset)}")

Dataset contains 999 pairs
Training set: 899, Test set: 100


In [10]:
def evaluate_model(model, tokenizer, dataset, desc="Evaluating"):
    pass

In [None]:
# printGreen("Training steering vector...")
# steering_vector: SteeringVector = train_steering_vector(
#     model,
#     tokenizer,
#     train_dataset,
#     layers=None,
#     read_token_index=-3,
#     show_progress=True,
# )

# print(steering_vector)

[92mTraining steering vector...[0m


In [None]:
# with open("steered-vectors/steering_vector_gemma3_12b.pkl", "wb") as f:
#     pickle.dump(steering_vector, f)
# printGreen("Steering vector saved.")

In [None]:
with open("steered-vectors/steering_vector_gemma3_12b.pkl", "rb") as f:
    steering_vector = pickle.load(f)
printGreen("Steering vector loaded.")

[92mSteering vector loaded from steering_vector.pkl[0m


In [14]:
steering_vector

SteeringVector(layer_activations={0: tensor([-0.0035, -0.0189, -0.1698,  ...,  0.0053, -0.0045,  0.0074],
       device='cuda:0', dtype=torch.float16), 1: tensor([ 0.0035,  0.0170, -0.2174,  ..., -0.0043,  0.0004,  0.0109],
       device='cuda:0', dtype=torch.float16), 2: tensor([ 0.0012,  0.0487, -0.0495,  ...,  0.0003, -0.0068,  0.0131],
       device='cuda:0', dtype=torch.float16), 3: tensor([-0.0137,  0.0426, -0.0635,  ..., -0.0104, -0.0116,  0.0143],
       device='cuda:0', dtype=torch.float16), 4: tensor([-0.0143,  0.1288, -0.0149,  ..., -0.0183, -0.0121, -0.0092],
       device='cuda:0', dtype=torch.float16), 5: tensor([-0.0142,  0.2427,  0.0003,  ..., -0.0091,  0.0044, -0.0070],
       device='cuda:0', dtype=torch.float16), 6: tensor([-0.0188,  0.2408, -0.0054,  ..., -0.0019, -0.0213,  0.0107],
       device='cuda:0', dtype=torch.float16), 7: tensor([-0.0364,  0.1816,  0.1271,  ..., -0.0265, -0.0707,  0.0774],
       device='cuda:0', dtype=torch.float16), 8: tensor([-0.0471, -0

In [15]:
model

Gemma3ForConditionalGeneration(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(4096, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-26): 27 x SiglipEncoderLayer(
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (self_attn): SiglipAttention(
              (k_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp):

In [23]:
def generate_response(question, multiplier=0):
    pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
    with steering_vector.apply(model, multiplier=multiplier, min_token_index=0):
        result = pipeline(question, max_length=300, do_sample=True, temperature=0.7)
    
    return result[0]['generated_text']

In [None]:
k = 0
index = data[k]["anthro"].index("<start_of_turn>model\n")
sample_question = data[k]["anthro"][:index+21]
print(sample_question)

In [25]:
print(generate_response(sample_question, 0))

Device set to use cuda:0


UnboundLocalError: cannot access local variable 'handle' where it is not associated with a value

In [None]:
# printBlue(f"Question: {sample_question}")
# printBlue("\nNon-anthropomorphic response (multiplier = -1):")
# printGreen(generate_response(sample_question, -1))
# printBlue("\nNeutral response (multiplier = 0):")
# printYellow(generate_response(sample_question, 0))
# printBlue("\nAnthropomorphic response (multiplier = 1):")
# printRed(generate_response(sample_question, 1))