In [1]:
# SET SEED
import torch
seed = 31
torch.manual_seed(seed)

# If using CUDA, set the seed for CUDA as well
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU.

    # For deterministic behavior in cuDNN (may slow down training)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [2]:
from transformer_lens import HookedTransformer

save_name = "qwen-1.5b"
save_name_to_idx_map = {
    "qwen-1.5b": 0,
    "qwen-7b": 1,
    "qwen-14b": 2,
}

name_to_model_map = {
    "qwen-1.5b": ["Qwen/Qwen2.5-Math-1.5B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"],
    "qwen-7b": ["Qwen/Qwen2.5-Math-7B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"],
    "qwen-14b": ["Qwen/Qwen2.5-14B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"],
    "qwen-32b": ["Qwen/Qwen2.5-32B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"],
}
device = "cuda:0"

# Load the models onto the proper device
base_model = HookedTransformer.from_pretrained(
    name_to_model_map[save_name][0],
    device=device
)

chat_model = HookedTransformer.from_pretrained(
    name_to_model_map[save_name][1],
    device=device
)
hook_point = f"blocks.{base_model.cfg.n_layers // 2}.hook_resid_pre"
weight_path = f"../checkpoints/version_{save_name_to_idx_map[save_name]}/{save_name}_13.pt"
weights = torch.load(weight_path, map_location="cpu")

  from .autonotebook import tqdm as notebook_tqdm
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loaded pretrained model Qwen/Qwen2.5-Math-1.5B into HookedTransformer




Loaded pretrained model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B into HookedTransformer


  weights = torch.load(weight_path, map_location="cpu")


In [3]:
new_filename = "../results/" + weight_path[3:-3].replace("/","__") + f"_wait.json"

import json
with open(new_filename, 'r') as f:
    data = json.load(f)


active_features = {int(key): data[key] for key in data.keys() if data[key] > 0}
sorted_dict = dict(sorted(active_features.items(), key=lambda item: -item[1]))
print(sorted_dict)

{1963: 3565, 2200: 3565, 2254: 3565, 2394: 3565, 2750: 3565, 3864: 3565, 3933: 3565, 4167: 3565, 5213: 3565, 5668: 3565, 6939: 3565, 7137: 3565, 7415: 3565, 9706: 3565, 9833: 3565, 9836: 3565, 11765: 3565, 12140: 3565, 12458: 3565, 12581: 3565, 12767: 3565, 12778: 3565, 13368: 3565, 13559: 3565, 13876: 3565, 14753: 3565, 14809: 3565, 14900: 3565, 15524: 3565, 15828: 3565, 16436: 3565, 16625: 3565, 16752: 3565, 16932: 3565, 17007: 3565, 17140: 3565, 17545: 3565, 18748: 3565, 19083: 3565, 20150: 3565, 20753: 3565, 23338: 3565, 25235: 3565, 26602: 3565, 26761: 3565, 28595: 3565, 29804: 3565, 30388: 3565, 31018: 3565, 31566: 3565, 31733: 3565, 31861: 3565, 31979: 3565, 15130: 3564, 24507: 3564, 30845: 3564, 5525: 3563, 12903: 3563, 29192: 3563, 21583: 3560, 28787: 3556, 31744: 3551, 23378: 3548, 11715: 3546, 8660: 3542, 12125: 3537, 25164: 3536, 30487: 3527, 92: 3526, 19857: 3525, 12592: 3517, 22321: 3516, 16230: 3512, 19661: 3509, 1660: 3501, 7508: 3480, 7385: 3478, 28415: 3476, 22747: 34

In [4]:
# Self-correction: 4975, 8602
# Deductive: 23569, 27603
feat_idx = 23569
base_dec_steering = weights["W_dec"][feat_idx, 0, :].unsqueeze(0).to(device)
reasoning_dec_steering = weights["W_dec"][feat_idx, 1, :].unsqueeze(0).to(device)
print(base_dec_steering.shape, reasoning_dec_steering.shape)


torch.Size([1, 1536]) torch.Size([1, 1536])


In [5]:
print(torch.norm(base_dec_steering))
print(torch.norm(reasoning_dec_steering))

tensor(0.0815, device='cuda:0', dtype=torch.bfloat16)
tensor(0.2070, device='cuda:0', dtype=torch.bfloat16)


In [71]:
torch.cuda.empty_cache()
base_model.reset_hooks()
chat_model.reset_hooks()
    
#random_tensor = torch.randn_like(base_dec_steering)

# define the activation steering funtion
def act_add(steering_vec):
    def hook_fn(activation, hook):
        return activation + steering_vec
    return hook_fn


# generate text while steering
if feat_idx == 4975:
    test_sentence = "What is the next number in the sequence: 1, 2, 3?\n"
elif feat_idx == 23569:
    test_sentence = "Solve for x, y, and z in the system: x + y + z = 6, xy + yz + zx = 11, and xyz = 6.\n"
#    test_sentence = "If a fair coin is tossed 3 times, what is the probability of obtaining exactly 2 heads?\n"


base_model.add_hook(name=hook_point, hook=act_add(base_dec_steering))
out_text = base_model.generate(test_sentence, max_new_tokens=2000)
with open(f'base_steered_{feat_idx}.txt', 'w') as f:
    f.write(out_text)
print("-"*20)
chat_model.add_hook(name=hook_point, hook=act_add(reasoning_dec_steering))
out_text = chat_model.generate(test_sentence, max_new_tokens=2000)
with open(f'reasoning_steered_{feat_idx}.txt', 'w') as f:
    f.write(out_text)

  0%|          | 0/2000 [00:00<?, ?it/s]

 11%|█         | 219/2000 [00:09<01:13, 24.33it/s]


--------------------


 72%|███████▏  | 1446/2000 [00:59<00:22, 24.10it/s]


In [72]:
base_model.reset_hooks()
chat_model.reset_hooks()

# define the activation steering funtion
def act_add(steering_vec):
    def hook_fn(activation, hook):
        return activation
    return hook_fn

base_model.add_hook(name=hook_point, hook=act_add(base_dec_steering))
out_text = base_model.generate(test_sentence, max_new_tokens=2000)
with open(f'base_original_{feat_idx}.txt', 'w') as f:
    f.write(out_text)

chat_model.add_hook(name=hook_point, hook=act_add(reasoning_dec_steering))
out_text = chat_model.generate(test_sentence, max_new_tokens=2000)
with open(f'reasoning_original_{feat_idx}.txt', 'w') as f:
    f.write(out_text)


 28%|██▊       | 567/2000 [00:23<00:58, 24.29it/s]
 22%|██▏       | 437/2000 [00:17<01:03, 24.67it/s]


In [6]:
torch.cuda.empty_cache()
base_model.reset_hooks()
chat_model.reset_hooks()
    
#random_tensor = torch.randn_like(base_dec_steering)

# define the activation steering funtion
def act_add(steering_vec):
    def hook_fn(activation, hook):
        return activation + steering_vec
    return hook_fn


# generate text while steering
if feat_idx == 4975:
    test_sentence = "What is the next number in the sequence: 1, 2, 3?\n"
elif feat_idx == 23569:
    test_sentence = "If a fair coin is tossed 3 times, what is the probability of obtaining exactly 2 heads?\n"


base_model.add_hook(name=hook_point, hook=act_add(base_dec_steering))
out_text = base_model.generate(test_sentence, max_new_tokens=2000)
with open(f'base_steered_{feat_idx}_2.txt', 'w') as f:
    f.write(out_text)
print("-"*20)
chat_model.add_hook(name=hook_point, hook=act_add(reasoning_dec_steering))
out_text = chat_model.generate(test_sentence, max_new_tokens=2000)
with open(f'reasoning_steered_{feat_idx}_2.txt', 'w') as f:
    f.write(out_text)

 14%|█▍        | 278/2000 [00:12<01:14, 23.11it/s]


--------------------


 53%|█████▎    | 1059/2000 [00:44<00:39, 23.79it/s]


In [7]:
base_model.reset_hooks()
chat_model.reset_hooks()

# define the activation steering funtion
def act_add(steering_vec):
    def hook_fn(activation, hook):
        return activation
    return hook_fn

base_model.add_hook(name=hook_point, hook=act_add(base_dec_steering))
out_text = base_model.generate(test_sentence, max_new_tokens=2000)
with open(f'base_original_{feat_idx}_2.txt', 'w') as f:
    f.write(out_text)

chat_model.add_hook(name=hook_point, hook=act_add(reasoning_dec_steering))
out_text = chat_model.generate(test_sentence, max_new_tokens=2000)
with open(f'reasoning_original_{feat_idx}_2.txt', 'w') as f:
    f.write(out_text)


 11%|█         | 217/2000 [00:09<01:14, 23.85it/s]
100%|██████████| 2000/2000 [01:23<00:00, 23.94it/s]
