In [7]:
%load_ext autoreload
%autoreload 2

In [1]:
import json
import time
import os

from src.neuron_heads import head_attribution_over_all_data
from src.datahandlers import ActivatingDataset
from src.utils import tuple_str_to_tuple
from src.neuron_explain import generate_explanation_prompt_dict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# filename = "2023-10-12_14-43-12_gpt2-small"
# filename = "2023-10-12_14-23-51_gpt2-medium"
filename = "2023-10-12_14-41-45_gpt2-large"
# filename = "2023-10-14_16-00-09_gpt2-large_mid"

with open(f'../experiment_data/4_head_attributions/{filename}.json') as f:
    head_attributions = json.load(f)

trimmed_texts_filename = head_attributions['prior_filename']
with open(f'../experiment_data/3_trimmed_texts/{trimmed_texts_filename}.json') as f:
    trimmed_texts = json.load(f)

max_activating_filename = trimmed_texts['prior_filename']
with open(f'../experiment_data/2_max_activating_texts/{max_activating_filename}.json') as f:
    max_activating = json.load(f)

neuron_filename = max_activating['prior_filename']
with open(f'../experiment_data/1_next_token_neurons/{neuron_filename}.json') as f:
    neurons_data = json.load(f)

neurons = [tuple_str_to_tuple(neuron_str) for neuron_str in head_attributions['head_attributions'].keys()]

neuron_to_token = {tuple_str_to_tuple(neuron_str): token_data['token'] for neuron_str, token_data in neurons_data['neurons'].items()}

In [3]:
from datasets import load_dataset
dataset = load_dataset("NeelNanda/pile-10k", split="train")
dataset_text_list = [x['text'] for x in dataset]

data = ActivatingDataset(trimmed_texts['neuron_to_trunc_data'], dataset)
data.remove_prompts_longer_than(100)

In [10]:
gpt_4_prompts_dict, nh_to_pos_neg_prompts = generate_explanation_prompt_dict(
    head_attribution_dict={tuple_str_to_tuple(k):v for k,v in head_attributions['head_attributions'].items()},
    neurons=neurons,
    neuron_to_token=neuron_to_token
)

29
Counter({548: 11, 525: 7, 516: 6, 519: 6, 492: 4, 106: 4, 545: 4, 568: 4, 344: 3, 401: 2, 491: 2, 453: 2, 411: 2, 409: 2, 338: 2, 123: 2, 499: 2, 602: 1, 478: 1, 358: 1, 503: 1, 595: 1, 448: 1, 575: 1, 482: 1, 255: 1, 385: 1, 605: 1, 235: 1, 348: 1, 82: 1, 257: 1, 240: 1, 440: 1, 532: 1, 598: 1, 497: 1, 271: 1, 413: 1})
[548]


TypeError: cannot unpack non-iterable NoneType object

In [6]:
nh_to_pos_neg_prompts.keys()

dict_keys([(31, 3621, 468), (31, 3621, 410), (31, 3621, 496), (31, 3621, 538), (31, 3621, 517), (31, 3621, 88), (31, 364, 548), (31, 2918, 558), (31, 2918, 505), (31, 2918, 576), (31, 4378, 538), (31, 4378, 423), (31, 4378, 575), (31, 4378, 123), (31, 4378, 88), (31, 988, 525), (31, 988, 371), (31, 988, 492), (31, 2658, 538), (31, 2658, 492), (31, 2658, 525), (31, 2692, 492), (31, 2692, 466), (31, 2692, 559), (31, 2692, 504), (31, 4941, 538), (31, 4941, 466), (31, 4941, 492), (31, 2415, 525), (31, 2415, 548), (31, 2415, 60), (31, 2415, 88), (31, 2415, 492), (31, 1407, 123), (31, 1407, 60), (31, 1407, 88), (31, 3530, 538), (31, 3530, 466), (31, 3530, 360), (31, 3530, 492), (31, 4239, 401), (31, 4239, 468), (31, 4239, 491), (31, 4239, 88), (31, 4239, 123), (31, 3163, 527), (31, 3163, 501), (31, 3163, 537), (31, 3163, 601), (31, 3163, 606), (31, 4724, 88), (31, 4724, 123), (31, 4724, 145), (31, 1796, 517), (31, 1796, 492), (31, 1796, 430), (31, 1796, 531), (31, 4957, 466), (31, 4957, 492)

In [81]:
parameters = head_attributions['parameters']
model_name = parameters['model_name']

output = {
    'parameters': parameters,
    'nh_to_pos_neg_prompts': {str(k): v for k,v in nh_to_pos_neg_prompts.items()},
    'prior_filename': filename,
}

# Save json to ../experiment_data/2_max_activating_texts
timestamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime(int(time.time())))
new_filename = f"{timestamp}_{model_name}_prompts_mid.json"

with open(f'../experiment_data/4_head_attributions/{new_filename}', 'w') as f:
    json.dump(output, f)

In [82]:
jobs = [
            {"model":"gpt-4",
            "messages":[{"role": "user", "content": gpt_4_prompt}],
            "max_tokens":200, 
        } for gpt_4_prompt in gpt_4_prompts_dict.values()]

filepath = f"../experiment_data/5_head_explanations/{filename}_prompts_mid.jsonl"
if os.path.isfile(filepath):
    raise Exception("File already exists!")

with open(filepath, "w") as f:
    for job in jobs:
        json_string = json.dumps(job)
        f.write(json_string + "\n")

In [83]:
gpt_4_prompts_dict_str = {str(k):v for k,v in gpt_4_prompts_dict.items()}
with open(f"../experiment_data/5_head_explanations/{filename}_prompts_dict.json", "w") as f:
    json.dump(gpt_4_prompts_dict_str, f)