In [11]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import time
import os

from src.datahandlers import ActivatingDataset
from src.utils import tuple_str_to_tuple
from src.neuron_explain import generate_explanation_prompt_dict

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# filename = "2023-10-12_14-43-12_gpt2-small"
# filename = "2023-10-12_14-23-51_gpt2-medium"
# filename = "2023-10-12_14-41-45_gpt2-large"
# filename = "2023-10-14_16-00-09_gpt2-large_mid"

# filename = "2024-02-15_18-11-01_gpt2-large_train_30p" # gpt2-large 30 prompts example
# filename = "2024-02-15_18-31-43_gpt2-large_train_20p" # gpt2-large 20 prompts example
# filename = "2024-02-15_18-41-56_gpt2-large_train_10p" # gpt2-large 10 prompts example

# filename = "2024-02-16_05-16-44_gpt2-large_train_random" # gpt2-large random neurons example

# filename = "2024-02-15_01-59-23_pythia-1.4b_train"
# filename = "2024-02-15_02-19-16_pythia-410m_train" # need fr
# filename = "2024-02-15_02-22-40_pythia-160m_train"
# filename = "2024-02-16_07-35-30_pythia-160m_train_fr_acl" # 160m fr acl (20p)



with open(f'../experiment_data/4_head_attributions/{filename}.json') as f:
    head_attributions = json.load(f)

trimmed_texts_filename = head_attributions['prior_filename']
with open(f'../experiment_data/3_trimmed_texts/{trimmed_texts_filename}.json') as f:
    trimmed_texts = json.load(f)

max_activating_filename = trimmed_texts['prior_filename']
with open(f'../experiment_data/2_max_activating_texts/{max_activating_filename}.json') as f:
    max_activating = json.load(f)

neuron_filename = max_activating['prior_filename']
with open(f'../experiment_data/1_next_token_neurons/{neuron_filename}.json') as f:
    neurons_data = json.load(f)

neurons = [tuple_str_to_tuple(neuron_str) for neuron_str in head_attributions['head_attributions'].keys()]

neuron_to_token = {tuple_str_to_tuple(neuron_str): token_data['token'] for neuron_str, token_data in neurons_data['neurons'].items()}

In [18]:
from datasets import load_dataset
dataset = load_dataset("NeelNanda/pile-10k", split="train")
dataset_text_list = [x['text'] for x in dataset]

data = ActivatingDataset(trimmed_texts['neuron_to_trunc_data'], dataset)
data.remove_prompts_longer_than(100)

In [19]:
gpt_4_prompts_dict, nh_to_pos_neg_prompts = generate_explanation_prompt_dict(
    head_attribution_dict={tuple_str_to_tuple(k):v for k,v in head_attributions['head_attributions'].items()},
    neurons=neurons,
    neuron_to_token=neuron_to_token
)

In [20]:
lengths = []

for v in nh_to_pos_neg_prompts.values():
    lengths.append(len(v[0])+len(v[1]))

mean_length = sum(lengths)/len(lengths)
print(mean_length)

19.65068493150685


In [6]:
nh_to_pos_neg_prompts.keys()

dict_keys([(31, 3621, 367), (31, 3621, 410), (31, 3621, 123), (31, 3621, 517), (31, 3621, 360), (31, 3621, 559), (31, 364, 254), (31, 364, 304), (31, 364, 344), (31, 364, 367), (31, 364, 384), (31, 364, 401), (31, 364, 411), (31, 364, 453), (31, 364, 485), (31, 364, 491), (31, 364, 503), (31, 364, 516), (31, 364, 602), (31, 364, 320), (31, 364, 329), (31, 364, 493), (31, 364, 523), (31, 364, 106), (31, 364, 123), (31, 364, 177), (31, 364, 281), (31, 364, 319), (31, 364, 409), (31, 364, 470), (31, 364, 478), (31, 364, 492), (31, 364, 519), (31, 364, 145), (31, 364, 548), (31, 364, 525), (31, 364, 568), (31, 2918, 341), (31, 2918, 376), (31, 2918, 406), (31, 2918, 430), (31, 2918, 433), (31, 2918, 466), (31, 2918, 483), (31, 2918, 504), (31, 2918, 558), (31, 2918, 60), (31, 2918, 410), (31, 2918, 496), (31, 2918, 525), (31, 2918, 576), (31, 2918, 401), (31, 2918, 411), (31, 2918, 428), (31, 2918, 453), (31, 2918, 472), (31, 2918, 384), (31, 4378, 365), (31, 4378, 496), (31, 4378, 527), (

In [21]:
parameters = head_attributions['parameters']
model_name = parameters['model_name']

output = {
    'parameters': parameters,
    'nh_to_pos_neg_prompts': {str(k): v for k,v in nh_to_pos_neg_prompts.items()},
    'prior_filename': filename,
}

# Save json to ../experiment_data/2_max_activating_texts
timestamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime(int(time.time())))
new_filename = f"{timestamp}_{model_name}_acl.json"

with open(f'../experiment_data/4_head_attributions/{new_filename}', 'w') as f:
    json.dump(output, f)

In [22]:
jobs = [
            {"model":"gpt-4-1106-preview",
            "messages":[{"role": "user", "content": gpt_4_prompt}],
            "max_tokens":200, 
        } for gpt_4_prompt in gpt_4_prompts_dict.values()]

filepath = f"../experiment_data/5_head_explanations/{filename}_acl.jsonl"
if os.path.isfile(filepath):
    raise Exception("File already exists!")

with open(filepath, "w") as f:
    for job in jobs:
        json_string = json.dumps(job)
        f.write(json_string + "\n")

In [24]:
gpt_4_prompts_dict_str = {str(k):v for k,v in gpt_4_prompts_dict.items()}
with open(f"../experiment_data/5_head_explanations/{filename}_prompts_dict_acl.json", "w") as f:
    json.dump(gpt_4_prompts_dict_str, f)

In [46]:
sum(prompt_lengths) * 0.01/1000

5.2131099999999995

In [47]:
prompts = list(gpt_4_prompts_dict.values())

string = ""
for prompt in prompts:
    string += prompt + "\n\n"

with open(f"../experiment_data/5_head_explanations/{filename}_prompts.txt", "w") as f:
    f.write(string)
    

In [23]:
import tiktoken

prompts = list(gpt_4_prompts_dict.values())
encoder = tiktoken.encoding_for_model("gpt-4")
encoded_prompts = [encoder.encode(prompt) for prompt in prompts]

sum([len(x) for x in encoded_prompts]) * 0.01/1000

1.4616600000000002