## Setup

### Imports

In [1]:
import os
import json
import glob
import torch
import re
import einops
import pandas as pd
from functools import partial
from torch import Tensor
from torchtyping import TensorType as TT


import plotly.express as px

from utils.data_utils import generate_data_and_caches
from utils.data_processing import (
    load_edge_scores_into_dictionary,
)
from utils.visualization import plot_attention_heads, imshow_p
from utils.backup_analysis import (
    load_model,
    run_iteration,
    process_backup_results,
    get_past_nmhs_for_checkpoints,
    plot_top_heads
)

In [2]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Switch gradient tracking off for the rest of the notebook; the call is left
# as the final expression so the cell still displays its return value.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f7d29392a40>

### Functions

## Experiments

### Experiment Parameters

In [3]:
# Task name; it is the last path component of the circuit-graph results folder.
TASK = 'ioi'
PERFORMANCE_METRIC = 'logit_diff'
BASE_MODEL = "pythia-70m"
# Optional model variant given as an "<org>/<name>" identifier; None selects BASE_MODEL.
VARIANT = None #"EleutherAI/pythia-70m-weight-seed3"
# Name used in results paths. Take the part after the last "/" instead of slicing
# off a fixed 11 characters, which was only correct for the "EleutherAI/" prefix.
MODEL_SHORTNAME = VARIANT.split("/")[-1] if VARIANT else BASE_MODEL
# Passed to load_model as its cache argument.
CACHE = "model_cache"
# Intended size of the IOI dataset.
IOI_DATASET_SIZE = 70
# Passed to run_iteration as `threshold`; presumably a copy-score percentage — TODO confirm.
COPY_SCORE_THRESHOLD = 75.0

### Circuit Data

In [4]:
# Load the per-edge scores stored under the results folder for this model and task.
folder_path = f'results/graphs/{MODEL_SHORTNAME}/{TASK}'
df = load_edge_scores_into_dictionary(folder_path)

# filter everything before 1000 steps
# Take an explicit copy so the column assignment below writes to an independent
# frame instead of a filtered slice (avoids pandas' SettingWithCopyWarning).
df = df[df['checkpoint'] >= 1000].copy()

# Split each "source->target" edge label into its two endpoint columns.
df[['source', 'target']] = df['edge'].str.split('->', expand=True)
# Displayed as the cell output: number of distinct edge targets.
len(df['target'].unique())

Processing file 1/143: results/graphs/pythia-70m/ioi/57000.json
                 edge     score  in_circuit  checkpoint
0     input->a0.h0<q>  0.001869       False       57000
1     input->a0.h0<k>  0.000444       False       57000
2     input->a0.h0<v>  0.000212       False       57000
3     input->a0.h1<q>  0.000359       False       57000
4     input->a0.h1<k> -0.000362       False       57000
...               ...       ...         ...         ...
3575    a5.h4->logits -0.000093       False       57000
3576    a5.h5->logits  0.002090       False       57000
3577    a5.h6->logits -0.000176       False       57000
3578    a5.h7->logits  0.001198       False       57000
3579       m5->logits  0.021729       False       57000

[3580 rows x 4 columns]
Processing file 2/143: results/graphs/pythia-70m/ioi/141000.json
                 edge     score  in_circuit  checkpoint
0     input->a0.h0<q>  0.001869       False       57000
1     input->a0.h0<k>  0.000444       False       57000
2     

151

### Dataset Setup

In [5]:
# Load checkpoint 143000 of the configured model; it is only used here to build the datasets.
initial_model = load_model(BASE_MODEL, VARIANT, 143000, CACHE, device)
# Read the dataset size from the experiment parameters instead of repeating the
# literal 70, so changing IOI_DATASET_SIZE actually takes effect. `size` is kept
# as a name in case later cells still refer to it.
size = IOI_DATASET_SIZE
ioi_dataset, abc_dataset = generate_data_and_caches(initial_model, size, verbose=True)



tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer


In [6]:
# NOTE(review): disabled plotting snippet, kept for reference only.
# `per_head_ablated_logit_diffs` is not defined in any cell visible above this
# one — confirm where it comes from before re-enabling the call.
# imshow_p(
#     per_head_ablated_logit_diffs,
#     title="Headwise logit diff contribution, post NMH KO",
#     labels={"x": "Head", "y": "Layer", "color": "Logit diff attribution"},
#     #coloraxis=dict(colorbar_ticksuffix = "%"),
#     border=True,
#     width=600,
#     margin={"r": 100, "l": 100}
# )

### Run Experiment

In [7]:
# Output directory and file that receive the accumulated metrics.
backup_dir = f'results/backup/{MODEL_SHORTNAME}'
metrics_path = f'results/backup/{MODEL_SHORTNAME}/nmh_backup_metrics.pt'
os.makedirs(backup_dir, exist_ok=True)

# Running metrics object; each helper below receives it and hands back the updated one.
experiment_metrics = {}

for checkpoint in range(4000, 144000, 1000):
    experiment_metrics = run_iteration(
        BASE_MODEL,
        VARIANT,
        df,
        checkpoint=checkpoint,
        dataset=ioi_dataset,
        experiment_metrics=experiment_metrics,
        threshold=COPY_SCORE_THRESHOLD,
    )
    experiment_metrics = process_backup_results(df, checkpoint, experiment_metrics)

    # Overwrite the metrics file (PyTorch format) after every checkpoint, so the
    # file on disk always holds the results gathered so far.
    torch.save(experiment_metrics, metrics_path)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 18.571428571428573%
Checkpoint 4000:
Heads ablated:            [(4, 6), (3, 6)]
Original logit diff:      -0.8775387406
Post ablation logit diff: -0.5249566436
Logit diff % change:      -40.18%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 8.095238095238095%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 30.476190476190478%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 38.095238095238095%
Checkpoint 5000:
Heads ablated:            [(4, 6)]
Original logit diff:      -0.9491387606
Post ablation logit diff: -0.2954781055
Logit diff % change:      -68.87%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 57.61904761904761%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 50.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 16.19047619047619%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 89.04761904761904%




Checkpoint 6000:
Heads ablated:            [(4, 6), (3, 1)]
Original logit diff:      -0.8585914969
Post ablation logit diff: -0.2592725456
Logit diff % change:      -69.80%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 54.285714285714285%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 27.61904761904762%




Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 59.523809523809526%
Checkpoint 7000:
Heads ablated:            [(4, 6)]
Original logit diff:      -0.2771649659
Post ablation logit diff: 0.0070780516
Logit diff % change:      -102.55%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 8000:
Heads ablated:            []
Original logit diff:      0.0115882810
Post ablation logit diff: 0.0115882810
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 9000:
Heads ablated:            []
Original logit diff:      0.2201550752
Post ablation logit diff: 0.2201550752
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 10000:
Heads ablated:            []
Original logit diff:      0.0404976159
Post ablation logit diff: 0.0404976159
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 11000:
Heads ablated:            []
Original logit diff:      0.3416843414
Post ablation logit diff: 0.3416843414
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 12000:
Heads ablated:            []
Original logit diff:      0.4083966017
Post ablation logit diff: 0.4083966017
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 13000:
Heads ablated:            []
Original logit diff:      0.3803787529
Post ablation logit diff: 0.3803787529
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 90.95238095238095%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 52.85714285714286%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 14000:
Heads ablated:            [(3, 5), (4, 6)]
Original logit diff:      0.6185692549
Post ablation logit diff: 1.0815589428
Logit diff % change:      74.85%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 92.38095238095238%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 57.14285714285714%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 15000:
Heads ablated:            [(3, 5), (4, 6)]
Original logit diff:      0.6759502888
Post ablation logit diff: 1.0873620510
Logit diff % change:      60.86%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 58.0952380952381%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 90.95238095238095%




Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 60.0%
Checkpoint 16000:
Heads ablated:            [(4, 6), (3, 5)]
Original logit diff:      0.7325052619
Post ablation logit diff: 1.1954573393
Logit diff % change:      63.20%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 17000:
Heads ablated:            []
Original logit diff:      0.7017971873
Post ablation logit diff: 0.7017971873
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 57.61904761904761%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 92.38095238095238%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 61.904761904761905%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 1.4285714285714286%




Checkpoint 18000:
Heads ablated:            [(3, 5), (4, 6)]
Original logit diff:      0.7491797209
Post ablation logit diff: 1.3264752626
Logit diff % change:      77.06%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 92.85714285714286%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 65.71428571428571%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 19000:
Heads ablated:            [(3, 5), (4, 6)]
Original logit diff:      0.7100559473
Post ablation logit diff: 1.2574195862
Logit diff % change:      77.09%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 20000:
Heads ablated:            []
Original logit diff:      0.7370791435
Post ablation logit diff: 0.7370791435
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 21000:
Heads ablated:            []
Original logit diff:      0.8281600475
Post ablation logit diff: 0.8281600475
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 22000:
Heads ablated:            []
Original logit diff:      0.8246081471
Post ablation logit diff: 0.8246081471
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 96.66666666666667%




Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 73.80952380952381%
Checkpoint 23000:
Heads ablated:            [(4, 6), (3, 5)]
Original logit diff:      1.0021547079
Post ablation logit diff: 1.4895738363
Logit diff % change:      48.64%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 68.0952380952381%
Checkpoint 24000:
Heads ablated:            [(3, 5), (4, 6)]
Original logit diff:      0.5890020132
Post ablation logit diff: 1.0051034689
Logit diff % change:      70.65%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 25000:
Heads ablated:            []
Original logit diff:      0.7022525072
Post ablation logit diff: 0.7022525072
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 26000:
Heads ablated:            []
Original logit diff:      0.4500888288
Post ablation logit diff: 0.4500888288
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 32.857142857142854%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 76.66666666666667%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 27000:
Heads ablated:            [(3, 5), (4, 4), (4, 6)]
Original logit diff:      0.7189214826
Post ablation logit diff: 1.5496541262
Logit diff % change:      115.55%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 28000:
Heads ablated:            []
Original logit diff:      0.2834420502
Post ablation logit diff: 0.2834420502
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 77.14285714285715%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 99.52380952380952%




Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 29000:
Heads ablated:            [(4, 4), (3, 5), (4, 6)]
Original logit diff:      0.6070242524
Post ablation logit diff: 1.6267482042
Logit diff % change:      167.99%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer




Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 81.42857142857143%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 30000:
Heads ablated:            [(4, 4), (4, 6)]
Original logit diff:      1.2188258171
Post ablation logit diff: 2.4047615528
Logit diff % change:      97.30%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 76.66666666666667%




Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 31000:
Heads ablated:            [(4, 4), (4, 6)]
Original logit diff:      0.9041007161
Post ablation logit diff: 1.9113290310
Logit diff % change:      111.41%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 32000:
Heads ablated:            []
Original logit diff:      0.7107272148
Post ablation logit diff: 0.7107272148
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 83.33333333333334%
Checkpoint 33000:
Heads ablated:            [(4, 6), (4, 4)]
Original logit diff:      0.7348417044
Post ablation logit diff: 1.8796205521
Logit diff % change:      155.79%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 34000:
Heads ablated:            []
Original logit diff:      0.7538008690
Post ablation logit diff: 0.7538008690
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 35000:
Heads ablated:            []
Original logit diff:      0.6723266840
Post ablation logit diff: 0.6723266840
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 36000:
Heads ablated:            []
Original logit diff:      0.8214648962
Post ablation logit diff: 0.8214648962
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 37000:
Heads ablated:            []
Original logit diff:      0.9073672891
Post ablation logit diff: 0.9073672891
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 83.33333333333334%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 38000:
Heads ablated:            [(4, 6), (4, 4), (3, 5)]
Original logit diff:      1.0821753740
Post ablation logit diff: 2.2819392681
Logit diff % change:      110.87%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 39000:
Heads ablated:            []
Original logit diff:      0.5315364599
Post ablation logit diff: 0.5315364599
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 81.42857142857143%
Checkpoint 40000:
Heads ablated:            [(4, 6), (4, 4)]
Original logit diff:      0.7468996644
Post ablation logit diff: 2.1283068657
Logit diff % change:      184.95%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 41000:
Heads ablated:            []
Original logit diff:      0.6730849743
Post ablation logit diff: 0.6730849743
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 86.66666666666667%
Checkpoint 42000:
Heads ablated:            [(4, 6), (4, 4)]
Original logit diff:      0.7323057652
Post ablation logit diff: 2.0341403484
Logit diff % change:      177.77%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 43000:
Heads ablated:            []
Original logit diff:      0.4700644314
Post ablation logit diff: 0.4700644314
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 44000:
Heads ablated:            []
Original logit diff:      0.4418857098
Post ablation logit diff: 0.4418857098
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 45000:
Heads ablated:            []
Original logit diff:      0.4903239608
Post ablation logit diff: 0.4903239608
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 46000:
Heads ablated:            []
Original logit diff:      0.6233853102
Post ablation logit diff: 0.6233853102
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 47000:
Heads ablated:            []
Original logit diff:      0.7634024620
Post ablation logit diff: 0.7634024620
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 48000:
Heads ablated:            []
Original logit diff:      0.1296335012
Post ablation logit diff: 0.1296335012
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 85.71428571428571%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 49000:
Heads ablated:            [(4, 4), (4, 6)]
Original logit diff:      0.9950267076
Post ablation logit diff: 2.9128465652
Logit diff % change:      192.74%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 88.09523809523809%
Checkpoint 50000:
Heads ablated:            [(4, 6), (4, 4)]
Original logit diff:      1.1135458946
Post ablation logit diff: 2.7943894863
Logit diff % change:      150.95%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 51000:
Heads ablated:            []
Original logit diff:      0.3575604558
Post ablation logit diff: 0.3575604558
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 52000:
Heads ablated:            []
Original logit diff:      0.2349558175
Post ablation logit diff: 0.2349558175
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 53000:
Heads ablated:            []
Original logit diff:      0.6085900664
Post ablation logit diff: 0.6085900664
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 54000:
Heads ablated:            []
Original logit diff:      0.4084658921
Post ablation logit diff: 0.4084658921
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 55000:
Heads ablated:            []
Original logit diff:      0.4217540324
Post ablation logit diff: 0.4217540324
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 56000:
Heads ablated:            []
Original logit diff:      0.4323191643
Post ablation logit diff: 0.4323191643
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 57000:
Heads ablated:            []
Original logit diff:      0.6403176188
Post ablation logit diff: 0.6403176188
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 58000:
Heads ablated:            []
Original logit diff:      0.5754644871
Post ablation logit diff: 0.5754644871
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 59000:
Heads ablated:            []
Original logit diff:      1.0096291304
Post ablation logit diff: 1.0096291304
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 60000:
Heads ablated:            []
Original logit diff:      0.6222140789
Post ablation logit diff: 0.6222140789
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 61000:
Heads ablated:            []
Original logit diff:      0.7324095368
Post ablation logit diff: 0.7324095368
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Checkpoint 62000:
Heads ablated:            [(3, 5), (4, 4), (4, 6)]
Original logit diff:      0.6263018250
Post ablation logit diff: 2.2275896072
Logit diff % change:      255.67%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 4.0 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 36.19047619047619%
Checkpoint 63000:
Heads ablated:            [(4, 6), (4, 4)]
Original logit diff:      0.3326402009
Post ablation logit diff: 2.4476330280
Logit diff % change:      635.82%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 64000:
Heads ablated:            []
Original logit diff:      0.7449644208
Post ablation logit diff: 0.7449644208
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 99.52380952380952%




Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 39.04761904761905%
Checkpoint 65000:
Heads ablated:            [(4, 6), (4, 4)]
Original logit diff:      0.5483941436
Post ablation logit diff: 2.4920144081
Logit diff % change:      354.42%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 66000:
Heads ablated:            []
Original logit diff:      0.2861988246
Post ablation logit diff: 0.2861988246
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 67000:
Heads ablated:            []
Original logit diff:      0.5615631938
Post ablation logit diff: 0.5615631938
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 68000:
Heads ablated:            []
Original logit diff:      1.0039848089
Post ablation logit diff: 1.0039848089
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 69000:
Heads ablated:            []
Original logit diff:      0.7860768437
Post ablation logit diff: 0.7860768437
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 70000:
Heads ablated:            []
Original logit diff:      0.6147008538
Post ablation logit diff: 0.6147008538
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 71000:
Heads ablated:            []
Original logit diff:      0.6746646762
Post ablation logit diff: 0.6746646762
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 72000:
Heads ablated:            []
Original logit diff:      0.9760091305
Post ablation logit diff: 0.9760091305
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 73000:
Heads ablated:            []
Original logit diff:      0.3267331719
Post ablation logit diff: 0.3267331719
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 74000:
Heads ablated:            []
Original logit diff:      0.7971808314
Post ablation logit diff: 0.7971808314
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 64.28571428571429%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 4.0 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 93.80952380952381%
Checkpoint 75000:
Heads ablated:            [(4, 6), (4, 4), (3, 5), (3, 3)]
Original logit diff:      0.1481638998
Post ablation logit diff: 0.8562752008
Logit diff % change:      477.92%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 15.238095238095239%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 4.0 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 59.523809523809526%
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 9.047619047619047%




Checkpoint 76000:
Heads ablated:            [(4, 6), (4, 4), (3, 5), (3, 3)]
Original logit diff:      0.2394576818
Post ablation logit diff: 0.9086235166
Logit diff % change:      279.45%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 64.76190476190476%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 97.14285714285714%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 4.0 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 4.2 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 20.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Checkpoint 77000:
Heads ablated:            [(3, 3), (3, 5), (4, 6), (4, 4)]
Original logit diff:      0.4678331614
Post ablation logit diff: 0.9344913960
Logit diff % change:      99.75%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 78000:
Heads ablated:            []
Original logit diff:      0.5213122368
Post ablation logit diff: 0.5213122368
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 26.666666666666668%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 4.2 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 4.0 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 71.9047619047619%
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 98.57142857142858%




Checkpoint 79000:
Heads ablated:            [(4, 6), (4, 4), (3, 3), (3, 5)]
Original logit diff:      0.2219315469
Post ablation logit diff: 0.9247933626
Logit diff % change:      316.70%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 80000:
Heads ablated:            []
Original logit diff:      0.6110703945
Post ablation logit diff: 0.6110703945
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 81000:
Heads ablated:            []
Original logit diff:      0.3440084159
Post ablation logit diff: 0.3440084159
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 98.57142857142858%




Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 96.66666666666667%
Checkpoint 82000:
Heads ablated:            [(3, 3), (3, 5), (4, 6), (4, 4)]
Original logit diff:      -0.2331327200
Post ablation logit diff: 0.3311641216
Logit diff % change:      -242.05%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 83000:
Heads ablated:            []
Original logit diff:      0.7873736620
Post ablation logit diff: 0.7873736620
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 31.428571428571427%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 94.76190476190476%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.2 (sign=1) : Top 5 accuracy: 4.761904761904762%
Checkpoint 84000:
Heads ablated:            [(4, 6), (4, 4), (3, 5), (3, 3)]
Original logit diff:      0.8117069602
Post ablation logit diff: 0.8107366562
Logit diff % change:      -0.12%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 85000:
Heads ablated:            []
Original logit diff:      0.7867392302
Post ablation logit diff: 0.7867392302
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 15.714285714285714%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 92.38095238095238%
Copy circuit for head 4.2 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 56.666666666666664%




Checkpoint 86000:
Heads ablated:            [(4, 6), (4, 4), (3, 5), (3, 3)]
Original logit diff:      0.4473558962
Post ablation logit diff: 0.8085982203
Logit diff % change:      80.75%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 87000:
Heads ablated:            []
Original logit diff:      0.5273746848
Post ablation logit diff: 0.5273746848
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 19.523809523809526%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 90.95238095238095%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 61.904761904761905%




Checkpoint 88000:
Heads ablated:            [(4, 6), (4, 4), (3, 5), (3, 3)]
Original logit diff:      0.4967110157
Post ablation logit diff: 0.6719725728
Logit diff % change:      35.28%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 4.2 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 91.9047619047619%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 20.952380952380953%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 57.61904761904761%




Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Checkpoint 89000:
Heads ablated:            [(3, 3), (3, 5), (4, 4), (4, 6)]
Original logit diff:      0.2750924528
Post ablation logit diff: 0.3040034473
Logit diff % change:      10.51%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 90.95238095238095%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 97.14285714285714%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 90000:
Heads ablated:            [(4, 6), (4, 4), (3, 5), (3, 3)]
Original logit diff:      0.0590756014
Post ablation logit diff: 0.2464233190
Logit diff % change:      317.13%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 63.8095238095238%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 90.95238095238095%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 6.666666666666667%




Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Checkpoint 91000:
Heads ablated:            [(3, 3), (3, 5), (4, 4), (4, 6)]
Original logit diff:      0.2936813235
Post ablation logit diff: 0.3727281392
Logit diff % change:      26.92%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 47.61904761904761%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 92.38095238095238%




Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Checkpoint 92000:
Heads ablated:            [(3, 3), (3, 5), (4, 6), (4, 4)]
Original logit diff:      0.2990749776
Post ablation logit diff: 0.4748756289
Logit diff % change:      58.78%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 90.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 91.9047619047619%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 93000:
Heads ablated:            [(4, 6), (4, 4), (3, 5), (3, 3)]
Original logit diff:      0.2925741076
Post ablation logit diff: 0.5181595683
Logit diff % change:      77.10%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 89.04761904761904%
Copy circuit for head 2.7 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 88.57142857142857%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 58.0952380952381%
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 6.666666666666667%
Checkpoint 94000:
Heads ablated:            [(4, 6), (4, 4), (3, 5), (3, 3)]
Original logit diff:      0.6073801517
Post ablation logit diff: 0.9710022211
Logit diff % change:      59.87%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 56.19047619047619%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 88.57142857142857%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 89.52380952380953%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 99.52380952380952%




Checkpoint 95000:
Heads ablated:            [(3, 3), (4, 4), (3, 5), (4, 6)]
Original logit diff:      0.6306185126
Post ablation logit diff: 1.0065962076
Logit diff % change:      59.62%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 52.38095238095239%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 84.76190476190476%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 83.33333333333334%
Checkpoint 96000:
Heads ablated:            [(4, 6), (3, 5), (3, 3), (4, 4)]
Original logit diff:      0.7083563805
Post ablation logit diff: 0.7913498878
Logit diff % change:      11.72%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 50.95238095238095%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 79.52380952380952%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 79.04761904761905%




Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Checkpoint 97000:
Heads ablated:            [(3, 3), (3, 5), (4, 6), (4, 4)]
Original logit diff:      0.3010700345
Post ablation logit diff: 0.6712829471
Logit diff % change:      122.97%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 74.28571428571429%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 76.66666666666667%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 98000:
Heads ablated:            [(4, 6), (3, 5), (3, 3)]
Original logit diff:      0.2624198496
Post ablation logit diff: 0.2571922839
Logit diff % change:      -1.99%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 2.7 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 50.95238095238095%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 96.19047619047619%
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 5.714285714285714%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 72.38095238095238%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 59.523809523809526%
Checkpoint 99000:
Heads ablated:            [(4, 6), (3, 3)]
Original logit diff:      0.2385216355
Post ablation logit diff: 0.9398254156
Logit diff % change:      294.02%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 75.23809523809524%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 57.14285714285714%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 95.71428571428572%
Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 47.61904761904761%
Copy circuit for head 2.5 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Checkpoint 100000:
Heads ablated:            [(3, 5), (3, 3), (4, 6)]
Original logit diff:      0.5716239810
Post ablation logit diff: 0.5129665732
Logit diff % change:      -10.26%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 61.904761904761905%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 50.0%




Copy circuit for head 4.7 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 94.76190476190476%
Checkpoint 101000:
Heads ablated:            [(3, 3), (4, 6)]
Original logit diff:      0.5246245265
Post ablation logit diff: 0.9023212790
Logit diff % change:      71.99%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 102000:
Heads ablated:            []
Original logit diff:      0.3756940365
Post ablation logit diff: 0.3756940365
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 45.23809523809524%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 25.238095238095237%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 89.52380952380953%




Checkpoint 103000:
Heads ablated:            [(3, 3), (4, 6)]
Original logit diff:      0.3557770848
Post ablation logit diff: 0.7098906636
Logit diff % change:      99.53%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 46.19047619047619%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 23.333333333333332%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 87.14285714285714%
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 1.9047619047619049%




Checkpoint 104000:
Heads ablated:            [(3, 3), (4, 6)]
Original logit diff:      0.5528450012
Post ablation logit diff: 0.7790927887
Logit diff % change:      40.92%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 105000:
Heads ablated:            []
Original logit diff:      0.6195412874
Post ablation logit diff: 0.6195412874
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 30.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 15.714285714285714%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 76.19047619047619%
Copy circuit for head 3.0 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Checkpoint 106000:
Heads ablated:            [(3, 3), (4, 6)]
Original logit diff:      0.5664122701
Post ablation logit diff: 0.7221776843
Logit diff % change:      27.50%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 70.47619047619048%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 3.6 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 32.38095238095238%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 27.61904761904762%




Checkpoint 107000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.6464560628
Post ablation logit diff: 0.4299367070
Logit diff % change:      -33.49%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 108000:
Heads ablated:            []
Original logit diff:      0.2972801626
Post ablation logit diff: 0.2972801626
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 51.90476190476191%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 18.571428571428573%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 109000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.4022381604
Post ablation logit diff: 0.7478917241
Logit diff % change:      85.93%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 21.904761904761905%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 40.0%
Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 0.4761904761904762%




Checkpoint 110000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.7161034942
Post ablation logit diff: 1.0572936535
Logit diff % change:      47.65%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 36.19047619047619%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Checkpoint 111000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.3562529385
Post ablation logit diff: 0.5876967311
Logit diff % change:      64.97%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 25.71428571428571%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 4.4 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Checkpoint 112000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.7781585455
Post ablation logit diff: 1.0054256916
Logit diff % change:      29.21%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 20.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 14.285714285714285%
Checkpoint 113000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.2637522519
Post ablation logit diff: 0.2178120762
Logit diff % change:      -17.42%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 8.571428571428571%




Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 114000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.2372829169
Post ablation logit diff: 0.2857869864
Logit diff % change:      20.44%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 5.714285714285714%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 115000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.7258030772
Post ablation logit diff: 1.1025993824
Logit diff % change:      51.91%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%




Checkpoint 116000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.4999492764
Post ablation logit diff: 0.5994606018
Logit diff % change:      19.90%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Checkpoint 117000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.5295794010
Post ablation logit diff: 0.5739812255
Logit diff % change:      8.38%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 3.5 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 118000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.6762881875
Post ablation logit diff: 0.6997689605
Logit diff % change:      3.47%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 8.095238095238095%




Checkpoint 119000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.4347186089
Post ablation logit diff: 0.5855047107
Logit diff % change:      34.69%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%




Checkpoint 120000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.6068837047
Post ablation logit diff: 0.9287611842
Logit diff % change:      53.04%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 8.571428571428571%




Checkpoint 121000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.8142279387
Post ablation logit diff: 1.2300114632
Logit diff % change:      51.06%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 8.095238095238095%
Checkpoint 122000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.1607776731
Post ablation logit diff: 0.4940848351
Logit diff % change:      207.31%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 123000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.3641097546
Post ablation logit diff: 0.6507028937
Logit diff % change:      78.71%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 4.6 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 124000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.5563088655
Post ablation logit diff: 0.6837146282
Logit diff % change:      22.90%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 125000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.8047014475
Post ablation logit diff: 1.0347937346
Logit diff % change:      28.59%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 126000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.4695343077
Post ablation logit diff: 0.5832934976
Logit diff % change:      24.23%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer




Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 127000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.7830362916
Post ablation logit diff: 1.1844577789
Logit diff % change:      51.26%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 128000:
Heads ablated:            []
Original logit diff:      0.5455057025
Post ablation logit diff: 0.5455057025
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 129000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.4639167190
Post ablation logit diff: 0.6947611570
Logit diff % change:      49.76%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 130000:
Heads ablated:            []
Original logit diff:      0.6102454662
Post ablation logit diff: 0.6102454662
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 131000:
Heads ablated:            []
Original logit diff:      0.3642901182
Post ablation logit diff: 0.3642901182
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 132000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.5015211701
Post ablation logit diff: 0.8407520652
Logit diff % change:      67.64%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 133000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.4646611810
Post ablation logit diff: 0.8741306663
Logit diff % change:      88.12%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 134000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.6061727405
Post ablation logit diff: 0.7949532866
Logit diff % change:      31.14%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 135000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.1763992012
Post ablation logit diff: 0.4546210766
Logit diff % change:      157.72%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Checkpoint 136000:
Heads ablated:            []
Original logit diff:      0.6439035535
Post ablation logit diff: 0.6439035535
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 137000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.3428845406
Post ablation logit diff: 0.4921698570
Logit diff % change:      43.54%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 138000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.4177483618
Post ablation logit diff: 0.7855994701
Logit diff % change:      88.06%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer




Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 139000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.3257012963
Post ablation logit diff: 0.7338541746
Logit diff % change:      125.32%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 4.761904761904762%




Checkpoint 140000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.4503014088
Post ablation logit diff: 0.7627047300
Logit diff % change:      69.38%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.1 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 141000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.4758135080
Post ablation logit diff: 0.7533391714
Logit diff % change:      58.33%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer




Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 142000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.3878436387
Post ablation logit diff: 0.5403104424
Logit diff % change:      39.31%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-70m into HookedTransformer
Copy circuit for head 3.3 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 143000:
Heads ablated:            [(3, 3)]
Original logit diff:      0.3769233525
Post ablation logit diff: 0.7746383548
Logit diff % change:      105.52%


In [9]:
# Show which top-level keys are present in `experiment_metrics`; judging by the
# displayed output these are checkpoint step numbers.
experiment_metrics.keys()

dict_keys([4000, 5000, 6000, 7000])

## View Results

### Pythia 160m

In [11]:
# Name of the model whose saved backup metrics are inspected below; it selects
# the results sub-folder to load from and is interpolated into the plot titles.
MODEL_TO_VIEW = "pythia-160m-alldropout"

In [12]:
# Load the saved NMH backup metrics for MODEL_TO_VIEW.
# map_location="cpu" lets a file that was saved with CUDA tensors be opened on a
# CPU-only machine (the notebook explicitly supports a CPU fallback); this
# section only inspects and plots the values, so nothing here needs the GPU.
# NOTE(review): torch.load unpickles the file — only load results you produced.
experiment_metrics = torch.load(
    f'results/backup/{MODEL_TO_VIEW}/nmh_backup_metrics.pt',
    map_location="cpu",
)

In [13]:
# Inspect which metric names are stored for a single entry (key 4000).
experiment_metrics[4000].keys()

dict_keys(['logit_diff', 'per_head_logit_diffs', 'ablation_targets', 'ablated_logit_diff', 'per_head_ablated_logit_diffs', 'per_head_logit_diff_delta', 'in_circuit_head_delta', 'outside_circuit_head_delta', 'summed_in_circuit_head_delta', 'summed_outside_circuit_head_delta', 'summed_total_head_delta'])

In [14]:
# Express every stored delta relative to the same checkpoint's original
# logit diff, so that entries with different baselines can be compared.
summed_in_circuit_head_deltas = {}
summed_outside_circuit_head_deltas = {}
summed_total_head_deltas = {}
per_head_logit_diff_deltas = {}
total_logit_diff_deltas = {}

for checkpoint, checkpoint_metrics in experiment_metrics.items():
    original_logit_diff = checkpoint_metrics["logit_diff"]

    summed_in_circuit_head_deltas[checkpoint] = (
        checkpoint_metrics["summed_in_circuit_head_delta"] / original_logit_diff
    )
    summed_outside_circuit_head_deltas[checkpoint] = (
        checkpoint_metrics["summed_outside_circuit_head_delta"] / original_logit_diff
    )
    summed_total_head_deltas[checkpoint] = (
        checkpoint_metrics["summed_total_head_delta"] / original_logit_diff
    )
    per_head_logit_diff_deltas[checkpoint] = (
        checkpoint_metrics["per_head_logit_diff_delta"] / original_logit_diff
    )
    # Overall change: ablated logit diff minus the original one, then normalized.
    total_logit_diff_deltas[checkpoint] = (
        checkpoint_metrics['ablated_logit_diff'] - original_logit_diff
    ) / original_logit_diff

In [15]:
# Plot the summed in-circuit head delta per checkpoint. The values were divided
# by each checkpoint's original logit diff in the normalization cell, so they
# are ratios (0.05 means 5%).
fig = px.line(
    x=list(summed_in_circuit_head_deltas.keys()),
    y=list(summed_in_circuit_head_deltas.values()),
    title=f"Summed Post-NMH-Ablation In-Circuit Head Logit Diff Change Over Time ({MODEL_TO_VIEW})",
    labels={'x': 'Checkpoint', 'y': 'Change as % of original logit diff'},
)
# The axis label promises a percentage while the data are ratios; render the
# ticks as percentages so the label and the scale agree.
fig.update_yaxes(tickformat=".1%")
fig.show()


In [16]:
# Plot the summed outside-circuit head delta per checkpoint. The values were
# divided by each checkpoint's original logit diff in the normalization cell,
# so they are ratios (0.05 means 5%).
fig = px.line(
    x=list(summed_outside_circuit_head_deltas.keys()),
    y=list(summed_outside_circuit_head_deltas.values()),
    title=f"Summed Post-NMH-Ablation Outside-Circuit Head Attribution Change ({MODEL_TO_VIEW})",
    labels={'x': 'Checkpoint', 'y': 'Change as % of original logit diff'},
)
# The axis label promises a percentage while the data are ratios; render the
# ticks as percentages so the label and the scale agree.
fig.update_yaxes(tickformat=".1%")
fig.show()

In [17]:
# Plot the summed total head delta per checkpoint. The values were divided by
# each checkpoint's original logit diff in the normalization cell, so they are
# ratios (0.05 means 5%).
fig = px.line(
    x=list(summed_total_head_deltas.keys()),
    y=list(summed_total_head_deltas.values()),
    title=f"Summed Total Post-NMH-Ablation Head Attribution Change ({MODEL_TO_VIEW})",
    labels={'x': 'Checkpoint', 'y': 'Change as % of original logit diff'},
)
# The axis label promises a percentage while the data are ratios; render the
# ticks as percentages so the label and the scale agree.
fig.update_yaxes(tickformat=".1%")
fig.show()

In [9]:
# Derive two checkpoint-keyed dicts from the loaded metrics. Judging by the
# outputs displayed further down, `checkpoint_nmhs` holds the set of
# (layer, head) tuples for each checkpoint and `cumulative_nmhs` the union of
# those sets up to and including that checkpoint — TODO confirm against
# utils.backup_analysis.get_past_nmhs_for_checkpoints.
# NOTE(review): the saved execution count of this cell (9) is lower than that of
# the cell that loads `experiment_metrics` (12); re-run it after loading so both
# dicts correspond to the metrics currently in memory.
cumulative_nmhs, checkpoint_nmhs = get_past_nmhs_for_checkpoints(experiment_metrics)

In [18]:
# Plot the top heads per checkpoint from the normalized per-head deltas and
# keep the returned table for the inspection cells below. Up to 10 heads are
# requested per checkpoint; the cumulative NMH sets are passed along as well
# (presumably to flag heads that were previously NMHs — verify in plot_top_heads).
top_backup_heads = plot_top_heads(
    model_name=MODEL_TO_VIEW,
    checkpoint_dict=per_head_logit_diff_deltas,
    cumulative_nmhs=cumulative_nmhs,
    top_k_per_checkpoint=10,
)

In [57]:
# Heatmap of the per-head logit diff delta stored under key 143000. This reads
# the raw entry from `experiment_metrics`, not the normalized copy kept in
# `per_head_logit_diff_deltas`.
per_head_delta_143000 = experiment_metrics[143000]['per_head_logit_diff_delta']

imshow_p(
    per_head_delta_143000,
    title="Headwise logit diff contribution, post NMH KO",
    labels={"x": "Head", "y": "Layer", "color": "Logit diff attribution"},
    border=True,
    width=600,
    margin={"r": 100, "l": 100},
)

In [58]:
# Inspect which metric names are stored for the entry under key 143000.
experiment_metrics[143000].keys()

dict_keys(['logit_diff', 'per_head_logit_diffs', 'ablation_targets', 'ablated_logit_diff', 'per_head_ablated_logit_diffs', 'per_head_logit_diff_delta', 'in_circuit_head_delta', 'outside_circuit_head_delta', 'summed_in_circuit_head_delta', 'summed_outside_circuit_head_delta', 'summed_total_head_delta'])

In [59]:
# Show the first 50 rows of the top-heads table whose 'Previous NMH' flag is True.
is_previous_nmh = top_backup_heads['Previous NMH'].eq(True)
top_backup_heads.loc[is_previous_nmh].head(50)

Unnamed: 0,Checkpoint,Layer-Head,Layer,Head,Value,Previous NMH,Checkpoint_sum,Value_sum,Previous NMH_sum,Top K
72,20000,Layer 9-Head 4,9,4,0.00246,True,3135000,0.334368,38,True
97,25000,Layer 9-Head 4,9,4,0.006947,True,3135000,0.334368,38,True
121,30000,Layer 9-Head 4,9,4,0.006404,True,3135000,0.334368,38,True
149,35000,Layer 9-Head 4,9,4,0.014621,True,3135000,0.334368,38,True
156,37000,Layer 9-Head 4,9,4,0.01344,True,3135000,0.334368,38,True
171,40000,Layer 9-Head 4,9,4,0.010845,True,3135000,0.334368,38,True
197,45000,Layer 9-Head 4,9,4,0.035425,True,3135000,0.334368,38,True
202,46000,Layer 9-Head 4,9,4,0.033058,True,3135000,0.334368,38,True
207,47000,Layer 9-Head 4,9,4,0.014968,True,3135000,0.334368,38,True
212,48000,Layer 9-Head 4,9,4,0.008163,True,3135000,0.334368,38,True


In [60]:
# Display the per-checkpoint sets returned by get_past_nmhs_for_checkpoints.
# NOTE(review): this prints the whole dict; consider showing only a slice.
checkpoint_nmhs

{4000: set(),
 5000: set(),
 6000: set(),
 7000: set(),
 8000: {(8, 2)},
 9000: set(),
 10000: {(8, 1), (8, 10)},
 11000: {(10, 7)},
 12000: {(8, 2), (10, 7)},
 13000: {(10, 7)},
 14000: {(8, 2), (10, 7)},
 15000: {(8, 2), (10, 7)},
 16000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 17000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 18000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 19000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 20000: {(8, 2), (10, 7)},
 21000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 22000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 23000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 24000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 25000: {(8, 2), (10, 7)},
 26000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 27000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 28000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 29000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 30000: {(8, 2), (10, 7)},
 31000: {(8, 1), (8, 2), (9, 4), (10, 7)},
 32000: {(8, 1), (8

In [61]:
# Display the cumulative sets returned by get_past_nmhs_for_checkpoints.
# NOTE(review): this prints the whole dict; consider showing only a slice.
cumulative_nmhs

{4000: set(),
 5000: set(),
 6000: set(),
 7000: set(),
 8000: {(8, 2)},
 9000: {(8, 2)},
 10000: {(8, 1), (8, 2), (8, 10)},
 11000: {(8, 1), (8, 2), (8, 10), (10, 7)},
 12000: {(8, 1), (8, 2), (8, 10), (10, 7)},
 13000: {(8, 1), (8, 2), (8, 10), (10, 7)},
 14000: {(8, 1), (8, 2), (8, 10), (10, 7)},
 15000: {(8, 1), (8, 2), (8, 10), (10, 7)},
 16000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 17000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 18000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 19000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 20000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 21000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 22000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 23000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 24000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 25000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 26000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 27000: {(8, 1), (8, 2), (8, 10), (9, 4), (10, 7)},
 28000: {(8, 1), (8, 2), (8, 10

In [62]:
# Line chart of how many entries each checkpoint's set in `checkpoint_nmhs`
# contains, in the dict's own key order.
fig = px.line(
    x=list(checkpoint_nmhs),
    y=[len(nmh_set) for nmh_set in checkpoint_nmhs.values()],
    title=f"Number of NMHs Over Time ({MODEL_TO_VIEW})",
    labels={'x': 'Checkpoint', 'y': 'Number of NMHs'},
)
fig.show()