## Setup

### Imports

In [1]:
import os
import json
import glob
import torch
import re
import einops
import pandas as pd
from functools import partial
from torch import Tensor
from torchtyping import TensorType as TT
from jaxtyping import Float

from transformers import AutoModelForCausalLM

import transformer_lens
import transformer_lens.utils as tl_utils
from transformer_lens import HookedTransformer, ActivationCache
import transformer_lens.patching as patching

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
#import seaborn as sns
import matplotlib.pyplot as plt
from utils.data_utils import generate_data_and_caches
from utils.data_processing import (
    load_edge_scores_into_dictionary,
    read_json_file,
    get_ckpts,
    load_metrics,
    compute_ged,
    compute_weighted_ged,
    compute_gtd,
    compute_jaccard_similarity_to_reference,
    compute_jaccard_similarity,
    aggregate_metrics_to_tensors_step_number,
    get_ckpts
)
from utils.metrics import compute_logit_diff, _logits_to_mean_logit_diff
from utils.visualization import plot_attention_heads, imshow_p

### Parameters

In [2]:
TASK = 'ioi'
PERFORMANCE_METRIC = 'logit_diff'
BASE_MODEL = "pythia-160m"
VARIANT = None
CACHE = "model_cache"
IOI_DATASET_SIZE = 70

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f1b66d2bd00>

### Functions

In [3]:
def load_model(BASE_MODEL, VARIANT, CHECKPOINT, CACHE, device):
    if not VARIANT:
        model = HookedTransformer.from_pretrained(
            BASE_MODEL,
            checkpoint_value=CHECKPOINT,
            center_unembed=True,
            center_writing_weights=True,
            fold_ln=True,
            refactor_factored_attn_matrices=False,
            #dtype=torch.bfloat16,
            **{"cache_dir": CACHE},
        )
    else:
        revision = f"step{CHECKPOINT}"
        source_model = AutoModelForCausalLM.from_pretrained(
           VARIANT, revision=revision, cache_dir=CACHE
        ).to(device) #.to(torch.bfloat16)

        model = HookedTransformer.from_pretrained(
            BASE_MODEL,
            hf_model=source_model,
            center_unembed=False,
            center_writing_weights=False,
            fold_ln=False,
            #dtype=torch.bfloat16,
            **{"cache_dir": CACHE},
        )

    model.cfg.use_split_qkv_input = True
    model.cfg.use_attn_result = True
    model.cfg.use_hook_mlp_in = True
    return model

In [4]:
def convert_head_names_to_tuple(head_name):
    head_name = head_name.replace('a', '')
    head_name = head_name.replace('h', '')
    layer, head = head_name.split('.')
    return (int(layer), int(head))

In [5]:
def ablate_top_head_hook(z: TT["batch", "pos", "head_index", "d_head"], hook, head_idx=0):
    z[:, :, head_idx, :] = 0
    return z

In [6]:
def compute_copy_score(model, layer, head, ioi_dataset, verbose=False, neg=False):

    # get the activation cache from IOI dataset
    logits, cache = model.run_with_cache(ioi_dataset.toks.long())
    
    # sign adjustment, optional
    if neg:
        sign = -1
    else:
        sign = 1

    # pass the activations through the first layernorm for block 1 (effectively the result of layer 0's embedding behavior)
    z_0 = cache["blocks.0.hook_resid_post"]

    # pass the activations through the attention weights (values) for the head and add the bias
    v = torch.einsum("eab,bc->eac", z_0, model.blocks[layer].attn.W_V[head])
    v += model.blocks[layer].attn.b_V[head].unsqueeze(0).unsqueeze(0)

    # pass the activations through the attention weights (output only) for the head
    o = sign * torch.einsum("sph,hd->spd", v, model.blocks[layer].attn.W_O[head])

    # unembed the activations (layernorm already folded in, so no need to pass through that)
    logits = model.unembed(o)

    k = 5
    n_right = 0

    for seq_idx, prompt in enumerate(ioi_dataset.ioi_prompts):
        for word in ["IO", "S1", "S2"]:
            pred_tokens = [
                model.tokenizer.decode(token)
                for token in torch.topk(
                    logits[seq_idx, ioi_dataset.word_idx[word][seq_idx]], k
                ).indices
            ]
            if "S" in word:
                name = "S"
            else:
                name = word
            if " " + prompt[name] in pred_tokens:
                n_right += 1
            else:
                if verbose:
                    print("-------")
                    print("Seq: " + ioi_dataset.sentences[seq_idx])
                    print("Target: " + ioi_dataset.ioi_prompts[seq_idx][name])
                    print(
                        " ".join(
                            [
                                f"({i+1}):{model.tokenizer.decode(token)}"
                                for i, token in enumerate(
                                    torch.topk(
                                        logits[
                                            seq_idx, ioi_dataset.word_idx[word][seq_idx]
                                        ],
                                        k,
                                    ).indices
                                )
                            ]
                        )
                    )
    percent_right = (n_right / (ioi_dataset.N * 3)) * 100
    if percent_right > 0:
        print(
            f"Copy circuit for head {layer}.{head} (sign={sign}) : Top {k} accuracy: {percent_right}%"
        )
    model.reset_hooks()
    return percent_right

In [7]:
def residual_stack_to_logit_diff(
    residual_stack: Float[Tensor, "... batch d_model"],
    cache: ActivationCache,
    logit_diff_directions: Float[Tensor, "batch d_model"],
) -> Float[Tensor, "..."]:
    '''
    Gets the avg logit difference between the correct and incorrect answer for a given
    stack of components in the residual stream.
    '''
    batch_size = residual_stack.size(-2)
    scaled_residual_stack = cache.apply_ln_to_stack(residual_stack, layer=-1, pos_slice=-1)
    return einops.einsum(
        scaled_residual_stack, logit_diff_directions,
        "... batch d_model, batch d_model -> ..."
    ) / batch_size

## Retrieve & Process Data

### Circuit Data

In [8]:
folder_path = f'results/graphs/pythia-160m/{TASK}'
df = load_edge_scores_into_dictionary(folder_path)

Processing file 1/153: results/graphs/pythia-160m/ioi/57000.json
Processing file 2/153: results/graphs/pythia-160m/ioi/141000.json
Processing file 3/153: results/graphs/pythia-160m/ioi/95000.json
Processing file 4/153: results/graphs/pythia-160m/ioi/107000.json
Processing file 5/153: results/graphs/pythia-160m/ioi/34000.json
Processing file 6/153: results/graphs/pythia-160m/ioi/6000.json
Processing file 7/153: results/graphs/pythia-160m/ioi/37000.json
Processing file 8/153: results/graphs/pythia-160m/ioi/39000.json
Processing file 9/153: results/graphs/pythia-160m/ioi/104000.json
Processing file 10/153: results/graphs/pythia-160m/ioi/59000.json
Processing file 11/153: results/graphs/pythia-160m/ioi/67000.json
Processing file 12/153: results/graphs/pythia-160m/ioi/111000.json
Processing file 13/153: results/graphs/pythia-160m/ioi/16.json
Processing file 14/153: results/graphs/pythia-160m/ioi/76000.json
Processing file 15/153: results/graphs/pythia-160m/ioi/1.json
Processing file 16/153:

### Performance Data

In [9]:
directory_path = 'results'
perf_metrics = load_metrics(directory_path)

ckpts = get_ckpts(schedule="exp_plus_detail")
#pythia_evals = aggregate_metrics_to_tensors_step_number("results/pythia-evals/pythia-v1")

# filter everything before 1000 steps
df = df[df['checkpoint'] >= 1000]

df[['source', 'target']] = df['edge'].str.split('->', expand=True)
len(df['target'].unique())

445

In [10]:
perf_metric = perf_metrics['pythia-160m'][TASK][PERFORMANCE_METRIC]

perf_metric = [x.item() for x in perf_metric]

# zip into dictionary with ckpts as key
perf_metric_dict = dict(zip(ckpts, perf_metric))


## Experiments

### Dataset Setup

In [11]:
initial_model = load_model(BASE_MODEL, VARIANT, 143000, CACHE, device)
size=70
ioi_dataset, abc_dataset = generate_data_and_caches(initial_model, size, verbose=True)

answer_tokens = torch.cat((torch.Tensor(ioi_dataset.io_tokenIDs).unsqueeze(1), torch.Tensor(ioi_dataset.s_tokenIDs).unsqueeze(1)), dim=1).to(device)
answer_tokens = answer_tokens.long()



tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer


### Get Experimental Candidates

In [17]:
EXPERIMENTAL_CHECKPOINT = 143000
COPY_SCORE_THRESHOLD = 75.0

#### Experiment Prototype

In [18]:
experimental_model = load_model(BASE_MODEL, VARIANT, EXPERIMENTAL_CHECKPOINT, CACHE, device)
orig_logits, orig_cache = experimental_model.run_with_cache(ioi_dataset.toks.long())

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer


In [19]:
answer_residual_directions = experimental_model.tokens_to_residual_directions(answer_tokens)
print("Answer residual directions shape:", answer_residual_directions.shape)

logit_diff_directions = answer_residual_directions[:, 0] - answer_residual_directions[:, 1]
print("Logit difference directions shape:", logit_diff_directions.shape)

# cache syntax - resid_post is the residual stream at the end of the layer, -1 gets the final layer. The general syntax is [activation_name, layer_index, sub_layer_type].
final_residual_stream: Float[Tensor, "batch seq d_model"] = orig_cache["resid_post", -1]
print(f"Final residual stream shape: {final_residual_stream.shape}")

final_token_residual_stream: Float[Tensor, "batch d_model"] = final_residual_stream[torch.arange(final_residual_stream.size(0)), ioi_dataset.word_idx["end"]]
print(f"Final token residual stream shape: {final_token_residual_stream.shape}")

Answer residual directions shape: torch.Size([70, 2, 768])
Logit difference directions shape: torch.Size([70, 768])
Final residual stream shape: torch.Size([70, 21, 768])
Final token residual stream shape: torch.Size([70, 768])


In [15]:
for idx in range(final_token_residual_stream.shape[0]):
    scaled_final_token_residual_stream = orig_cache.apply_ln_to_stack(final_token_residual_stream[idx].unsqueeze(0), layer=-1, pos_slice=(ioi_dataset.word_idx["end"][idx]-21))

In [161]:
first_example = final_token_residual_stream[0].unsqueeze(0)
orig_cache.apply_ln_to_stack(first_example, layer=-1, pos_slice=(ioi_dataset.word_idx["end"][0]-21)).shape

torch.Size([70, 768])

In [156]:
scaled_final_token_residual_stream.shape

torch.Size([70, 768])

In [16]:
scaled_final_token_residual_stream = orig_cache.apply_ln_to_stack(final_token_residual_stream, layer=-1, pos_slice=-2)
print(f"Scaled residual stream shape: {scaled_final_token_residual_stream.shape}")

# scaled_final_token_residual_stream: Float[Tensor, "batch d_model"] = scaled_residual_stream[torch.arange(final_residual_stream.size(0)), ioi_dataset.word_idx["end"]]
# print(f"Final token residual stream shape: {scaled_final_token_residual_stream.shape}")

average_logit_diff = einops.einsum(
    scaled_final_token_residual_stream, logit_diff_directions,
    "batch d_model, batch d_model ->"
) / 70

print(f"Calculated average logit diff: {average_logit_diff:.10f}")
print(f"Original logit difference:     {_logits_to_mean_logit_diff(orig_logits, ioi_dataset).item():.10f}")

#torch.testing.assert_close(average_logit_diff, clean_logit_diff)

Scaled residual stream shape: torch.Size([70, 768])
Calculated average logit diff: 3.2230978012
Original logit difference:     4.1640057564


In [151]:
orig_cache.apply_ln_to_stack?

[0;31mSignature:[0m
[0morig_cache[0m[0;34m.[0m[0mapply_ln_to_stack[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mresidual_stack[0m[0;34m:[0m [0;34m"Float[torch.Tensor, 'num_components *batch_and_pos_dims d_model']"[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlayer[0m[0;34m:[0m [0;34m'Optional[int]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmlp_input[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpos_slice[0m[0;34m:[0m [0;34m'Union[Slice, SliceInput]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_slice[0m[0;34m:[0m [0;34m'Union[Slice, SliceInput]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhas_batch_dim[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m"Float[torch.Tensor, 'num_components *batch_and_pos_dims_out d_model']"[0m

In [120]:
per_head_residual, labels = orig_cache.stack_head_results(layer=-1, return_labels=True)
print(f"Shape of per head residual: {per_head_residual.shape}")
per_head_residual_final_token = per_head_residual[:, torch.arange(per_head_residual.size(1)), ioi_dataset.word_idx["end"]]
print(f"Shape of per head residual: {per_head_residual_final_token.shape}")
per_head_residual_final_token = einops.rearrange(
    per_head_residual_final_token,
    "(layer head) ... -> layer head ...",
    layer=experimental_model.cfg.n_layers
)
print(f"Shape of per head residual: {per_head_residual_final_token.shape}")
per_head_logit_diffs = residual_stack_to_logit_diff(per_head_residual_final_token, orig_cache, logit_diff_directions)

Shape of per head residual: torch.Size([144, 70, 21, 768])
Shape of per head residual: torch.Size([144, 70, 768])
Shape of per head residual: torch.Size([12, 12, 70, 768])


In [121]:
imshow_p(
    per_head_logit_diffs,
    title="Headwise logit diff contribution",
    labels={"x": "Head", "y": "Layer", "color": "Logit diff attribution"},
    #coloraxis=dict(colorbar_ticksuffix = "%"),
    border=True,
    width=600,
    margin={"r": 100, "l": 100}
)

In [122]:
candidate_nmh = df[df['target']=='logits']
candidate_nmh = candidate_nmh[candidate_nmh['in_circuit'] == True]

candidate_list = candidate_nmh[candidate_nmh['checkpoint']==EXPERIMENTAL_CHECKPOINT]['source'].unique().tolist()
candidate_list = [convert_head_names_to_tuple(c) for c in candidate_list if (c[0] != 'm' and c != 'input')]

In [123]:
NMHs = []

for layer, head in candidate_list:
    copy_score = compute_copy_score(experimental_model, layer, head, ioi_dataset, verbose=False, neg=False)
    NMHs.append((layer, head, copy_score))

Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 9.5 (sign=1) : Top 5 accuracy: 0.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 16.666666666666664%
Copy circuit for head 9.1 (sign=1) : Top 5 accuracy: 0.0%
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 0.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.9 (sign=1) : Top 5 accuracy: 0.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 0.0%
Copy circuit for head 8.7 (sign=1) : Top 5 accuracy: 0.0%
Copy circuit for head 8.4 (sign=1) : Top 5 accuracy: 0.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 95.23809523809523%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 36.666666666666664%
Copy circuit for head 7.11 (sign=1) : Top 5 accuracy: 0.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.0%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy:

In [127]:
heads_to_ablate = [x[:2] for x in NMHs if x[2] >= COPY_SCORE_THRESHOLD]
head_labels = [f"L{l}H{h}" for l in range(experimental_model.cfg.n_layers) for h in range(experimental_model.cfg.n_heads)]

print(f"Heads to ablate: {heads_to_ablate}")


for layer, head in heads_to_ablate:
    ablate_head_hook = partial(ablate_top_head_hook, head_idx=head)
    experimental_model.blocks[layer].attn.hook_z.add_hook(ablate_head_hook)

ablated_logits, ablated_cache = experimental_model.run_with_cache(ioi_dataset.toks)
print(f"Original IOI Metric: {_logits_to_mean_logit_diff(orig_logits, ioi_dataset).item():.4f}")
print(f"Post ablation IOI Metric: {_logits_to_mean_logit_diff(ablated_logits, ioi_dataset).item()}")

experimental_model.reset_hooks()

Heads to ablate: [(8, 10), (8, 2), (5, 0)]
Original IOI Metric: 4.1640
Post ablation IOI Metric: 4.042059421539307


In [98]:
# from path_patching_cm.path_patching import Node, IterNode, path_patch, act_patch
# from path_patching_cm.ioi_dataset import IOIDataset, NAMES
# experimental_model.reset_hooks()
# pm = partial(_logits_to_mean_logit_diff, ioi_dataset=ioi_dataset)

# path_patch_resid_post = path_patch(
#     experimental_model,
#     orig_input=ioi_dataset.toks,
#     new_input=abc_dataset.toks,
#     sender_nodes=IterNode('z'), # This means iterate over all heads in all layers
#     receiver_nodes=Node('resid_post', 11), # This is resid_post at layer 11
#     patching_metric=pm,
#     verbose=True
# )

  0%|          | 0/144 [00:00<?, ?it/s]

results['z'].shape = (layer=12, head=12)


In [99]:
# imshow_p(
#     path_patch_resid_post['z'] * 100,
#     title="Patching output of attention heads (corrupted -> clean)",
#     labels={"x": "Head", "y": "Layer", "color": "Logit diff variation"},
#     coloraxis=dict(colorbar_ticksuffix = "%"),
#     border=True,
#     width=600,
#     margin={"r": 100, "l": 100}
# )

In [125]:
# per_head_ablated_residual, labels = ablated_cache.stack_head_results(layer=-1, pos_slice=-1, return_labels=True)
# per_head_ablated_logit_diffs = residual_stack_to_logit_diff(per_head_ablated_residual, ablated_cache, logit_diff_directions)
# per_head_ablated_logit_diffs = per_head_ablated_logit_diffs.reshape(experimental_model.cfg.n_layers, experimental_model.cfg.n_heads)

per_head_ablated_residual, labels = ablated_cache.stack_head_results(layer=-1, return_labels=True)
print(f"Shape of per head residual: {per_head_ablated_residual.shape}")
per_head_ablated_residual_final_token = per_head_ablated_residual[:, torch.arange(per_head_ablated_residual.size(1)), ioi_dataset.word_idx["end"]]
print(f"Shape of per head residual: {per_head_ablated_residual_final_token.shape}")
per_head_ablated_residual_final_token = einops.rearrange(
    per_head_ablated_residual_final_token,
    "(layer head) ... -> layer head ...",
    layer=experimental_model.cfg.n_layers
)
print(f"Shape of per head residual: {per_head_ablated_residual_final_token.shape}")
per_head_ablated_logit_diffs = residual_stack_to_logit_diff(per_head_ablated_residual_final_token, orig_cache, logit_diff_directions)

Shape of per head residual: torch.Size([144, 70, 21, 768])
Shape of per head residual: torch.Size([144, 70, 768])
Shape of per head residual: torch.Size([12, 12, 70, 768])


In [126]:
#exclusions = [(6, 6), (7, 9), (8, 9)] + [(9, 1), (9, 5)]
delta = per_head_ablated_logit_diffs - per_head_logit_diffs
for layer, head in heads_to_ablate:
    delta[layer, head] = 0

plot_attention_heads(
    delta/_logits_to_mean_logit_diff(orig_logits, ioi_dataset).item(), 
    title="Logit Diff Contribution From Backup Heads (as percentage of original total)", 
    top_n=15, 
    range_x=[0, 0.5]
)

Total logit diff contribution above threshold: 0.15


In [82]:
per_head_logit_diffs[9, 1], per_head_ablated_logit_diffs[9, 1]

(tensor(-0.2894, device='cuda:0'), tensor(-0.1355, device='cuda:0'))

In [83]:
imshow_p(
    per_head_ablated_logit_diffs,
    title="Headwise logit diff contribution, post NMH KO",
    labels={"x": "Head", "y": "Layer", "color": "Logit diff attribution"},
    #coloraxis=dict(colorbar_ticksuffix = "%"),
    border=True,
    width=600,
    margin={"r": 100, "l": 100}
)

In [84]:
imshow_p(
    delta,
    title="Change in headwise logit diff contribution, post NMH KO",
    labels={"x": "Head", "y": "Layer", "color": "Logit diff attribution"},
    #coloraxis=dict(colorbar_ticksuffix = "%"),
    border=True,
    width=600,
    margin={"r": 100, "l": 100}
)

In [None]:
# We should track three things in response to NMH KO:
# 1. The change in the IOI metric
# 2. The change in the logit diff contribution for heads in the circuit at this checkpoint
# 3. The change in the logit diff contribution for heads not in the circuit at this checkpoint

In [85]:
delta.sum()

tensor(0.2135, device='cuda:0')

#### Formalized Experiment

In [133]:
EXPERIMENTAL_CHECKPOINT = 142000
COPY_SCORE_THRESHOLD = 75.0

In [58]:
# Prepare model
def setup(checkpoint=143000):
    model = load_model(BASE_MODEL, VARIANT, checkpoint, CACHE, device)

    answer_residual_directions = model.tokens_to_residual_directions(answer_tokens)
    logit_diff_directions = answer_residual_directions[:, 0] - answer_residual_directions[:, 1]

    # Test logit_diff_directions with logit diff calculation
    # final_residual_stream: Float[Tensor, "batch seq d_model"] = orig_cache["resid_post", -1]

    # scaled_residual_stream = orig_cache.apply_ln_to_stack(final_residual_stream, layer=-1)
    # scaled_final_token_residual_stream: Float[Tensor, "batch d_model"] = scaled_residual_stream[torch.arange(scaled_residual_stream.size(0)), ioi_dataset.word_idx["end"]]
    
    # batch_size = ioi_dataset.toks.shape[0]

    # average_logit_diff = einops.einsum(
    #     scaled_final_token_residual_stream, logit_diff_directions,
    #     "batch d_model, batch d_model ->"
    # ) / batch_size

    # print(f"Calculated logit diff: {average_logit_diff:.10f}")

    return model, logit_diff_directions

# Get metrics & attribution scores
def get_metrics_and_attributions(model, logits, cache, dataset=ioi_dataset, logit_diff_directions=logit_diff_directions):

    logit_diff = _logits_to_mean_logit_diff(logits, dataset).item()

    per_head_residual, labels = cache.stack_head_results(layer=-1, return_labels=True)
    per_head_residual_final_token = per_head_residual[:, torch.arange(per_head_residual.size(1)), dataset.word_idx["end"]]
    per_head_residual_final_token = einops.rearrange(
        per_head_residual_final_token,
        "(layer head) ... -> layer head ...",
        layer=model.cfg.n_layers
    )
    per_head_logit_diffs = residual_stack_to_logit_diff(per_head_residual_final_token, orig_cache, logit_diff_directions)

    return logit_diff, per_head_logit_diffs

# Get copy scores from circuit members
def get_ablation_targets(model, checkpoint, edge_df, dataset=ioi_dataset, threshold=75.0):
    candidate_nmh = edge_df[edge_df['target']=='logits']
    candidate_nmh = candidate_nmh[candidate_nmh['in_circuit'] == True]

    candidate_list = candidate_nmh[candidate_nmh['checkpoint']==checkpoint]['source'].unique().tolist()
    candidate_list = [convert_head_names_to_tuple(c) for c in candidate_list if (c[0] != 'm' and c != 'input')]

    NMHs = []

    for layer, head in candidate_list:
        copy_score = compute_copy_score(model, layer, head, dataset, verbose=False, neg=False)
        NMHs.append((layer, head, copy_score))

    heads_to_ablate = [x[:2] for x in NMHs if x[2] >= threshold]

    return NMHs, heads_to_ablate

# Run ablation experiment
def run_ablated_model(model, dataset=ioi_dataset, ablation_targets=None):
    if ablation_targets is None:
        ablation_targets = get_ablation_targets(model, dataset)

    for layer, head in ablation_targets:
        ablate_head_hook = partial(ablate_top_head_hook, head_idx=head)
        model.blocks[layer].attn.hook_z.add_hook(ablate_head_hook)

    ablated_logits, ablated_cache = model.run_with_cache(dataset.toks)
    
    model.reset_hooks()

    return ablated_logits, ablated_cache

# Run experiment
def run_iteration(edge_df, checkpoint, experiment_metrics):

    model, logit_diff_directions = setup(checkpoint=checkpoint)
    orig_logits, orig_cache = model.run_with_cache(ioi_dataset.toks.long())
    logit_diff, per_head_logit_diffs = get_metrics_and_attributions(model, orig_logits, orig_cache, logit_diff_directions=logit_diff_directions)

    NMHs, ablation_targets = get_ablation_targets(model, checkpoint=checkpoint, edge_df=edge_df, dataset=ioi_dataset, threshold=COPY_SCORE_THRESHOLD)
    ablated_logits, ablated_cache = run_ablated_model(model, ioi_dataset, ablation_targets)
    ablated_logit_diff, per_head_ablated_logit_diffs = get_metrics_and_attributions(model, ablated_logits, ablated_cache, logit_diff_directions=logit_diff_directions)
    
    print(f"Checkpoint {checkpoint}:")
    print(f"Heads ablated:            {ablation_targets}")
    print(f"Original logit diff:      {logit_diff:.10f}")
    print(f"Post ablation logit diff: {ablated_logit_diff:.10f}")
    print(f"Logit diff % change:      {((ablated_logit_diff - logit_diff) / logit_diff) * 100:.2f}%")

    experiment_metrics[checkpoint] = {
        "logit_diff": logit_diff,
        "per_head_logit_diffs": per_head_logit_diffs,
        "ablation_targets": ablation_targets,
        "ablated_logit_diff": ablated_logit_diff,
        "per_head_ablated_logit_diffs": per_head_ablated_logit_diffs,
        "per_head_logit_diff_delta": per_head_ablated_logit_diffs - per_head_logit_diffs
    }

    return experiment_metrics

In [59]:
def process_backup_results(edge_df, checkpoint, experiment_metrics):

    # exclude the delta of the ablated heads
    for layer, head in experiment_metrics[checkpoint]["ablation_targets"]:
        experiment_metrics[checkpoint]["per_head_logit_diff_delta"][layer, head] = 0

    # get the list of heads in the circuit
    circuit_heads = edge_df[edge_df['in_circuit'] == True]
    circuit_heads = circuit_heads[circuit_heads['checkpoint']==checkpoint]['source'].unique().tolist()
    circuit_heads = [convert_head_names_to_tuple(c) for c in circuit_heads if (c[0] != 'm' and c != 'input')]    

    in_circuit_head_delta = torch.zeros_like(experiment_metrics[checkpoint]["per_head_logit_diffs"])
    outside_circuit_head_delta = torch.zeros_like(experiment_metrics[checkpoint]["per_head_logit_diffs"])

    for layer in range(in_circuit_head_delta.shape[0]):
        for head in range(in_circuit_head_delta.shape[1]):
            if (layer, head) in circuit_heads:
                in_circuit_head_delta[layer, head] = experiment_metrics[checkpoint]["per_head_logit_diff_delta"][layer, head]
            else:
                outside_circuit_head_delta[layer, head] = experiment_metrics[checkpoint]["per_head_logit_diff_delta"][layer, head]

    experiment_metrics[checkpoint]["in_circuit_head_delta"] = in_circuit_head_delta
    experiment_metrics[checkpoint]["outside_circuit_head_delta"] = outside_circuit_head_delta

    experiment_metrics[checkpoint]["summed_in_circuit_head_delta"] = in_circuit_head_delta.sum().item()
    experiment_metrics[checkpoint]["summed_outside_circuit_head_delta"] = outside_circuit_head_delta.sum().item()
    experiment_metrics[checkpoint]["summed_total_head_delta"] = experiment_metrics[checkpoint]["per_head_logit_diff_delta"].sum().item()

    # convert tensors to numpy cpu arrays
    experiment_metrics[checkpoint]["per_head_logit_diffs"] = experiment_metrics[checkpoint]["per_head_logit_diffs"].cpu().numpy()
    experiment_metrics[checkpoint]["per_head_ablated_logit_diffs"] = experiment_metrics[checkpoint]["per_head_ablated_logit_diffs"].cpu().numpy()
    experiment_metrics[checkpoint]["per_head_logit_diff_delta"] = experiment_metrics[checkpoint]["per_head_logit_diff_delta"].cpu().numpy()
    experiment_metrics[checkpoint]["in_circuit_head_delta"] = experiment_metrics[checkpoint]["in_circuit_head_delta"].cpu().numpy()
    experiment_metrics[checkpoint]["outside_circuit_head_delta"] = experiment_metrics[checkpoint]["outside_circuit_head_delta"].cpu().numpy()
    

    return experiment_metrics

In [60]:

experiment_metrics = dict()
# create folder
os.makedirs(f'results/backup/{BASE_MODEL}', exist_ok=True)

for checkpoint in range(4000, 144000, 1000):

    experiment_metrics = run_iteration(df, checkpoint=checkpoint, experiment_metrics=experiment_metrics)
    experiment_metrics = process_backup_results(df, checkpoint, experiment_metrics)

    # save to file, using pytorch format
    torch.save(experiment_metrics, f'results/backup/{BASE_MODEL}/nmh_backup_metrics.pt')



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer




Checkpoint 4000:
Heads ablated:            []
Original logit diff:      0.4079501033
Post ablation logit diff: 0.4079501033
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Checkpoint 5000:
Heads ablated:            []
Original logit diff:      1.1939518452
Post ablation logit diff: 1.1939518452
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer




Checkpoint 6000:
Heads ablated:            []
Original logit diff:      1.8560795784
Post ablation logit diff: 1.8560795784
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 30.952380952380953%
Checkpoint 7000:
Heads ablated:            []
Original logit diff:      1.8438247442
Post ablation logit diff: 1.8438247442
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 13.80952380952381%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 30.952380952380953%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 88.09523809523809%
Checkpoint 8000:
Heads ablated:            [(8, 2)]
Original logit diff:      2.3623378277
Post ablation logit diff: 2.5049126148
Logit diff % change:      6.04%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 27.142857142857142%




Checkpoint 9000:
Heads ablated:            []
Original logit diff:      1.9961944818
Post ablation logit diff: 1.9961944818
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 98.09523809523809%
Checkpoint 10000:
Heads ablated:            [(8, 1), (8, 10)]
Original logit diff:      2.4216914177
Post ablation logit diff: 2.9410898685
Logit diff % change:      21.45%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 33.33333333333333%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 29.523809523809526%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 82.85714285714286%




Checkpoint 11000:
Heads ablated:            [(10, 7)]
Original logit diff:      2.5263617039
Post ablation logit diff: 2.5399458408
Logit diff % change:      0.54%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 82.85714285714286%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 28.095238095238095%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 33.33333333333333%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 98.09523809523809%




Checkpoint 12000:
Heads ablated:            [(10, 7), (8, 2)]
Original logit diff:      2.4225206375
Post ablation logit diff: 2.5701422691
Logit diff % change:      6.09%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 86.19047619047619%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 27.61904761904762%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 36.666666666666664%




Checkpoint 13000:
Heads ablated:            [(10, 7)]
Original logit diff:      2.1553795338
Post ablation logit diff: 2.1475479603
Logit diff % change:      -0.36%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 97.61904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 27.61904761904762%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 30.476190476190478%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 86.19047619047619%
Checkpoint 14000:
Heads ablated:            [(8, 2), (10, 7)]
Original logit diff:      2.3155550957
Post ablation logit diff: 2.3603396416
Logit diff % change:      1.93%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 84.76190476190476%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 29.04761904761905%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 5.714285714285714%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 30.476190476190478%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%




Checkpoint 15000:
Heads ablated:            [(10, 7), (8, 2)]
Original logit diff:      2.7272334099
Post ablation logit diff: 2.7891819477
Logit diff % change:      2.27%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 79.52380952380952%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 27.142857142857142%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 29.04761904761905%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 84.76190476190476%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 26.190476190476193%




Checkpoint 16000:
Heads ablated:            [(8, 10), (8, 2), (8, 1), (9, 4), (10, 7)]
Original logit diff:      2.5029132366
Post ablation logit diff: 3.2023310661
Logit diff % change:      27.94%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 20.952380952380953%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 83.33333333333334%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 25.71428571428571%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 29.523809523809526%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 81.9047619047619%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 17000:
Heads ablated:            [(10, 7), (9, 4), (8, 1), (8, 2), (8, 10)]
Original logit diff:      2.7875125408
Post ablation logit diff: 3.0608634949
Logit diff % change:      9.81%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 18.095238095238095%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 82.85714285714286%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 29.523809523809526%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 30.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 82.85714285714286%
Checkpoint 18000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10), (9, 4)]
Original logit diff:      2.5325748920
Post ablation logit diff: 2.8281438351
Logit diff % change:      11.67%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 87.61904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 31.428571428571427%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 5.714285714285714%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.61904761904762%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 83.80952380952381%




Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 27.142857142857142%
Checkpoint 19000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10), (9, 4)]
Original logit diff:      2.8950085640
Post ablation logit diff: 2.9904725552
Logit diff % change:      3.30%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 86.66666666666667%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 30.952380952380953%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 29.523809523809526%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%




Checkpoint 20000:
Heads ablated:            [(10, 7), (8, 2)]
Original logit diff:      2.7564985752
Post ablation logit diff: 2.7654702663
Logit diff % change:      0.33%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 82.38095238095238%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 25.71428571428571%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 28.57142857142857%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 90.0%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 15.238095238095239%




Checkpoint 21000:
Heads ablated:            [(9, 4), (8, 10), (8, 2), (8, 1), (10, 7)]
Original logit diff:      2.5499055386
Post ablation logit diff: 2.6521060467
Logit diff % change:      4.01%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 9.523809523809524%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 89.04761904761904%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 32.857142857142854%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 5.714285714285714%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 82.85714285714286%




Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 21.904761904761905%
Checkpoint 22000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10), (9, 4)]
Original logit diff:      3.0459568501
Post ablation logit diff: 3.3277535439
Logit diff % change:      9.25%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 16.666666666666664%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 87.61904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 34.285714285714285%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 80.0%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 32.857142857142854%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 8.095238095238095%
Checkpoint 23000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10), (9, 4)]
Original logit diff:      3.2300357819
Post ablation logit diff: 3.3783571720
Logit diff % change:      4.59%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 20.0%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 87.14285714285714%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 30.952380952380953%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.61904761904762%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.4 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 84.28571428571429%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 25.238095238095237%
Checkpoint 24000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10), (9, 4)]
Original logit d

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 87.61904761904762%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 33.33333333333333%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 25.238095238095237%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 25000:
Heads ablated:            [(10, 7), (8, 2)]
Original logit diff:      2.9707119465
Post ablation logit diff: 2.9417746067
Logit diff % change:      -0.97%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.4 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.3 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 80.47619047619048%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 32.857142857142854%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 24.285714285714285%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 87.14285714285714%
Copy circuit for



Checkpoint 26000:
Heads ablated:            [(8, 10), (8, 2), (8, 1), (9, 4), (10, 7)]
Original logit diff:      3.0810892582
Post ablation logit diff: 3.2433164120
Logit diff % change:      5.27%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 12.380952380952381%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 89.52380952380953%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 31.428571428571427%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 25.238095238095237%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 78.57142857142857%
Checkpoint 27000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10), (9, 4)]
Original logit diff:      2.7954201698
Post ablation logit diff: 2.95267987

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 85.23809523809524%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.61904761904762%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 30.952380952380953%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 31.9047619047619%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 87.14285714285714%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 18.571428571428573%
Checkp

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 82.85714285714286%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.14285714285714%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 28.57142857142857%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 90.0%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 34.76190476190476%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 16.19047619047619%
Checkpoint 29000:
Heads ablated:            [(9, 4), (8, 10), (8, 2), (8, 1), (10, 7)]
Original logit di

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 88.09523809523809%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 34.76190476190476%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 23.333333333333332%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 30000:
Heads ablated:            [(10, 7), (8, 2)]
Original logit diff:      3.0919957161
Post ablation logit diff: 3.0055809021
Logit diff % change:      -2.79%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 80.95238095238095%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 21.904761904761905%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 31.428571428571427%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 89.04761904761904%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.4285714285714286%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 16.19047619047619%
Checkpoint 31000:
Heads ablated:            [(9, 4), (8, 2), (8, 1), (10, 7)]
Original logit diff:      2.8969213963
Post ablation logit diff: 3.1643812656
Logit diff % change:      9.23%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 15.238095238095239%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 89.04761904761904%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 31.9047619047619%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 22.380952380952383%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 81.42857142857143%




Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Checkpoint 32000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10), (9, 4)]
Original logit diff:      2.9847354889
Post ablation logit diff: 3.0086522102
Logit diff % change:      0.80%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 81.42857142857143%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.14285714285714%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 28.095238095238095%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 32.38095238095238%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 89.04761904761904%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 14.761904761904763%




Checkpoint 33000:
Heads ablated:            [(9, 4), (8, 10), (8, 2), (8, 1), (10, 7)]
Original logit diff:      3.0808959007
Post ablation logit diff: 3.2220525742
Logit diff % change:      4.58%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 6.11 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 20.0%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 5.714285714285714%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 87.14285714285714%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 31.428571428571427%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 83.80952380952381%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 99.04761904761905%
Checkpoint 34000:
Heads ablated:            [(10, 7), (9, 4), (8, 1), (8, 2), (8, 10)]
Original logit diff:      3.1579537392
Post ablation logit diff: 3.2764453888
Logit diff % change:      3.75%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 31.9047619047619%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 89.52380952380953%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 23.333333333333332%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%




Checkpoint 35000:
Heads ablated:            [(8, 10), (8, 2), (10, 7)]
Original logit diff:      3.3524379730
Post ablation logit diff: 3.2118079662
Logit diff % change:      -4.19%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 13.333333333333334%
Copy circuit for head 6.11 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.6 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 90.47619047619048%
Copy circuit for head 10.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 33.33333333333333%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 16.666666666666664%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Co

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 32.857142857142854%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 89.52380952380953%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 14.285714285714285%
Checkpoint 37000:
Heads ablated:            [(8, 10), (8, 2), (8, 1), (10, 7)]
Original logit diff:      3.2621905804
Post ablation logit diff: 3.1055059433
Logit diff % change:      -4.80%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 12.380952380952381%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 88.57142857142857%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 36.19047619047619%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 25.71428571428571%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 86.19047619047619%
Checkpoint 38000:
Heads ablated:            [(10, 7), (8, 2), (8, 10), (9, 4)]
Original logit diff:      3.6383097172
Post ablation logit diff: 3.49257

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 85.71428571428571%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 32.857142857142854%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 5.714285714285714%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 23.333333333333332%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 87.61904761904762%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 13.333333333333334%
Chec

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 34.285714285714285%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 85.71428571428571%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 6.666666666666667%




Checkpoint 40000:
Heads ablated:            [(8, 10), (8, 2), (10, 7)]
Original logit diff:      3.3140232563
Post ablation logit diff: 3.1424901485
Logit diff % change:      -5.18%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 14.761904761904763%
Copy circuit for head 6.11 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 84.28571428571429%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 34.285714285714285%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 8.095238095238095%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 84.76190476190476%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 32.857142857142854%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 88.57142857142857%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 8.095238095238095%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Checkpoint 42000:
Heads ablated:            [(10, 7), (8, 2), (9, 4)]
Original logit diff:      2.9047389030
Post ablation logit diff: 3.1141977310
Logit diff % change:      7.21%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 33.33333333333333%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 91.42857142857143%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.14285714285714%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 30.0%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 82.85714285714286%
Copy circuit for head 10.6 (sign=



Checkpoint 43000:
Heads ablated:            [(9, 4), (8, 10), (8, 2), (8, 1), (10, 7)]
Original logit diff:      3.2269480228
Post ablation logit diff: 3.4437601566
Logit diff % change:      6.72%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 90.47619047619048%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 30.952380952380953%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 9.523809523809524%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 28.57142857142857%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 82.85714285714286%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 12.380952380952381%
Checkpoint 44000:
Heads ablated:            [(9, 4), (8, 10), (8, 2), (10, 7)]
Original lo

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 78.57142857142857%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 28.57142857142857%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 9.523809523809524%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 32.38095238095238%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Checkpoint 45000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10)]
Original logit diff:      3.5383903980
Post ablation logit diff: 3.4805328846
Logit diff % change:      -1.64%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 5.714285714285714%
Copy circuit for head 8.4 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 33.33333333333333%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 25.238095238095237%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 80.0%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 9.523809523809524%
Copy circuit for head 6.11 (sign



Checkpoint 46000:
Heads ablated:            [(8, 10), (8, 2), (8, 1), (10, 7)]
Original logit diff:      3.1439352036
Post ablation logit diff: 3.3119297028
Logit diff % change:      5.34%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 79.04761904761905%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 8.095238095238095%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 9.523809523809524%
Checkpoint 47000:
Heads ablated:            [(10, 7), (8, 2), (8, 10)]
Original logit diff:      3.8844797611
Post ablation logit diff: 3.8075649738
Logit diff % change:      -1.98%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 30.952380952380953%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 10.476190476190476%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 78.57142857142857%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 6.666666666666667%
Checkpoint 48000:
Heads ablated:            [(8, 10), (8, 2), (10, 7)]
Original logit diff:      3.0278050900
Post ablation logit diff: 3.0812087059
Logit diff % change:      1.76%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 94.28571428571428%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 8.095238095238095%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 34.285714285714285%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 21.428571428571427%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 76.66666666666667%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 6.11 (sign



Checkpoint 49000:
Heads ablated:            [(8, 10), (8, 2), (8, 1), (9, 4), (10, 7)]
Original logit diff:      3.4284999371
Post ablation logit diff: 3.6376180649
Logit diff % change:      6.10%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 93.80952380952381%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 21.904761904761905%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 77.61904761904762%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 9.523809523809524%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 29.523809523809526%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 9.523809523809524%




Checkpoint 50000:
Heads ablated:            [(8, 10), (8, 2), (8, 1), (9, 4), (10, 7)]
Original logit diff:      3.2632486820
Post ablation logit diff: 3.3333494663
Logit diff % change:      2.15%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 79.52380952380952%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 93.80952380952381%
Checkpoint 51000:
Heads ablated:            [(10, 7), (8, 2), (9, 4)]
Original logit diff:      2.9528236389
Post ablation logit diff: 3.0399122238
Logit diff % change:      2.95%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 74.76190476190476%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 43.333333333333336%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 9.523809523809524%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 52000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.8311219215
Post ablation logit diff: 3.9337983131
Logit diff % change:      2.68%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 96.19047619047619%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 37.61904761904762%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 20.952380952380953%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 75.71428571428571%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for 



Checkpoint 53000:
Heads ablated:            [(9, 4), (8, 10), (8, 2), (8, 1), (10, 7)]
Original logit diff:      3.4068624973
Post ablation logit diff: 3.8794670105
Logit diff % change:      13.87%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 94.76190476190476%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 20.0%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 76.19047619047619%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 40.476190476190474%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 12.380952380952381%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%




Checkpoint 54000:
Heads ablated:            [(8, 10), (8, 2), (9, 4), (10, 7)]
Original logit diff:      3.7859270573
Post ablation logit diff: 3.8625731468
Logit diff % change:      2.02%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 75.71428571428571%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 11.904761904761903%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 40.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 94.28571428571428%




Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 11.428571428571429%
Checkpoint 55000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10), (9, 4)]
Original logit diff:      3.9617173672
Post ablation logit diff: 4.1993093491
Logit diff % change:      6.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 75.71428571428571%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 20.0%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 96.19047619047619%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 92.38095238095238%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 12.380952380952381%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 37.61904761904762%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 11.428571428571429%
Copy circuit for head 8.8 (sign=

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 6.190476190476191%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 75.71428571428571%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 20.476190476190474%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 43.333333333333336%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 90.47619047619048%




Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 13.80952380952381%
Checkpoint 57000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10), (9, 4)]
Original logit diff:      3.4276928902
Post ablation logit diff: 3.7153589725
Logit diff % change:      8.39%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 81.42857142857143%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 75.71428571428571%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 4.285714285714286%
Checkpoint 58000:
Heads ablated:            [(9, 4), (8, 10), (8, 2), (10, 7)]
Original logit diff:      3.5477247238
Post ablation logit diff: 3.6419198513
Logit diff % change:      2.66%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 6.11 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 77.14285714285715%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 20.952380952380953%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 12.857142857142856%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 42.857142857142854%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit fo



Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 15.714285714285714%
Checkpoint 59000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10), (9, 4)]
Original logit diff:      3.9921560287
Post ablation logit diff: 4.0272884369
Logit diff % change:      0.88%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 78.0952380952381%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 20.952380952380953%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 11.904761904761903%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 76.66666666666667%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 17.142857142857142%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 50.476190476190474%
Checkpoint 60000:
Heads ablated:            [(10, 7), (8, 2), (8, 10), (9, 4)]
Original logit diff:      4.0257959366
Post ablation logit diff: 4.1613192558
Logit diff % change:      3.37%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 49.047619047619044%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 13.333333333333334%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 76.19047619047619%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 19.047619047619047%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 15.714285714285714%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 67.61904761904762%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.14285714285714%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 61000:
Heads ablated:            [(10, 7), (8, 1), (8, 10), (8, 2)]
Original logit diff:      3.4262354374
Post ablation logit diff: 3.7369272709
Logit diff % change:      9.07%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 76.19047619047619%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 21.428571428571427%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 12.857142857142856%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 50.476190476190474%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 17.61904761904762%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 67.14285714285714%
Checkpoint 6200

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 19.047619047619047%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 62.38095238095238%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 50.476190476190474%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 12.857142857142856%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 76.19047619047619%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 21.428571428571427%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 4.285714285714286%




Checkpoint 63000:
Heads ablated:            [(8, 10), (8, 2), (10, 7)]
Original logit diff:      3.7453613281
Post ablation logit diff: 3.7493555546
Logit diff % change:      0.11%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 58.57142857142858%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 20.476190476190474%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 47.14285714285714%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 75.71428571428571%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 9.523809523809524%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 12.380952380952381%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Checkpoint 64000:
Heads ablated:            [(8, 10), (8, 2), (10, 7)]
Original logit diff:      4.1064085960
Post ablation logit diff: 4.2660937309
Logit diff % change:      3.89%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 76.66666666666667%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 20.952380952380953%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 14.761904761904763%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 95.23809523809523%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 52.85714285714286%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 19.523809523809526%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 46.19047619047619%
Checkpoint 65000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10)]
Original logit diff:      3.8020830154
Post ablation logit diff: 4.0923237801
Logit diff % change:      7.63%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 17.61904761904762%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 51.90476190476191%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 14.761904761904763%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 74.28571428571429%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.0%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 4.761904761904762%
Checkpoint 66000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.8306570053
Post ablation logit diff: 4.1305289268
Logit diff % change:      7.83%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 56.19047619047619%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 23.333333333333332%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 50.476190476190474%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 14.761904761904763%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 75.71428571428571%




Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.0%
Checkpoint 67000:
Heads ablated:            [(8, 10), (8, 2), (10, 7)]
Original logit diff:      3.7540388107
Post ablation logit diff: 4.1097245216
Logit diff % change:      9.47%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 49.523809523809526%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 18.571428571428573%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 51.42857142857142%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 21.904761904761905%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 73.80952380952381%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%




Checkpoint 68000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      4.1240215302
Post ablation logit diff: 4.0902123451
Logit diff % change:      -0.82%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 21.428571428571427%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 48.57142857142857%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 54.285714285714285%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 13.80952380952381%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 75.71428571428571%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 19.523809523809526%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%




Checkpoint 69000:
Heads ablated:            [(8, 10), (8, 2), (8, 1), (10, 7)]
Original logit diff:      4.0991587639
Post ablation logit diff: 4.1630911827
Logit diff % change:      1.56%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 76.66666666666667%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 19.047619047619047%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 12.857142857142856%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 43.80952380952381%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 21.904761904761905%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 57.14285714285714%
Checkpoint 70000:
Heads ablated:            [(10, 7), (8, 2), (8, 10)]
Original logit diff:      4.2723064423
Post ablation logit diff: 4.0697107315
Logit diff % change:      -4.74%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 15.238095238095239%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 9.523809523809524%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 76.66666666666667%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 56.19047619047619%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 24.761904761904763%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 45.714285714285715%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 71000:
Heads ablated:            [(10, 7), (8, 2), (8, 10)]
Original logit diff:      4.5890569687
Post ablation logit diff: 4.5939931870
Logit diff % change:      0.11%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 72.38095238095238%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 18.571428571428573%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 13.80952380952381%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 53.333333333333336%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 24.285714285714285%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 33.33333333333333%
Checkpoint 72000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.8

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 75.23809523809524%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 11.904761904761903%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 27.142857142857142%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 29.04761904761905%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 53.333333333333336%
Checkpoint 73000:
Heads ablated:            [(10, 7), (8, 2), (8, 10)]
Original logit diff:      4.1822485924
Post ablation logit diff: 4.1410803795
Logit diff % change:      -0.98%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 37.142857142857146%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 28.095238095238095%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 53.80952380952381%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 15.714285714285714%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 12.380952380952381%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 18.571428571428573%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 72.38095238095238%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Checkpoint 74000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      4.1615295410
Post ablation logit diff: 4.2686285973
Logit diff % change:   

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 73.33333333333333%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 18.571428571428573%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 15.714285714285714%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 50.476190476190474%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 26.190476190476193%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.61904761904762%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 37.61904761904762%
Checkpoint 75000:
Heads ablated:            [(8, 1), (8, 2), (8, 10)]
Original logit diff: 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 31.9047619047619%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 24.761904761904763%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 18.095238095238095%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 11.428571428571429%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 17.61904761904762%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 49.523809523809526%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 72.85714285714285%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%




Checkpoint 76000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      4.1687951088
Post ablation logit diff: 4.0954875946
Logit diff % change:      -1.76%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 53.333333333333336%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 17.61904761904762%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 74.76190476190476%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 18.095238095238095%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 24.761904761904763%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 29.04761904761905%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 77000:
He

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 11.428571428571429%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 18.571428571428573%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 22.857142857142858%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 52.85714285714286%
Checkpoint 78000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.9371225834
Post ablation logit diff: 4.3170328140
Logit diff % change:      9.65%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 24.761904761904763%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 22.380952380952383%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 52.38095238095239%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 18.095238095238095%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.476190476190476%




Checkpoint 79000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      4.3646335602
Post ablation logit diff: 4.4347558022
Logit diff % change:      1.61%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 20.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 22.380952380952383%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 45.714285714285715%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 13.80952380952381%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 17.61904761904762%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 70.47619047619048%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.476190476190476%




Checkpoint 80000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      4.4647302628
Post ablation logit diff: 4.4390048981
Logit diff % change:      -0.58%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 70.0%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 17.142857142857142%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.476190476190476%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 16.666666666666664%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 51.90476190476191%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 20.476190476190474%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 24.761904761904763%




Checkpoint 81000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      4.2701420784
Post ablation logit diff: 4.4379029274
Logit diff % change:      3.93%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 11.904761904761903%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 15.714285714285714%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 51.90476190476191%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 82000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.7159681320
Post ablation logit diff: 4.0923280716
Logit diff % change:      10.13%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 11.904761904761903%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 19.047619047619047%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 55.23809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 83000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.9499201775
Post ablation logit diff: 4.1649451256
Logit diff % change:      5.44%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 58.57142857142858%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 19.047619047619047%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 11.904761904761903%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Checkpoint 84000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.6950199604
Post ablation logit diff: 3.9329512119
Logit diff % change:      6.44%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 8.095238095238095%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 53.80952380952381%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 20.0%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 11.904761904761903%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Checkpoint 85000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      4.4282345772
Post ablation logit diff: 4.7502684593
Logit diff % change:      7.27%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 50.476190476190474%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 20.952380952380953%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 74.28571428571429%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 15.714285714285714%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.14285714285714%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 86000:
Heads ablated:            [(8, 1), (8, 10), (8, 2)]
Original logit diff:      4.7322416306
Post ablation logit diff: 5.0137224197
Logit diff % change:      5.9

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 11.428571428571429%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 54.761904761904766%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 87000:
Heads ablated:            [(8, 1), (8, 2), (8, 10)]
Original logit diff:      4.2836489677
Post ablation logit diff: 4.2701163292
Logit diff % change:      -0.32%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 88000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      4.8267207146
Post ablation logit diff: 4.4432830811
Logit diff % change:      -7.94%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 89000:
Heads ablated:            [(8, 10)]
Original logit diff:      3.8253905773
Post ablation logit diff: 4.5747113228
Logit diff % change:      19.59%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 29.523809523809526%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 49.047619047619044%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 17.61904761904762%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 73.80952380952381%




Checkpoint 90000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      4.0528397560
Post ablation logit diff: 4.2227921486
Logit diff % change:      4.19%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 5.714285714285714%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 73.33333333333333%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 91000:
Heads ablated:            [(8, 1), (8, 2), (8, 10)]
Original logit diff:      3.5448956490
Post ablation logit diff: 4.0310006142
Logit diff % change:      13.71%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 73.80952380952381%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 98.09523809523809%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 92000:
Heads ablated:            [(8, 1), (8, 2), (8, 10)]
Original logit diff:      3.1936087608
Post ablation logit diff: 3.6164159775
Logit diff % change:      13.24%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 5.238095238095238%
Checkpoint 93000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.5039720535
Post ablation logit diff: 3.6234147549
Logit diff % change:      3.41%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 4.285714285714286%
Checkpoint 94000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.1994807720
Post ablation logit diff: 3.6220529079
Logit diff % change:      13.21%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 95000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.9099199772
Post ablation logit diff: 4.0544338226
Logit diff % change:      3.70%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 44.285714285714285%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 44.285714285714285%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Checkpoint 96000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.5936560631
Post ablation logit diff: 3.7887003422
Logit diff % change:      5.43%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 97000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.8341448307
Post ablation logit diff: 4.0037817955
Logit diff % change:      4.42%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 96.19047619047619%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 53.80952380952381%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Checkpoint 98000:
Heads ablated:            [(8, 10), (8, 2), (8, 1)]
Original logit diff:      3.4808588028
Post ablation logit diff: 3.5355644226
Logit diff % change:      1.57%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 74.28571428571429%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 12.857142857142856%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 47.61904761904761%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 12.857142857142856%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.61904761904762%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 42.38095238095238%
Checkpoint 99000:
Heads ablated:            [(8, 1), (8, 2), (8, 10)]
Original logit diff:      3.7597773075
Post ablation logit diff: 3.8471848965
Logit diff % change:      2.32%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 15.714285714285714%




Checkpoint 100000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.6237792969
Post ablation logit diff: 3.5648171902
Logit diff % change:      -1.63%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 70.0%
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.14285714285714%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 23.333333333333332%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 51.90476190476191%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 9.523809523809524%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 18.095238095238095%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 79.52380952380952%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Checkpoint 101000:
Heads ablated

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 71.9047619047619%
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 25.238095238095237%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 48.57142857142857%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 18.571428571428573%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 80.0%
Copy circuit for head 9.9 (sign=



Checkpoint 102000:
Heads ablated:            [(8, 10), (8, 2), (8, 1), (10, 7)]
Original logit diff:      3.7126774788
Post ablation logit diff: 3.1538050175
Logit diff % change:      -15.05%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 43.80952380952381%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%




Checkpoint 103000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      4.0290465355
Post ablation logit diff: 3.9410593510
Logit diff % change:      -2.18%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 53.80952380952381%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Checkpoint 104000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.7538783550
Post ablation logit diff: 3.5125946999
Logit diff % change:      -6.43%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 34.76190476190476%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 82.85714285714286%
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 97.14285714285714%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 18.095238095238095%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 86.66666666666667%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 19.523809523809526%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.857142857142857%




Checkpoint 105000:
Heads ablated:            [(9, 4), (8, 10), (8, 2), (8, 1), (10, 7)]
Original logit diff:      3.3821690083
Post ablation logit diff: 3.1830327511
Logit diff % change:      -5.89%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 8.095238095238095%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 96.19047619047619%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 11.904761904761903%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 19.523809523809526%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.8 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 81.9047619047619%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 11.428571428571429%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 38.57142857142858%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 27.142857142857142%
Co



Checkpoint 106000:
Heads ablated:            [(8, 10), (8, 1), (10, 7)]
Original logit diff:      3.3200852871
Post ablation logit diff: 3.1718740463
Logit diff % change:      -4.46%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 107000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.5197105408
Post ablation logit diff: 3.4115154743
Logit diff % change:      -3.07%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 70.47619047619048%
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 26.666666666666668%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 11.904761904761903%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Checkpoint 108000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      4.0382037163
Post ablation logit diff: 3.7155592442
Logit diff % change:      -7.99%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 30.476190476190478%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Checkpoint 109000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.4698822498
Post ablation logit diff: 3.2715702057
Logit diff % change:      -5.72%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 78.0952380952381%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 11.428571428571429%




Checkpoint 110000:
Heads ablated:            [(8, 10), (8, 2), (10, 7)]
Original logit diff:      3.4515860081
Post ablation logit diff: 3.1630399227
Logit diff % change:      -8.36%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 29.523809523809526%




Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Checkpoint 111000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.5362937450
Post ablation logit diff: 3.4552791119
Logit diff % change:      -2.29%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 94.28571428571428%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 12.857142857142856%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 29.04761904761905%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 19.047619047619047%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 29.523809523809526%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 84.76190476190476%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 5.0 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 112000:
Heads abl

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 11.428571428571429%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 82.38095238095238%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 17.142857142857142%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 26.666666666666668%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 23.333333333333332%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 70.95238095238095%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 15.238095238095239%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 93.80952380952381%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Chec

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 64.28571428571429%
Checkpoint 114000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.5887677670
Post ablation logit diff: 3.3340165615
Logit diff % change:      -7.10%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer




Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 115000:
Heads ablated:            [(8, 10)]
Original logit diff:      3.6147425175
Post ablation logit diff: 3.7619035244
Logit diff % change:      4.07%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 80.47619047619048%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 23.809523809523807%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 116000:
Heads ablated:            [(10, 7), (8, 2), (8, 10)]
Original logit diff:      3.9088082314
Post ablation logit diff: 3.5745005608
Logit diff % change:      -8.55%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Checkpoint 117000:
Heads ablated:            [(8, 10)]
Original logit diff:      3.7798814774
Post ablation logit diff: 3.9516112804
Logit diff % change:      4.54%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 19.523809523809526%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 56.666666666666664%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 20.476190476190474%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 80.47619047619048%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Checkpoint 118000:
Heads ablated:            [(8, 10), (8, 2), (10, 7)]
Original logit diff:      3.9797604084
Post ablation logit diff: 3.5991773605
Logit diff % c

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 82.85714285714286%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 15.714285714285714%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 16.19047619047619%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 82.85714285714286%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 9.2 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 52.38095238095239%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 21.428571428571427%
Copy circuit for 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 78.0952380952381%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Checkpoint 120000:
Heads ablated:            [(10, 7), (8, 2), (8, 10)]
Original logit diff:      3.9677882195
Post ablation logit diff: 3.7978868484
Logit diff % change:      -4.28%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 79.04761904761905%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 15.238095238095239%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 17.142857142857142%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 82.85714285714286%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 49.047619047619044%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 16.666666666666664%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 14.761904761904763%




Checkpoint 121000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10)]
Original logit diff:      3.8009722233
Post ablation logit diff: 3.3016083241
Logit diff % change:      -13.14%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 40.0%
Copy circuit for head 9.2 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 74.76190476190476%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 20.0%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 13.333333333333334%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 10.476190476190476%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 8.095238095238095%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 14.285714285714285%
Copy circuit for head 10.8 (sign=1) : Top 5 a



Copy circuit for head 5.0 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 122000:
Heads ablated:            [(8, 10), (8, 2), (10, 7), (5, 0)]
Original logit diff:      3.6271793842
Post ablation logit diff: 3.3270494938
Logit diff % change:      -8.27%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 21.428571428571427%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Checkpoint 123000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.5861654282
Post ablation logit diff: 3.4430840015
Logit diff % change:      -3.99%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 5.0 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 76.19047619047619%
Copy circuit for head 10.8 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 13.80952380952381%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 12.380952380952381%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 22.380952380952383%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 8.10 (si

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 77.14285714285715%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 11.904761904761903%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 14.761904761904763%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 22.857142857142858%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 76.66666666666667%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit fo



Checkpoint 125000:
Heads ablated:            [(10, 7), (8, 1), (8, 2), (8, 10)]
Original logit diff:      3.8746874332
Post ablation logit diff: 3.4469809532
Logit diff % change:      -11.04%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 5.0 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 74.76190476190476%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 11.904761904761903%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 8.095238095238095%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 23.809523809523807%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 69.04761904761905%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.11 (sig



Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 11.428571428571429%
Checkpoint 126000:
Heads ablated:            [(5, 0), (8, 2), (8, 10)]
Original logit diff:      4.0825066566
Post ablation logit diff: 4.0230722427
Logit diff % change:      -1.46%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 127000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.4978530407
Post ablation logit diff: 3.6705369949
Logit diff % change:      4.94%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 22.380952380952383%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Checkpoint 128000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.6692709923
Post ablation logit diff: 3.6408827305
Logit diff % change:      -0.77%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 10.6 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 72.38095238095238%
Copy circuit for head 10.8 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 23.333333333333332%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for



Checkpoint 129000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.6639120579
Post ablation logit diff: 3.4995851517
Logit diff % change:      -4.49%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.04761904761905%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 10.952380952380953%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 73.33333333333333%




Checkpoint 130000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.8661262989
Post ablation logit diff: 3.5244510174
Logit diff % change:      -8.84%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 29.04761904761905%
Copy circuit for head 9.2 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 25.71428571428571%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 8.571428571428571%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 7.6190476190476195%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 10.476190476190476%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.8 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 69.04761904761905%
Copy



Checkpoint 131000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.6959834099
Post ablation logit diff: 3.6061725616
Logit diff % change:      -2.43%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 23.333333333333332%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 3.8095238095238098%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 63.33333333333333%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.8095238095238098%




Copy circuit for head 5.0 (sign=1) : Top 5 accuracy: 100.0%
Checkpoint 132000:
Heads ablated:            [(8, 10), (8, 2), (5, 0)]
Original logit diff:      4.1065506935
Post ablation logit diff: 4.0874266624
Logit diff % change:      -0.47%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer




Checkpoint 133000:
Heads ablated:            []
Original logit diff:      3.4976053238
Post ablation logit diff: 3.4976053238
Logit diff % change:      0.00%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 72.85714285714285%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 27.142857142857142%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%




Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 10.952380952380953%
Checkpoint 134000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.6688826084
Post ablation logit diff: 3.5282623768
Logit diff % change:      -3.83%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 72.38095238095238%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 10.0%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 30.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 99.52380952380952%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 28.095238095238095%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 9.047619047619047%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 4.285714285714286%




Checkpoint 135000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.7239780426
Post ablation logit diff: 3.6856148243
Logit diff % change:      -1.03%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 62.38095238095238%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 30.476190476190478%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 46.19047619047619%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 95.71428571428572%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 5.238095238095238%
Checkpoint 136000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.8265407085
Post ablation logit diff: 3.9418013096
Logit diff % change:      3.01%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 5.0 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.380952380952381%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 10.6 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 66.19047619047619%
Copy circuit for head 10.8 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 6.666666666666667%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 25.238095238095237%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 29.523809523809526%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 45.714285714285715%
Copy c



Checkpoint 137000:
Heads ablated:            [(5, 0), (8, 2), (8, 10)]
Original logit diff:      3.5678858757
Post ablation logit diff: 3.7611389160
Logit diff % change:      5.42%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 5.0 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 66.19047619047619%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 32.38095238095238%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 95.23809523809523%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 21.428571428571427%




Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 4.285714285714286%
Checkpoint 138000:
Heads ablated:            [(5, 0), (8, 2), (8, 10)]
Original logit diff:      3.8395097256
Post ablation logit diff: 3.8581306934
Logit diff % change:      0.48%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 61.904761904761905%
Copy circuit for head 10.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 5.238095238095238%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 31.428571428571427%
Copy circuit for head 7.11 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 47.14285714285714%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 96.66666666666667%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.2 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 20.952380952380953%
Copy



Checkpoint 139000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.9237160683
Post ablation logit diff: 3.8220305443
Logit diff % change:      -2.59%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 25.71428571428571%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 98.57142857142858%
Checkpoint 140000:
Heads ablated:            [(8, 10), (8, 2)]
Original logit diff:      3.5421364307
Post ablation logit diff: 3.6935260296
Logit diff % change:      4.27%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 3.3333333333333335%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 62.857142857142854%
Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 30.476190476190478%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 46.19047619047619%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 96.19047619047619%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 24.285714285714285%




Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 4.761904761904762%
Checkpoint 141000:
Heads ablated:            [(8, 2), (8, 10)]
Original logit diff:      3.6798231602
Post ablation logit diff: 3.9012730122
Logit diff % change:      6.02%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 1.4285714285714286%
Copy circuit for head 9.6 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 16.666666666666664%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 95.23809523809523%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 36.666666666666664%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 29.523809523809526%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 0.4761904761904762%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 5.714285714285714%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 4.285714285714286%
Copy circuit for head 10.8 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 60.476190476190474%
Copy circuit for head 5.0 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for 



Checkpoint 142000:
Heads ablated:            [(8, 10), (8, 2), (5, 0)]
Original logit diff:      4.1640057564
Post ablation logit diff: 4.0420594215
Logit diff % change:      -2.93%


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-160m into HookedTransformer
Copy circuit for head 6.6 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 10.7 (sign=1) : Top 5 accuracy: 66.19047619047619%
Copy circuit for head 10.8 (sign=1) : Top 5 accuracy: 1.9047619047619049%
Copy circuit for head 10.11 (sign=1) : Top 5 accuracy: 7.142857142857142%
Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 4.761904761904762%
Copy circuit for head 9.8 (sign=1) : Top 5 accuracy: 0.9523809523809524%
Copy circuit for head 7.8 (sign=1) : Top 5 accuracy: 33.80952380952381%
Copy circuit for head 8.1 (sign=1) : Top 5 accuracy: 39.523809523809526%
Copy circuit for head 8.2 (sign=1) : Top 5 accuracy: 98.57142857142858%
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 2.857142857142857%
Copy circuit for head 8.10 (sign=1) : Top 5 accuracy: 100.0%
Copy circuit for head 9.4 (sign=1) : Top 5 accuracy: 20.952380952380953%
Copy circuit for head 9.7 (sign=1) : Top 5 accuracy: 2.380952380952381%
Checkpoi

In [49]:
experiment_metrics = torch.load(f'results/backup/{BASE_MODEL}/nmh_backup_metrics.pt')

In [50]:
experiment_metrics

{10000: {'logit_diff': 2.421691417694092,
  'per_head_logit_diffs': array([[-1.91408989e-03,  5.08053578e-04,  1.10229256e-03,
          -5.32193296e-03,  4.81794879e-04, -2.42428086e-03,
          -4.75557381e-03,  6.66759536e-03, -2.47690501e-03,
           2.50994391e-03, -1.95404282e-03, -4.71131457e-03],
         [-1.08297328e-02,  5.00292005e-03,  3.60604352e-03,
          -2.03508302e-03,  2.70822609e-04, -7.41213048e-03,
           4.72147251e-03, -1.63456390e-03, -2.15056818e-03,
          -1.16616171e-02, -2.62599206e-03, -2.21730908e-03],
         [-1.35358865e-03, -5.37946820e-03, -3.86888464e-03,
          -1.04065752e-02, -3.19856056e-03,  2.13863491e-03,
          -9.56851430e-03, -1.56810775e-03, -2.34713836e-04,
           1.28491549e-03, -1.61476852e-03, -1.16304622e-03],
         [-1.02184508e-02, -3.13080754e-03, -1.41960022e-03,
           5.12934895e-03,  7.26311794e-03,  1.29262055e-03,
           5.76168313e-05,  1.14478706e-03,  5.53678954e-03,
           7.476