# Indirect Object Identification Circuit in Pythia

In [2]:

IN_COLAB = False
from IPython import get_ipython

ipython = get_ipython()
# Automatically reload the HookedTransformer code as it's edited, without restarting the kernel.
# `run_line_magic` replaces the deprecated `ipython.magic(...)` call, and the guard lets this
# cell run when the code is executed outside an IPython session (get_ipython() returns None).
if ipython is not None:
    ipython.run_line_magic("load_ext", "autoreload")
    ipython.run_line_magic("autoreload", "2")

  ipython.magic("load_ext autoreload")
  ipython.magic("autoreload 2")


In [3]:
import os
import pathlib
from typing import List, Optional, Union

import torch
import numpy as np
import yaml

import einops
from fancy_einsum import einsum

from datasets import load_dataset
from transformers import pipeline
import plotly.io as pio
import plotly.express as px
#import pysvelte
from IPython.display import HTML

import plotly.graph_objs as go
import ipywidgets as widgets
from IPython.display import display

# if IN_COLAB or not DEBUG_MODE:
#     # Thanks to annoying rendering issues, Plotly graphics will either show up in colab OR Vscode depending on the renderer - this is bad for developing demos! Thus creating a debug mode.
#     pio.renderers.default = "colab"
# else:
#     pio.renderers.default = "plotly_mimetype+notebook"

# Pick the compute device: the CUDA index from LOCAL_RANK (default 0) when a GPU
# is available, otherwise fall back to the CPU.
device = int(os.environ.get("LOCAL_RANK", 0)) if torch.cuda.is_available() else "cpu"

In [4]:
import transformers
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
import transformer_lens
import transformer_lens.utils as utils
import transformer_lens.patching as patching
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

from functools import partial

from torchtyping import TensorType as TT

In [4]:
import huggingface_hub
# Show the interactive Hugging Face login widget so later model downloads are authenticated.
huggingface_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Turn autograd off globally: the cells below only run forward passes with hooks.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7fc6644de430>

In [6]:
from neel_plotly import line, imshow, scatter

def l_imshow(tensor, renderer=None, **kwargs):
    """Show `tensor` as a heatmap on a red/blue diverging scale centred on zero.

    Extra keyword arguments are forwarded to `px.imshow`; `renderer` is passed
    to the figure's `show` call.
    """
    fig = px.imshow(
        utils.to_numpy(tensor),
        color_continuous_midpoint=0.0,
        color_continuous_scale="RdBu",
        **kwargs,
    )
    fig.show(renderer)

def l_line(tensor, renderer=None, **kwargs):
    """Plot `tensor` as a single line (values on the y axis).

    Extra keyword arguments are forwarded to `px.line`; `renderer` is passed
    to the figure's `show` call.
    """
    values = utils.to_numpy(tensor)
    fig = px.line(y=values, **kwargs)
    fig.show(renderer)

def l_scatter(x, y, xaxis="", yaxis="", caxis="", renderer=None, **kwargs):
    """Scatter-plot `y` against `x` with custom axis / colour-bar labels.

    `xaxis`, `yaxis` and `caxis` become the plotly labels for "x", "y" and
    "color". Extra keyword arguments are forwarded to `px.scatter`.
    """
    axis_labels = {"x": xaxis, "y": yaxis, "color": caxis}
    fig = px.scatter(
        x=utils.to_numpy(x),
        y=utils.to_numpy(y),
        labels=axis_labels,
        **kwargs,
    )
    fig.show(renderer)

def two_lines(tensor1, tensor2, renderer=None, **kwargs):
    """Plot two tensors as two lines on the same figure.

    Extra keyword arguments are forwarded to `px.line`; `renderer` is passed
    to the figure's `show` call.
    """
    series = [utils.to_numpy(t) for t in (tensor1, tensor2)]
    fig = px.line(y=series, **kwargs)
    fig.show(renderer)

## Final Model Setup

In [7]:
# Download the "step143000" revision of the Pythia weights from Hugging Face.
source_model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-v1.1-160m", revision="step143000")
# Wrap those weights in a HookedTransformer so activations can be cached and patched.
# NOTE(review): the TransformerLens name ("pythia-125m") differs from the Hugging Face
# repo name ("pythia-v1.1-160m"); presumably both describe the same architecture — confirm.
model = HookedTransformer.from_pretrained(
    "EleutherAI/pythia-125m",
    hf_model=source_model,
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    #refactor_factored_attn_matrices=True,
)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


## Circuit Definition

In [8]:
from collections import namedtuple

# One component of the circuit:
#   heads         - list of (layer, head) pairs that act as receivers in path patching
#   position      - sequence position passed to path patching for this component
#   receiver_type - attention hook suffix (e.g. 'hook_q') used to build the receiver hook name
CircuitComponent = namedtuple("CircuitComponent", ["heads", "position", "receiver_type"])

circuit = {
    "name-movers": CircuitComponent(
        heads=[(8, 2), (8, 10), (9, 6), (10, 7)],
        position=-1,
        receiver_type="hook_q",
    ),
    "s2-inhibition": CircuitComponent(
        heads=[(6, 6), (7, 2), (7, 9)],
        position=10,
        receiver_type="hook_v",
    ),
    #"duplicate-name": CircuitComponent([], 10, 'head_v'),
    #"induction": CircuitComponent([], 10, 'head_v')
}

## Data Setup

In [9]:
# Prompts come in adjacent pairs that differ only in which name is repeated,
# so each prompt's neighbour serves as its corrupted counterpart.
prompts = [
    'When John and Mary went to the shops, John gave the bag to',
    'When John and Mary went to the shops, Mary gave the bag to',
    'When Tom and James went to the park, James gave the ball to',
    'When Tom and James went to the park, Tom gave the ball to',
    'When Dan and Sid went to the shops, Sid gave an apple to',
    'When Dan and Sid went to the shops, Dan gave an apple to',
    'After Martin and Amy went to the park, Amy gave a drink to',
    'After Martin and Amy went to the park, Martin gave a drink to',
]

# (correct answer, incorrect answer) for each prompt, in the same order as `prompts`.
answers = [
    (' Mary', ' John'),
    (' John', ' Mary'),
    (' Tom', ' James'),
    (' James', ' Tom'),
    (' Dan', ' Sid'),
    (' Sid', ' Dan'),
    (' Martin', ' Amy'),
    (' Amy', ' Martin'),
]

clean_tokens = model.to_tokens(prompts)
# Swap each adjacent pair of rows: flipping the lowest bit maps 0<->1, 2<->3, ...
swapped_rows = [row ^ 1 for row in range(len(clean_tokens))]
corrupted_tokens = clean_tokens[swapped_rows]
print("Clean string 0", model.to_string(clean_tokens[0]))
print("Corrupted string 0", model.to_string(corrupted_tokens[0]))

# One row per prompt: [token id of the correct name, token id of the incorrect name].
answer_token_indices = torch.tensor(
    [[model.to_single_token(name) for name in pair] for pair in answers],
    device=model.cfg.device,
)
print("Answer token indices", answer_token_indices)

Clean string 0 <|endoftext|>When John and Mary went to the shops, John gave the bag to
Corrupted string 0 <|endoftext|>When John and Mary went to the shops, Mary gave the bag to
Answer token indices tensor([[ 6393,  2516],
        [ 2516,  6393],
        [ 6270,  5490],
        [ 5490,  6270],
        [ 5682, 24752],
        [24752,  5682],
        [ 8698, 22138],
        [22138,  8698]], device='cuda:0')


## Tool Setup

### Visualization

In [10]:
import plotly.graph_objs as go
import torch
import ipywidgets as widgets
from IPython.display import display

def visualize_tensor(tensor, labels, zmin=-1.0, zmax=1.0):
    """Browse the slices of a 3-D tensor as heatmaps with a slider.

    Args:
        tensor: torch tensor indexed as `tensor[:, :, k]`; each `k` is shown as
            one heatmap. May live on any device and may track gradients.
        labels: one label per slice along the last axis, shown in the plot title.
        zmin, zmax: fixed colour-scale limits shared by every slice.

    Raises:
        AssertionError: if `len(labels)` differs from the size of the last axis.
    """
    assert len(labels) == tensor.shape[-1], "The number of labels should match the number of slices in the tensor."

    # Convert once up front: calling `.numpy()` directly fails for CUDA tensors
    # and for tensors that require grad, so detach and move to host memory first.
    heat_data = tensor.detach().cpu().numpy()

    # Created before the slider callback is registered so the callback never
    # runs against an undefined `output`.
    output = widgets.Output()

    def plot_slice(selected_slice):
        fig = go.FigureWidget(
            data=go.Heatmap(
                z=heat_data[:, :, selected_slice],
                zmin=zmin,
                zmax=zmax,
                colorscale='RdBu'),
            layout=go.Layout(
                title=f'Slice: {selected_slice} - Step: {labels[selected_slice]}',
                yaxis=dict(autorange='reversed')
                )
        )
        return fig

    def on_slider_change(change):
        # Redraw the heatmap for the newly selected slice.
        selected_slice = change['new']
        fig = plot_slice(selected_slice)
        output.clear_output(wait=True)
        with output:
            display(fig)

    slider = widgets.IntSlider(min=0, max=heat_data.shape[2]-1, step=1, value=0, description='Slice:')
    slider.observe(on_slider_change, names='value')
    display(slider)
    display(output)

    # Initial render of the first slice.
    with output:
        display(plot_slice(0))


### Activation Patching

In [11]:
def get_logit_diff(logits, answer_token_indices=answer_token_indices):
    """Mean (correct - incorrect) answer logit over the batch.

    Args:
        logits: either a 3-D tensor, in which case only the last index of
            axis 1 is used, or a 2-D tensor that is used as-is.
        answer_token_indices: per-row token ids; column 0 is the correct
            answer and column 1 the incorrect one.

    Returns:
        A scalar tensor holding the mean logit difference.
    """
    # Keep only the final-position logits when a full sequence is supplied
    final_logits = logits[:, -1, :] if logits.ndim == 3 else logits
    correct_ids = answer_token_indices[:, 0:1]
    incorrect_ids = answer_token_indices[:, 1:2]
    per_prompt_diff = final_logits.gather(1, correct_ids) - final_logits.gather(1, incorrect_ids)
    return per_prompt_diff.mean()

# Run the model once on each prompt set, keeping every activation for later patching.
clean_logits, clean_cache = model.run_with_cache(clean_tokens)
corrupted_logits, corrupted_cache = model.run_with_cache(corrupted_tokens)

# Baseline logit differences, converted to plain Python floats.
clean_logit_diff = get_logit_diff(clean_logits, answer_token_indices).item()
corrupted_logit_diff = get_logit_diff(corrupted_logits, answer_token_indices).item()

print(f"Clean logit diff: {clean_logit_diff:.4f}")
print(f"Corrupted logit diff: {corrupted_logit_diff:.4f}")

Clean logit diff: 4.7129
Corrupted logit diff: -4.7129


In [12]:
# Logit-difference baselines of the model loaded above, used to normalise the metric.
CLEAN_BASELINE = clean_logit_diff
CORRUPTED_BASELINE = corrupted_logit_diff


def ioi_metric(logits, clean_baseline=CLEAN_BASELINE, corrupted_baseline=CORRUPTED_BASELINE, answer_token_indices=answer_token_indices):
    """Logit difference rescaled so the corrupted baseline maps to 0 and the clean baseline to 1."""
    logit_diff = get_logit_diff(logits, answer_token_indices)
    baseline_gap = clean_baseline - corrupted_baseline
    return (logit_diff - corrupted_baseline) / baseline_gap


clean_baseline_ioi = ioi_metric(clean_logits, CLEAN_BASELINE, CORRUPTED_BASELINE)
corrupted_baseline_ioi = ioi_metric(corrupted_logits, CLEAN_BASELINE, CORRUPTED_BASELINE)

# Sanity check: the normalised metric should be 1 on clean and 0 on corrupted inputs.
print(f"Clean Baseline is 1: {clean_baseline_ioi.item():.4f}")
print(f"Corrupted Baseline is 0: {corrupted_baseline_ioi.item():.4f}")

Clean Baseline is 1: 1.0000
Corrupted Baseline is 0: 0.0000


In [13]:
# Whether to do the runs by head and by position, which are much slower.
# NOTE(review): no cell visible in this part of the notebook reads this flag — confirm it is still needed.
DO_SLOW_RUNS = True

### Path Patching

In [14]:
def patch_pos_head_vector(
    orig_head_vector: TT["batch", "pos", "head_index", "d_head"],
    hook,
    pos, 
    head_index, 
    patch_cache):
    """Hook: overwrite one head's activation at `pos` with the cached value.

    The replacement is read from `patch_cache[hook.name]` at the same
    position and head. The activation is modified in place and returned.
    """
    replacement = patch_cache[hook.name]
    orig_head_vector[:, pos, head_index, :] = replacement[:, pos, head_index, :]
    return orig_head_vector

def patch_head_vector(
    orig_head_vector: TT["batch", "pos", "head_index", "d_head"],
    hook,
    head_index, 
    patch_cache):
    """Hook: overwrite one head's activation at every position with the cached value.

    The replacement is read from `patch_cache[hook.name]` for the same head.
    The activation is modified in place and returned.
    """
    replacement = patch_cache[hook.name]
    orig_head_vector[:, :, head_index, :] = replacement[:, :, head_index, :]
    return orig_head_vector

In [15]:
def path_patching(
    model,
    patch_tokens,
    orig_tokens,
    sender_heads,
    receiver_hooks,
    positions=-1,
):
    """
    Build the hooks that patch in the effect of `sender_heads` on `receiver_hooks` only,
    following the path-patching procedure of https://arxiv.org/pdf/2211.00593.pdf
    (MLPs are not frozen here, so they are slight confounders - see Appendix B of the paper).

    Args:
        model: hooked model providing `run_with_cache`, `run_with_hooks`,
            `add_caching_hooks` and `cfg.n_layers` / `cfg.n_heads`.
        patch_tokens: input whose sender activations are patched in (forward pass A).
        orig_tokens: input whose activations everything else is frozen to (forward pass B).
        sender_heads: iterable of (layer, head_idx); a `head_idx` of None selects
            that layer's `hook_mlp_out` instead of an attention head.
        receiver_hooks: list of (hook_name, head_idx) pairs that are left unfrozen in pass C.
        positions: position index forwarded as `pos` to `patch_pos_head_vector`.

    Returns:
        A list of (hook_name, hook_fn) pairs. The caller runs the model with these
        hooks (forward pass D) to write the receiver activations recorded in pass C.

    TODO fix this: if max_layer < model.cfg.n_layers, then let some part of the model do computations (not frozen)
    NOTE(review): `freeze_mlps` and `max_layer`, mentioned in earlier notes on this
    function, are not parameters of this implementation.
    """

    # NOTE(review): this helper is never called in this function. It reads `.N` and
    # `.word_idx` from the token arguments, which suggests a dataset object, not a
    # plain token tensor — confirm whether it can be deleted.
    def patch_positions(z, source_act, hook, positions=["end"], verbose=False):
        for pos in positions:
            z[torch.arange(orig_tokens.N), orig_tokens.word_idx[pos]] = source_act[
                torch.arange(patch_tokens.N), patch_tokens.word_idx[pos]
            ]
        return z

    # process arguments
    # Translate each sender (layer, head) into the name of the hook point that holds its output.
    sender_hooks = []
    for layer, head_idx in sender_heads:
        if head_idx is None:
            sender_hooks.append((f"blocks.{layer}.hook_mlp_out", None))

        else:
            sender_hooks.append((f"blocks.{layer}.attn.hook_z", head_idx))

    # NOTE(review): `sender_hook_names` and `receiver_hook_heads` are not used below.
    sender_hook_names = [x[0] for x in sender_hooks]
    receiver_hook_names = [x[0] for x in receiver_hooks]
    receiver_hook_heads = [x[1] for x in receiver_hooks]
    # Forward pass A (in https://arxiv.org/pdf/2211.00593.pdf)
    source_logits, sender_cache = model.run_with_cache(patch_tokens)

    # Forward pass B
    target_logits, target_cache = model.run_with_cache(orig_tokens)

    # Forward pass C
    # Cache the receiver hooks
    # (adding these hooks first means we save values BEFORE they are overwritten)
    # NOTE(review): these caching hooks are not removed anywhere in this function;
    # confirm whether they are still active when the caller runs pass D, and
    # whether they accumulate across repeated calls.
    receiver_cache = model.add_caching_hooks(lambda x: x in receiver_hook_names)

    # "Freeze" intermediate heads to their orig_tokens values
    # q, k, and v will get frozen, and then if it's a sender head, this will get undone
    # z, attn_out, and the MLP will all be recomputed and added to the residual stream
    # however, the effect of the change on the residual stream will be overwritten by the
    # freezing for all non-receiver components
    pass_c_hooks = []
    for layer in range(model.cfg.n_layers):
        for head_idx in range(model.cfg.n_heads):
            for hook_template in [
                "blocks.{}.attn.hook_q",
                "blocks.{}.attn.hook_k",
                "blocks.{}.attn.hook_v",
            ]:
                hook_name = hook_template.format(layer)
                # Receivers are left unfrozen so the sender's effect can reach them.
                if (hook_name, head_idx) not in receiver_hooks:
                    #print(f"Freezing {hook_name}")
                    hook = partial(
                        patch_head_vector,
                        head_index=head_idx,
                        patch_cache=target_cache
                    )
                    pass_c_hooks.append((hook_name, hook))
                else:
                    pass
                    #print(f"Not freezing {hook_name}")

    # These hooks will overwrite the freezing, for the sender heads
    # We also carry out pass C
    for hook_name, head_idx in sender_hooks:
        # Fails when the sender activation is (numerically) identical on both inputs,
        # in which case patching it could not have any effect.
        assert not torch.allclose(sender_cache[hook_name], target_cache[hook_name]), (
            hook_name,
            head_idx,
        )
        # NOTE(review): for an MLP sender `head_idx` is None and is forwarded as
        # `head_index`; confirm the indexing in patch_pos_head_vector is valid for hook_mlp_out.
        hook = partial(
            patch_pos_head_vector,
            pos=positions,
            head_index=head_idx,
            patch_cache=sender_cache
        )
        pass_c_hooks.append((hook_name, hook))

    # The logits of pass C are not used; the run exists to fill `receiver_cache`.
    receiver_logits = model.run_with_hooks(orig_tokens, fwd_hooks=pass_c_hooks)
    # Add (or return) all the hooks needed for forward pass D
    pass_d_hooks = []

    for hook_name, head_idx in receiver_hooks:
        #for pos in positions:
            # if torch.allclose(
            #     receiver_cache[hook_name][torch.arange(orig_tokens.N), orig_tokens.word_idx[pos]],
            #     target_cache[hook_name][torch.arange(orig_tokens.N), orig_tokens.word_idx[pos]],
            # ):
            #     warnings.warn("Torch all close for {}".format(hook_name))
        # Each pass-D hook writes the receiver activation recorded during pass C.
        hook = partial(
            patch_pos_head_vector,
            pos=positions,
            head_index=head_idx,
            patch_cache=receiver_cache
        )
        pass_d_hooks.append((hook_name, hook))

    return pass_d_hooks
    

### Circuit Component Evaluation

#### NMHs: Copy Score

In [None]:
def check_copy_circuit(model, layer, head, clean_tokens, verbose=False, neg=False):
    """Compute a top-k "copy score" for one attention head.

    The layer-0 residual stream is pushed through the head's value and output
    weights and then unembedded; the score is the percentage of IO / S / S2
    positions whose top-k decoded tokens contain the name at that position.

    Args:
        model: hooked model exposing `run_with_cache`, `blocks`, `ln_final`,
            `unembed` and `tokenizer`.
        layer, head: indices of the attention head to score.
        clean_tokens: input the residual stream is collected on.
        verbose: not used by this implementation.
        neg: when True the head's output is negated before unembedding.

    Returns:
        The top-k accuracy as a percentage (it is also printed).

    NOTE(review): this function reads a global `ioi_dataset` (`.word_idx`, `.N`)
    that is not defined in the cells visible here, and it indexes each entry of
    the global `prompts` with a string key; in the Data Setup cell `prompts` is
    a list of plain strings, for which that indexing raises TypeError. The cell
    has no execution count — confirm the intended dataset object before use.
    """
    
    # Collect the cached activations for the residual stream after layer 0
    original_logits, cache = model.run_with_cache(clean_tokens)
    if neg:
        sign = -1
    else:
        sign = 1
    # The cached residual stream is normalised with block 1's first layer norm.
    z_0 = model.blocks[1].ln1(cache["blocks.0.hook_resid_post"])

    # Multiply the residual stream after the first layer with the value matrix in
    # the given layer and head
    v = torch.einsum("eab,bc->eac", z_0, model.blocks[layer].attn.W_V[head])
    v += model.blocks[layer].attn.b_V[head].unsqueeze(0).unsqueeze(0)

    # Further multiply resulting V with the O weights
    o = sign * torch.einsum("sph,hd->spd", v, model.blocks[layer].attn.W_O[head])
    
    # Turn output into logits
    logits = model.unembed(model.ln_final(o))

    # Size of the top-k window and running count of hits.
    k = 5
    n_right = 0

    # Here, we check if the top k predictions for the IO, S, and S2 tokens
    # contain the correct token
    for seq_idx, prompt in enumerate(prompts):
        # Check if the top k predictions for the IO, S, and S2 tokens
        # contain the correct token
        for word in ["IO", "S", "S2"]:
            # Get the top k predictions for the word
            pred_tokens = [
                model.tokenizer.decode(token)
                for token in torch.topk(
                    logits[seq_idx, ioi_dataset.word_idx[word][seq_idx]], k
                ).indices
            ]
            # "S" and "S2" both look up the "S" entry of the prompt.
            if "S" in word:
                name = "S"
            else:
                name = word
            if " " + prompt[name] in pred_tokens:
                n_right += 1
            else:
                pass
                
    # Three positions (IO, S, S2) are checked per prompt.
    percent_right = (n_right / (ioi_dataset.N * 3)) * 100
    print(
        f"Copy circuit for head {layer}.{head} (sign={sign}) : Top {k} accuracy: {percent_right}%"
    )
    return percent_right

## Model Setup

In [16]:
def load_model(model_hf_name, model_tl_name, revision, cache_dir=None):
    """Load one revision of a Hugging Face model into a HookedTransformer.

    Args:
        model_hf_name: Hugging Face repository to download the weights from.
        model_tl_name: model name passed to `HookedTransformer.from_pretrained`.
        revision: repository revision to load (the caller passes e.g. "step1000").
        cache_dir: download cache directory. When None, a per-revision
            directory under a fixed local path is used.

    Returns:
        The HookedTransformer, built with `center_unembed`,
        `center_writing_weights` and `fold_ln` enabled.
    """
    if cache_dir is None:
        # NOTE: this default is an absolute path from the original author's machine;
        # pass `cache_dir` explicitly when running anywhere else.
        cache_dir = f"/media/curttigges/project-files/projects/circuits/pythia_model/{revision}"

    # Download model from HuggingFace
    source_model = AutoModelForCausalLM.from_pretrained(
        model_hf_name,
        revision=revision,
        cache_dir=cache_dir
    )

    # Load model into TransformerLens
    model = HookedTransformer.from_pretrained(
        model_tl_name,
        hf_model=source_model,
        center_unembed=True,
        center_writing_weights=True,
        fold_ln=True,
    )

    return model

## Circuit Metrics Over Time

In [17]:
def get_path_patching_results(model, clean_baseline, corrupted_baseline, receiver_heads, receiver_type="hook_q", sender_heads=None, position=-1):
    """Path-patch every head as a sender into a fixed set of receiver heads.

    For each (layer, head) in the model, the head is patched from
    `corrupted_tokens` into a `clean_tokens` run along the paths to the
    receivers, and the relative change of the IOI metric is recorded.

    Args:
        model: hooked model to patch.
        clean_baseline, corrupted_baseline: baselines forwarded to `ioi_metric`.
        receiver_heads: iterable of (layer, head) pairs acting as receivers.
        receiver_type: attention hook suffix used to build the receiver hook names.
        sender_heads: currently unused; every head in the model is tried as a sender.
        position: position index forwarded to `path_patching`.

    Returns:
        A tensor of shape (n_layers, n_heads) on the model's device, where entry
        [layer, head] is `-(clean_baseline_ioi - metric) / clean_baseline_ioi`.
    """
    # Allocate on the model's device, not a hard-coded 'cuda:0', so this also
    # runs on CPU-only machines and on other GPUs.
    metric_delta_results = torch.zeros(model.cfg.n_layers, model.cfg.n_heads, device=model.cfg.device)

    # The receiver hooks do not depend on the sender, so build them once. Distinct
    # variable names avoid shadowing the sender loop's `head_idx`.
    receiver_hooks = [
        (f"blocks.{receiver_layer}.attn.{receiver_type}", receiver_head)
        for receiver_layer, receiver_head in receiver_heads
    ]

    for layer in range(model.cfg.n_layers):
        for head_idx in range(model.cfg.n_heads):
            pass_d_hooks = path_patching(
                model=model,
                patch_tokens=corrupted_tokens,
                orig_tokens=clean_tokens,
                sender_heads=[(layer, head_idx)],
                receiver_hooks=receiver_hooks,
                positions=position
            )
            # Forward pass D: rerun on the clean input with the receiver inputs patched.
            path_patched_logits = model.run_with_hooks(clean_tokens, fwd_hooks=pass_d_hooks)
            iot_metric_res = ioi_metric(path_patched_logits, clean_baseline, corrupted_baseline)
            metric_delta_results[layer, head_idx] = -(clean_baseline_ioi - iot_metric_res) / clean_baseline_ioi
    return metric_delta_results

In [18]:
def ablate_top_head_hook(z: TT["batch", "pos", "head_index", "d_head"], hook, head_idx=0):
    """Hook: zero-ablate head `head_idx` at the final sequence position, in place, and return `z`."""
    # Only position -1 is zeroed; the head's output at every other position is untouched.
    z[:, -1, head_idx, :] = 0
    return z

In [19]:
def get_knockout_perf_drop(model, heads_to_ablate, clean_baseline, corrupted_baseline):
    """IOI metric on the clean prompts with the given heads zero-ablated.

    Args:
        model: hooked model to evaluate.
        heads_to_ablate: iterable of (layer, head) pairs whose `hook_z` output
            is zeroed at the final position (see `ablate_top_head_hook`).
        clean_baseline, corrupted_baseline: baselines forwarded to `ioi_metric`.

    Returns:
        The IOI metric of the ablated run.
    """
    ablation_hooks = [
        (f"blocks.{layer}.attn.hook_z", partial(ablate_top_head_hook, head_idx=head))
        for layer, head in heads_to_ablate
    ]

    # Pass the hooks to `run_with_hooks` so they apply to this forward pass only.
    # Registering them directly on the hook points (as before) left them attached
    # to the model, so the ablation leaked into every later run on it.
    ablated_logits = model.run_with_hooks(clean_tokens, fwd_hooks=ablation_hooks)

    return ioi_metric(ablated_logits, clean_baseline=clean_baseline, corrupted_baseline=corrupted_baseline)

In [20]:
def get_chronological_circuit_data(
    model_hf_name, 
    model_tl_name, 
    start_ckpt, 
    end_ckpt, 
    ckpt_interval,
    metric, 
    circuit
):  
    """Track task performance and circuit-component metrics across training checkpoints.

    Args:
        model_hf_name: Hugging Face repository holding the checkpoints.
        model_tl_name: model name passed to TransformerLens by `load_model`.
        start_ckpt, end_ckpt, ckpt_interval: currently unused. The checkpoint
            schedule is fixed below: steps 1, 2, 4, ..., 512, then every
            1000 steps from 1000 to 143000.
        metric: callable taking logits plus `clean_baseline` /
            `corrupted_baseline` keyword arguments (e.g. `ioi_metric`).
        circuit: dict mapping a component name to an object with `heads`,
            `position` and `receiver_type` attributes.

    Returns:
        A 5-tuple:
        - tensor of the metric on clean prompts per checkpoint, normalised by
          the module-level CLEAN_BASELINE / CORRUPTED_BASELINE;
        - attention-pattern patching results stacked along a new last axis;
        - value patching results stacked along a new last axis;
        - dict: component name -> list of path-patching result tensors;
        - dict: component name -> list of knockout metric values.
    """
    metric_vals = []
    attn_head_vals = []
    value_patch_vals = []
    circuit_vals = {key: [] for key in circuit}
    knockout_drops = {key: [] for key in circuit}

    # Powers of 2 below 1000, then every 1000 steps up to 143000
    ckpts = [2 ** i for i in range(10)] + [i * 1000 for i in range(1, 144)]
    for ckpt in ckpts:

        # Get model
        model = load_model(model_hf_name, model_tl_name, f"step{ckpt}")

        # Get metric values (relative to final performance)
        clean_logits, clean_cache = model.run_with_cache(clean_tokens)
        corrupted_logits, _ = model.run_with_cache(corrupted_tokens)

        clean_logit_diff = get_logit_diff(clean_logits, answer_token_indices).item()
        corrupted_logit_diff = get_logit_diff(corrupted_logits, answer_token_indices).item()

        # Two differently normalised views of the metric; binding them to their own
        # names keeps the `metric` argument itself untouched across iterations.
        final_relative_metric = partial(metric, clean_baseline=CLEAN_BASELINE, corrupted_baseline=CORRUPTED_BASELINE)
        metric_vals.append(final_relative_metric(clean_logits))

        ckpt_relative_metric = partial(metric, clean_baseline=clean_logit_diff, corrupted_baseline=corrupted_logit_diff)

        # Get attention pattern patching metrics
        attn_head_vals.append(
            patching.get_act_patch_attn_head_pattern_all_pos(model, corrupted_tokens, clean_cache, ckpt_relative_metric)
        )

        # Get value patching metrics
        value_patch_vals.append(
            patching.get_act_patch_attn_head_v_all_pos(model, corrupted_tokens, clean_cache, ckpt_relative_metric)
        )

        # Get path patching metrics for specific circuit parts
        for key, component in circuit.items():
            # Start every component from a hook-free model: hooks left attached by an
            # earlier step (e.g. a previous component's knockout ablation) would
            # otherwise silently alter this component's measurements.
            model.reset_hooks()

            # Get path patching results
            circuit_vals[key].append(
                get_path_patching_results(
                    model,
                    clean_logit_diff,
                    corrupted_logit_diff,
                    component.heads,
                    receiver_type=component.receiver_type,
                    position=component.position,
                )
            )

            # Remove any hooks path patching left behind before measuring the knockout
            model.reset_hooks()

            # Get knockout performance drop
            knockout_drops[key].append(
                get_knockout_perf_drop(model, component.heads, clean_logit_diff, corrupted_logit_diff)
            )

        # Leave the model clean after the last component as well
        model.reset_hooks()

    return torch.tensor(metric_vals), torch.stack(attn_head_vals, dim=-1), torch.stack(value_patch_vals, dim=-1), circuit_vals, knockout_drops


In [160]:
# Collect overall, attention-pattern, value, path-patching and knockout metrics per checkpoint.
# NOTE(review): get_chronological_circuit_data iterates over its own hard-coded checkpoint
# list, so start_ckpt / end_ckpt / ckpt_interval below do not limit which checkpoints are loaded.
overall_perf, attn_head_perf, value_perf, circuit_vals, knockout_drops = get_chronological_circuit_data(
    "EleutherAI/pythia-v1.1-160m",
    "EleutherAI/pythia-125m",
    start_ckpt=1000,
    end_ckpt=50000,
    ckpt_interval=1000,
    metric=ioi_metric,
    circuit=circuit
    )

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)step3000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)step5000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)step6000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)step7000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)step9000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep10000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep11000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep12000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep13000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep14000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep15000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep17000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep18000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep19000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep20000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep21000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep22000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep23000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep24000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep25000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep26000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep27000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep28000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep29000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep30000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep31000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep32000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep34000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep35000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep36000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep37000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep38000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep39000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep40000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep41000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep42000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep43000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep44000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep45000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep46000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep47000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep48000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep49000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep50000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep51000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep52000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep53000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep54000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep55000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep56000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep57000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep58000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep59000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep60000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep61000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep62000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep63000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep64000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep65000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep67000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep68000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep69000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep70000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep71000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep72000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep73000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep74000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep75000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep76000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep77000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep78000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep79000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep80000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep81000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep82000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep83000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep84000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep85000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep86000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep87000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep88000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep89000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep90000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep91000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep92000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep93000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep94000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep95000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep96000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep97000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep98000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)tep99000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep100000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep101000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep102000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep103000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep104000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep105000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep106000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep107000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep108000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep109000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep110000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep111000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep112000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep113000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep114000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep115000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep116000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep117000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep118000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep119000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep120000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep121000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep122000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep123000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep124000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep125000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep126000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep127000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep128000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep129000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep130000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep132000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep133000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep134000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep135000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep136000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep137000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep138000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep139000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep140000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep141000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep142000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Downloading (…)ep143000/config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/375M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-125m into HookedTransformer


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

In [21]:
# Training steps of the checkpoints to analyse: the powers of two 1, 2, ..., 512,
# followed by every 1000th step from 1000 through 143000 (153 entries in total).
ckpts = [1 << exponent for exponent in range(10)]
ckpts.extend(range(1000, 144000, 1000))

In [162]:
import os
import pickle

# Persist the computed results to disk so they can be reloaded later without
# recomputing them.
#
# Create the target directory first: neither torch.save nor open(..., "wb")
# creates missing parent directories, so on a fresh checkout the writes below
# would otherwise fail. exist_ok=True keeps the cell safe to re-run.
os.makedirs("results/pythia-160m-v1.1-no-dropout", exist_ok=True)

# Results serialized with torch.save.
torch.save(overall_perf, "results/pythia-160m-v1.1-no-dropout/overall_perf.pt")
torch.save(value_perf, "results/pythia-160m-v1.1-no-dropout/value_perf.pt")
torch.save(attn_head_perf, "results/pythia-160m-v1.1-no-dropout/attn_head_perf.pt")

# Results serialized with pickle (both are indexed by component name, e.g.
# "name-movers" / "s2-inhibition", in the Results section).
with open("results/pythia-160m-v1.1-no-dropout/circuit_vals.pkl", "wb") as f:
    pickle.dump(circuit_vals, f)
with open("results/pythia-160m-v1.1-no-dropout/knockout_drops.pkl", "wb") as f:
    pickle.dump(knockout_drops, f)

## Results

In [22]:
import pickle

# Reload the previously saved results from disk. The five loads are independent
# of one another.
# NOTE: pickle.load can execute arbitrary code; only load files produced by a
# trusted run of this notebook.
with open("results/pythia-160m-v1.1-no-dropout/knockout_drops.pkl", "rb") as drops_file:
    knockout_drops = pickle.load(drops_file)
with open("results/pythia-160m-v1.1-no-dropout/circuit_vals.pkl", "rb") as vals_file:
    circuit_vals = pickle.load(vals_file)

# Objects that were written with torch.save.
value_perf = torch.load("results/pythia-160m-v1.1-no-dropout/value_perf.pt")
attn_head_perf = torch.load("results/pythia-160m-v1.1-no-dropout/attn_head_perf.pt")
overall_perf = torch.load("results/pythia-160m-v1.1-no-dropout/overall_perf.pt")

In [23]:
# Spot-check the checkpoint list: index 15 corresponds to training step 6000.
ckpts[15]

6000

### Overall Performance

In [24]:
# Overall IOI metric per checkpoint, with the checkpoint steps on a log-scaled x axis.
line(
    overall_perf,
    title="IOI Metric Performance Over Training Time (Log Scale)",
    x=ckpts,
    log_x=True,
)

### Name Mover Performance

In [25]:
# Stack the per-checkpoint name-mover knockout results into one tensor, move it
# to the CPU, and plot it against the checkpoint steps (log-scaled x axis).
line(
    torch.stack(knockout_drops['name-movers']).cpu(),
    title="IOI Metric Performance After Name Mover Knockout Relative to Step Performance (Log Scale)",
    x=ckpts,
    log_x=True,
)

In [30]:
# Interactive view of attn_head_perf (rendered with a "Slice" slider in the
# saved output). zmin/zmax presumably bound the color range; confirm against
# visualize_tensor's definition.
visualize_tensor(
    attn_head_perf.cpu(),
    ckpts,
    zmin=-0.5,
    zmax=0.5,
)

IntSlider(value=0, description='Slice:', max=152)

Output()

### S2-Inhibition Performance

In [26]:
# Stack the per-checkpoint S2-inhibition knockout results into one tensor, move
# it to the CPU, and plot it against the checkpoint steps.
# NOTE(review): `l_line` is not defined in the visible part of this notebook, while
# the sibling plots call `line` with the same arguments; confirm this is an
# existing helper and not a typo for `line`.
l_line(
    torch.stack(knockout_drops['s2-inhibition']).cpu(),
    title="IOI Metric Performance After S2-Inhibition Head Knockout Relative to Step Performance (Log Scale)",
    x=ckpts,
    log_x=True,
)

In [28]:
# Interactive view of value_perf, passed together with the checkpoint steps.
# zmin/zmax presumably bound the color range; confirm against
# visualize_tensor's definition.
visualize_tensor(
    value_perf.cpu(),
    ckpts,
    zmin=-0.5,
    zmax=0.5,
)

In [29]:
# Stack the name-mover circuit values along a new trailing axis (one entry per
# stacked element), move the result to the CPU, and browse it interactively.
# NOTE(review): this name-mover view sits under the "S2-Inhibition Performance"
# heading; confirm the cell is in the intended section.
visualize_tensor(
    torch.stack(circuit_vals["name-movers"], dim=-1).cpu(),
    ckpts,
    zmin=-0.4,
    zmax=0.4,
)

IntSlider(value=0, description='Slice:', max=152)

Output()

### Duplicate Name & Induction Heads

In [171]:
# Stack the S2-inhibition circuit values along a new trailing axis (one entry
# per stacked element), move the result to the CPU, and browse it interactively.
# NOTE(review): this S2-inhibition view sits under the "Duplicate Name & Induction
# Heads" heading; confirm the cell is in the intended section.
visualize_tensor(
    torch.stack(circuit_vals["s2-inhibition"], dim=-1).cpu(),
    ckpts,
    zmin=-0.15,
    zmax=0.15,
)

IntSlider(value=0, description='Slice:', max=152)

Output()