In [10]:
import torch
from tqdm.notebook import tqdm
import gc

from aelib import ActivationsBuffer, ActivationsBufferConfig
from aelib.multilayer import AutoEncoderMultiLayerConfig, AutoEncoderMultiLayerTrainer, \
    AutoEncoderMultiLayerTrainerConfig
from aelib.sweeper import AutoEncoderSweeper, AutoEncoderSweeperConfig

In [2]:
version = 1
expansion_factor = 32
wb_project = "multilayer_sae"
wb_entity = "collingray"
wb_name = f"pythia_wikitext_{expansion_factor}x_v{version}"
wb_group = "pythia_wikitext"

model_name = "EleutherAI/pythia-70m"
n_layers = 6
n_dim = 512
dataset_name = "wikitext"
dataset_config = "wikitext-103-v1"
dataset_split = "train"

In [3]:
seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x7d65641b2690>

In [4]:
%env TOKENIZERS_PARALLELISM=false
%env WANDB_SILENT=true

env: TOKENIZERS_PARALLELISM=false
env: WANDB_SILENT=true


In [5]:
layers = list(range(n_layers))

# Buffer to easily generate/store activations from the model, automatically refills the buffer when it drops below a certain size,
# and then shuffles it to avoid activations from the same sequence being used together
buffer_cfg = ActivationsBufferConfig(
    model_name=model_name,
    layers=layers,
    dataset_name=dataset_name,
    dataset_config=dataset_config,
    dataset_split=dataset_split,
    act_site="hook_mlp_out",
    device="cuda",
    buffer_device="cpu",
    buffer_size=2 ** 21,
    min_capacity=2 ** 17,
    model_batch_size=2048,
    samples_per_seq=256,
    max_seq_length=256,
    seed=seed
)

buffer = ActivationsBuffer(buffer_cfg)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


In [6]:
total_activations = int(5e8)
batch_size = 512

autoencoder_cfg = AutoEncoderMultiLayerConfig(
    n_dim=n_dim,
    m_dim=n_dim * expansion_factor,
    act_norms=n_layers,
    act_renorm_type="none",
    record_data=True,
    save_dir="./weights",
    seed=seed,
)

autoencoder_trainer_cfg = AutoEncoderMultiLayerTrainerConfig(
    lr=4e-3,
    beta1=0.9,
    beta2=0.999,
    l1_weight=1,
    total_steps=total_activations // batch_size,
    lr_warmup_pct=0,
    lr_decay_pct=0.2,
    l1_warmup_pct=None,
    steps_per_report=2 ** 20 // batch_size,
    decoder_norm_scale=True,
    wb_project=wb_project,
    wb_entity=wb_entity,
    wb_name=wb_name,
    wb_group=wb_group,
    wb_config=autoencoder_cfg.__dict__,
)

autoencoder_trainer = AutoEncoderMultiLayerTrainer(autoencoder_cfg, autoencoder_trainer_cfg)

In [7]:
step = 0

In [None]:
try:
    for s in tqdm(range(total_activations // batch_size), initial=step, total=total_activations // batch_size):
        acts = buffer.next(batch=batch_size).to(autoencoder_cfg.device, dtype=autoencoder_cfg.dtype)
        autoencoder_trainer.train_on(acts, buffer)
finally:
    step = s    

  0%|          | 0/976562 [00:00<?, ?it/s]

In [None]:
sweeper_cfg = AutoEncoderSweeperConfig(
    n_dim=n_dim,
    m_dim=n_dim * expansion_factor,
    lr=4e-3,
    beta1=0.9,
    beta2=0.999,
    l1_weight=1,
    lr_warmup_pct=0,
    lr_decay_pct=0.2,
    l1_warmup_pct=None,
    
    act_norms=n_layers,
    act_renorm_type="none",

NameError: name 's' is not defined