In [10]:
import sys
sys.path.append('../src/')

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import torch
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
from tqdm import tqdm
from sae_model import SparseAutoencoder
import json
import os

# Directory holding the trained checkpoints, relative to this notebook.
models_dir = "../models/"

# Load model and tokenizer.
# Both are resolved from the single `model_name` so the model and its
# tokenizer cannot drift apart if the name is changed.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name, attn_implementation="eager")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set to evaluation mode (turns dropout off). Kept as the last expression so
# the cell still displays the module tree.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [16]:
# Index of the transformer block whose output is captured.
LAYER_NUM = 7

# Hook setup
# Captured outputs, keyed by the name passed to get_activation().
activations = {}


def get_activation(name):
    """Build a forward hook that records a module's output under ``name``.

    Args:
        name: Key under which the captured tensor is stored in the
            module-level ``activations`` dict.

    Returns:
        A ``hook(module, inputs, output)`` callable suitable for
        ``register_forward_hook``. Each forward pass overwrites the previous
        entry for ``name`` with the detached output.
    """
    def hook(module, inputs, output):
        # Accept both a tuple/list output (hidden states first) and a bare
        # tensor output; indexing a bare tensor with [0] would silently drop
        # its leading dimension instead of selecting the hidden states.
        hidden = output[0] if isinstance(output, (tuple, list)) else output
        activations[name] = hidden.detach()
    return hook


# Re-running this cell must not stack hooks: remove the one registered by a
# previous run (possibly on a different layer) before registering a new one.
_previous_handle = globals().get("handle")
if _previous_handle is not None:
    _previous_handle.remove()

handle = model.transformer.h[LAYER_NUM].register_forward_hook(
    get_activation(f'layer_{LAYER_NUM}')
)

In [11]:
# Restore the trained sparse autoencoder from disk, mapping tensors to CPU.
# NOTE(review): torch.load may unpickle arbitrary objects; only load
# checkpoints that come from a trusted source.
checkpoint_path = os.path.join(models_dir, "sae_topk_final.pt")
checkpoint = torch.load(checkpoint_path, map_location='cpu')

if 'config' not in checkpoint:
    # No hyperparameters were stored with the weights: fall back to the
    # hard-coded defaults.
    sae = SparseAutoencoder(input_dim=768, hidden_dim=3840, sparsity_coef=64)
    # The file is either a raw state dict (recognised by its
    # 'encoder.weight' entry) or a wrapper holding one under
    # 'model_state_dict'.
    if isinstance(checkpoint, dict) and 'encoder.weight' in checkpoint:
        state_dict = checkpoint
    else:
        state_dict = checkpoint['model_state_dict']
else:
    # Rebuild the autoencoder from the hyperparameters saved with it.
    config = checkpoint['config']
    sae_kwargs = {
        'input_dim': config['input_dim'],
        'hidden_dim': config['hidden_dim'],
        # k is the number of active features
        'sparsity_coef': config['k'],
        'normalize_eps': config.get('normalize_eps', 1e-6),
    }
    sae = SparseAutoencoder(**sae_kwargs)
    state_dict = checkpoint['model_state_dict']

sae.load_state_dict(state_dict)

# Evaluation mode; as the last expression this also displays the module.
sae.eval()

SparseAutoencoder(
  (encoder): Linear(in_features=768, out_features=3840, bias=True)
  (decoder): Linear(in_features=3840, out_features=768, bias=False)
)

In [12]:
# Sample sentence used to eyeball the sparse autoencoder's behaviour.
text = "Testing visually to see if the sparse encoder actually works"

# Encode as PyTorch tensors, truncating anything longer than 128 tokens.
inputs = tokenizer(
    text,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)


In [17]:
# Inference only: run the forward pass without autograd so no computation
# graph is built or kept alive by `outputs`. The forward hook fills
# `activations` as a side effect of this call.
with torch.no_grad():
    outputs = model(**inputs)

# Select the first (only) sequence in the batch from the captured layer output.
act = activations[f'layer_{LAYER_NUM}'][0]  # [seq_len, 768]

In [19]:
# Sanity check: display the captured activation's shape, [seq_len, 768].
act.shape

torch.Size([11, 768])