In [1]:
!pip install torch
!pip install huggingface_hub

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.8.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting filelock (from torch)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.me

In [1]:
from sae import load_sae
from huggingface_hub import hf_hub_download
import torch

file_path = hf_hub_download(
    repo_id="Goodfire/Hackathon-gpt-oss-20b-SAE-l15",
    filename="topk_sae_l15_exp16_k32.pt",
    repo_type="model"
)

device = "cuda" if torch.cuda.is_available() else "cpu"
sae = load_sae(file_path, device, torch.float32)

In [4]:
sae

TopKSAE(
  (encoder_linear): Linear(in_features=2880, out_features=46080, bias=False)
  (decoder): Embedding(46080, 2880)
)

In [8]:
x = torch.randn(1, 2880).to(device)
f = sae.encode(x)
print(f)

x_hat = sae.decode(f)
print(x_hat.shape)

torch.return_types.topk(
values=tensor([[4.2541, 4.1714, 3.8353, 3.5973, 3.5724, 3.5609, 3.4909, 3.4741, 3.4395,
         3.4331, 3.4131, 3.4072, 3.3891, 3.3821, 3.3126, 3.3058, 3.2795, 3.2595,
         3.2311, 3.2204, 3.2190, 3.1510, 3.1376, 3.1132, 3.1037, 3.0554, 3.0333,
         3.0318, 3.0195, 3.0117, 3.0117, 2.9994]], device='cuda:0',
       grad_fn=<TopkBackward0>),
indices=tensor([[28863, 18118,  8643, 18829, 32025, 30785, 24263, 15114, 28406, 14496,
         20061,  5210, 34951, 35550, 12878, 14339, 32027, 42351, 17683, 18138,
         37700,  4499, 34537, 21550, 18191,  9191, 38523, 16228, 11765, 14591,
         30525, 27080]], device='cuda:0'))
torch.Size([1, 2880])


# Using with gpt-oss 20b

In [6]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name ="openai/gpt-oss-20b"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    attn_implementation="kernels-community/vllm-flash-attn3",
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)



Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Prompt
prompt = "Life is a marathon, not a sprint."

# Tokenize
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
enc = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
enc = {k: v.to(model.device) for k, v in enc.items()}
str_tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])

# Grab the transformer layers container and pick layer 15 (0-indexed)
def get_layers_container(m):
    for path in [
        "model.gpt_neox.layers", "gpt_neox.layers", "model.layers",
        "model.transformer.h", "transformer.h", "model.decoder.layers", "decoder.layers",
    ]:
        cur = m
        ok = True
        for p in path.split("."):
            if not hasattr(cur, p):
                ok = False; break
            cur = getattr(cur, p)
        if ok and isinstance(cur, (torch.nn.ModuleList, list)):
            return cur
    raise RuntimeError("layers container not found")

layers = get_layers_container(model)
target_layer = layers[15]

# Capture activations at layer 15
captures = {}
def hook_fn(module, inputs, output):
    hs = output[0] if isinstance(output, tuple) else output
    captures["acts"] = hs.detach()

h = target_layer.register_forward_hook(hook_fn)
with torch.no_grad():
    _ = model(**enc)
h.remove()

# Pass through SAE and show active features for each token
acts = captures["acts"]
B, T, H = acts.shape
flat = acts.reshape(B * T, H).to(dtype=torch.float32, device=sae.decoder.weight.device)

with torch.no_grad():
    topk = sae.encode(flat)  # returns torch.return_types.topk with .values and .indices

idx = topk.indices.reshape(B, T, -1)[0].cpu()
val = topk.values.reshape(B, T, -1)[0].cpu()

# Feature 1340 activates on the token 'marathon' in the context of "marathon, not a sprint"
for t, tok in enumerate(str_tokens):
    feats = list(zip(idx[t].tolist(), [round(v, 3) for v in val[t].tolist()]))
    print(f"{t:02d} {tok}: {feats}")

00 Life: [(31259, 15505.759), (40696, 9913.289), (46030, 9502.822), (31435, 7052.571), (24613, 6508.846), (11636, 6375.023), (5005, 6370.8), (21550, 6172.671), (26632, 6044.3), (25192, 5627.713), (37743, 5416.535), (16444, 5147.6), (23035, 5097.8), (26550, 4979.968), (2560, 4919.771), (3813, 4912.863), (44863, 4894.991), (37970, 4835.599), (3725, 4807.5), (12883, 4713.53), (2512, 4691.869), (7045, 4686.243), (20137, 4683.039), (38371, 4621.383), (12531, 4611.111), (13284, 4561.846), (14634, 4561.51), (43669, 4551.993), (14926, 4486.74), (10362, 4437.937), (43273, 4290.663), (25254, 4270.169)]
01 Ġis: [(6991, 2341.818), (4576, 814.181), (37972, 432.386), (18118, 427.344), (46030, 406.089), (21909, 305.816), (34689, 269.797), (10856, 262.572), (1687, 259.726), (28731, 255.91), (24657, 240.407), (31259, 230.828), (14679, 218.373), (17495, 189.278), (21952, 188.938), (20557, 188.557), (37880, 182.809), (34312, 179.551), (2901, 176.749), (37700, 171.254), (9545, 168.087), (20111, 164.419), 