In [None]:
import gc
from pathlib import Path
import pickle
import sys
import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)

# The project root is taken to be the parent of the current working directory.
# Putting it first on sys.path makes the local `src` package importable.
project_root = str(Path().resolve().parent)
sys.path.insert(0, project_root)

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from src.prep.embed import embed
from src.prep.tokenization import tokenize

In [None]:
# --- Model and data locations -------------------------------------------------
MODEL_ID = "AlphaGaO/Qwen3-4B-GPTQ"
DATASET_PATH = f"{project_root}/data/dataset.csv"
TOKENS_PATH = f"{project_root}/data/tokens.pkl"
# Deliberately a plain template, not an f-string: it is filled in later with
# str.format(project_root=..., num=...).
HIDDEN_STATES_PATH = "{project_root}/data/activations/layer{num}.pkl"

# --- Extraction settings ------------------------------------------------------
MAX_LEN = 768
# NOTE(review): the name is misspelled ("CAHCE"); kept as-is because the
# embedding cell refers to it by this exact name.
KV_CAHCE_SIZE = 896
LAYERS = [35, 36]

# --- Device selection ---------------------------------------------------------
# Use CUDA when it is available; otherwise fall back to CPU with "auto" mapping.
DEVICE, MAPPING = (
    (torch.device("cuda"), "cuda:0")
    if torch.cuda.is_available()
    else (torch.device("cpu"), "auto")
)

print("{} device is available".format(DEVICE))

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# FlashAttention-2 kernels only run on CUDA devices. The config cell allows a
# CPU fallback, so pick PyTorch's built-in SDPA attention in that case instead
# of failing at load time.
attn_implementation = "flash_attention_2" if DEVICE.type == "cuda" else "sdpa"

# Load the checkpoint in half precision, switch to inference mode and make sure
# it sits on the selected device.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=MAPPING,
    dtype=torch.float16,
    attn_implementation=attn_implementation,
).eval().to(DEVICE)

# Prompts are read from the `prompt` column of the dataset, in file order.
texts = pd.read_csv(DATASET_PATH).prompt.tolist()

In [None]:
# Delegate to the project's `embed` helper. Later cells iterate the result once
# per text and index each item by layer number.
hidden_states = embed(
    texts,
    model,
    tokenizer,
    layers=LAYERS,
    kv_cache_size=KV_CAHCE_SIZE,
    max_length=MAX_LEN,
)

Transform the cumulative embeddings (activations + residual connections) of successive layers into pure activation increments: subtract the previous layer's hidden state from each layer's hidden state, and discard the first layer in the list, since it has no predecessor to subtract.

In [None]:
# Per-layer increments: for every text, subtract each layer's hidden state from
# that of the next layer in LAYERS (both cast to float16 first). The first layer
# has no predecessor and therefore yields no increment.
#
# The result is a flat list ordered text by text, then by layer pair, exactly
# as the nested-loop formulation would produce it. A comprehension is used so
# that no loop temporaries (the last text's tensors) stay bound in the notebook
# namespace, where they would survive the later `del hidden_states`.
activations = [
    text_result[current_layer].to(torch.float16)
    - text_result[prev_layer].to(torch.float16)
    for text_result in hidden_states
    for prev_layer, current_layer in zip(LAYERS, LAYERS[1:])
]

Release the memory held by the two layers of cumulative hidden states, so that the overhead of saving the activations does not cause OOM errors and, more generally, to reduce memory usage.

In [None]:
# Unbind the notebook-level name so the per-layer hidden states become
# eligible for collection once nothing else refers to them.
del hidden_states

# Run a garbage-collection pass right away; the value it returns is displayed
# as the cell output.
gc.collect()

Save the obtained activations (hidden-state increments) with pickle, so that exactly the current data structure can be restored later.

In [None]:
# Name the file after the layer whose increment was computed, i.e. the last
# entry of LAYERS, so the file name stays in sync with the configuration.
activations_path = Path(
    HIDDEN_STATES_PATH.format(project_root=project_root, num=LAYERS[-1])
)

# Create the target directory on first run. Otherwise open() raises
# FileNotFoundError only after the expensive extraction has finished.
activations_path.parent.mkdir(parents=True, exist_ok=True)

with activations_path.open('wb') as f:
    pickle.dump(activations, f)

Release memory again, but now only from activations of a single layer

In [None]:
# Unbind the list of activation tensors now that the previous cell has written
# it to disk.
del activations

# Run a garbage-collection pass right away; the value it returns is displayed
# as the cell output.
gc.collect()

Split prompts into human-readable tokens using multiprocessing (physical cores only)

In [None]:
# Tokenize all prompts with the project's `tokenize` helper.
# NOTE(review): batch_size and num_workers are hard-coded; per the accompanying
# note, num_workers=6 is meant to match the physical core count. Confirm for
# the machine at hand.
tokens = tokenize(
    texts,
    tokenizer,
    batch_size=4342,
    on_error='keep',
    show_progress=True,
    num_workers=6,
)

As we can see, **~700,000 tokens** were generated.

---

On the one hand, this is very little for training Sparse Autoencoders, since **billions or even trillions of tokens were used in well-known works** by Anthropic and DeepMind. Moreover, the dimension of the hidden layers in `Qwen3` is 2560 neurons, unlike the 512-neuron `GPT-2`-like architectures that Anthropic worked with. Finally, it is worth noting that the data is semantically homogeneous, so the traditional “overcomplete” approach for Sparse Autoencoders can lead to:
- excessive sparsity;
- a large number of “dead” neurons;
- an increased $L_0$ norm;
- weak convergence during optimization.

However, we can **manually select** only the most significant neurons that will be monosemantic and significantly influence the reconstruction. For such a group of neurons, **data homogeneity can play a positive role**, but **most neurons risk being underfitted** due to the limited set of concepts that can fit into only ~700,000 tokens.

In [None]:
# Persist the tokens in the same data/ folder as the dataset. The context
# manager closes the file even if pickling raises.
with Path(TOKENS_PATH).open('wb') as token_file:
    pickle.dump(tokens, token_file)