#### Self-organizing LLM wrapper

In [9]:
import torch
from model.model_sorl import SorlModelWrapper
from model.model_sorl import infer_level
from model.model_minimind import MiniMindConfig

# Demo of the SORL wrapper: build a tiny model, generate from a prompt, run a
# plain forward pass, then fill placeholder positions with `denoise`.

# First entry is the base vocabulary size, the remaining entries are the
# abstract vocabularies layered on top of it.
full_vocab_list = [11, 50]  # Base vocab + abstract vocabs
model = SorlModelWrapper.from_scratch(
    config=MiniMindConfig(vocab_size=sum(full_vocab_list)),  # Config needs the total new vocab size
    full_vocab_size_list=full_vocab_list,
    memory_span=5,
    pad_token_id=0
)
# --- Generate text using the custom SORL logic ---

prompt = torch.tensor([[1, 2, 3]])
generated_sequence = model.generate(
    input_ids=prompt,
    max_new_tokens=50,
    temperature=0.0,
    top_k=50,
    force_abstraction_every_n=4  # Example: force an abstraction token every 4 steps
)

print("--- SORL Generation Results ---")
print("Base vocabulary size:", model.vocab_sizes[0].item())
print("Total vocabulary size:", model.model.config.vocab_size)
print("\nGenerated Sequence:", generated_sequence)


result = model.forward(prompt)
print("\n--- Forward propagation (sparse attention) ---")
print("result.logits.shape: ", result.logits.shape)


# Token id 61 marks the positions to be denoised.
# NOTE(review): presumably 61 is the level-1 placeholder from
# `model.level_mask_tokens` — confirm against the wrapper.
orig_tokens = torch.tensor([[1,2,3,61,2,4,1,61,3,4,2,61]])

levels = infer_level(orig_tokens, model.vocab_sizes, -1)
# Select every position holding one of the placeholder ids from
# `level_mask_tokens[1:]` (entry 0 is skipped).
denoise_mask = torch.isin(orig_tokens, model.level_mask_tokens[1:])
denoise_levels = levels[denoise_mask]

new_tokens = model.denoise(orig_tokens, denoise_mask, denoise_levels, 0.0)

# Report the real number of denoised positions instead of a hard-coded count
# (the sequence above contains three placeholders, not two).
num_denoised = int(denoise_mask.sum())
print("\n--- Denoising ---")
print(f"Generating {num_denoised} level-1 tokens in parallel: {orig_tokens[0].tolist()} --> {new_tokens[0].tolist()}")

--- SORL Generation Results ---
Base vocabulary size: 11
Total vocabulary size: 62

Generated Sequence: tensor([[41, 17, 41, 41, 17, 41, 41, 17, 41, 17, 53,  1,  1,  1, 53,  1,  1]])

--- Forward propagation (sparse attention) ---
result.logits.shape:  torch.Size([1, 3, 62])

--- Denoising ---
Generating 2 level-1 tokens in parallel: [1, 2, 3, 61, 2, 4, 1, 61, 3, 4, 2, 61] --> [1, 2, 3, 60, 2, 4, 1, 16, 3, 4, 2, 17]


In [1]:
import torch
from model.model_sorl import SorlModelWrapper
from model.model_minimind import MiniMindConfig
# from model.model_sorl import 
from src.sorl import SORLConfig, sorl_search, compute_per_token_loss, compute_loss, evaluate

# Smoke test for the SORL training utilities: build a small model, run
# `sorl_search` on random data, compute the losses, then call `evaluate`.

# Seed torch's RNGs before anything random is created (model weights, dummy
# batch, sampling at temperature 1.0) so the printed numbers are repeatable.
SEED = 0
torch.manual_seed(SEED)

# --- 1. Setup the Model and Configuration ---
print("="*80)
print("--- Initializing Model and SORL Configuration ---")
print("="*80)

# Initialize a SORL-wrapped MiniMind model from scratch for the test
base_vocab_size = 512
abstract_vocab_sizes = [128]
full_vocab_list = [base_vocab_size] + abstract_vocab_sizes

minimind_config = MiniMindConfig(
    hidden_size=64, # Using smaller dimensions for faster testing
    num_attention_heads=2,
    num_hidden_layers=2,
    intermediate_size=128,
    vocab_size=sum(full_vocab_list)
)
sorl_model = SorlModelWrapper(
    config=minimind_config,
    full_vocab_size_list=full_vocab_list,
    memory_span=1024
)

# Create a configuration for the SORL search algorithm
# These parameters control how abstraction is performed
sorl_config = SORLConfig(
    n=4,                    # Number of candidates to roll out
    temperature=1.0,        # Temperature for sampling abstract tokens
    K=8,                    # Rhythmic stride for level-1 abstraction
    l=1,                    # The abstraction level to search for
    steps=4,                # Steps for chunk-wise denoising
    max_t_search=32,        # Max number of abstract timestamps to search within
    use_rhythmic_placeholders=True,
    use_spike_placeholders=False # Disable spike for simplicity in this test
)

print(f"Model Initialized. Total vocabulary size: {sorl_model.model.config.vocab_size}")
print(f"SORL Config: {sorl_config}\n")


# --- 2. Create Dummy Data ---
batch_size = 2
seq_len = 128
# Create a batch of random token sequences drawn from the base vocabulary only
dummy_data = torch.randint(0, base_vocab_size, (batch_size, seq_len), device=sorl_model.model.device)
print(f"Created dummy data with shape: {dummy_data.shape}\n")


# --- 3. Test the `sorl_search` Function ---
print("="*80)
print("--- Testing `sorl_search` ---")
print("="*80)

# This is the core of the SORL algorithm. It takes the original data and
# finds a better representation by inserting abstract tokens.
with torch.no_grad():
    best_sequence, switch_ratio = sorl_search(dummy_data, sorl_model, sorl_config)

print(f"Original sequence length: {dummy_data.shape[1]}")
print(f"Sequence length after search (with abstractions): {best_sequence.shape[1]}")
print(f"Abstraction switch ratio: {switch_ratio:.2f}")
# The switch ratio indicates how often the algorithm preferred a sampled abstraction over the greedy one.
assert best_sequence.shape[0] == batch_size
assert best_sequence.shape[1] > seq_len # Should be longer due to added placeholders
print("✅ `sorl_search` test passed.\n")


# --- 4. Test Loss Computation ---
print("="*80)
print("--- Testing Loss Computation ---")
print("="*80)

# Compute the loss on the improved sequence found by the search.
# This is what would be used for the backward pass during training.
ppt = compute_per_token_loss(sorl_model, best_sequence)
ssl_loss, abs_loss = compute_loss(best_sequence, sorl_model, ppt)
total_loss = ssl_loss + abs_loss

print(f"Per-token loss shape: {ppt.shape}")
print(f"Trajectory Loss (ssl_loss): {ssl_loss.item():.4f}")
print(f"Abstraction Loss (abs_loss): {abs_loss.item():.4f}")
print(f"Total Loss: {total_loss.item():.4f}")
# Both losses must be scalars so they can be summed and back-propagated.
assert ssl_loss.ndim == 0 and abs_loss.ndim == 0
print("✅ Loss computation test passed.\n")


# --- 5. Test the `evaluate` Function ---
print("="*80)
print("--- Testing `evaluate` ---")
print("="*80)

# This function simulates a validation step, comparing a greedy search
# against a random search to measure the potential for improvement.
with torch.no_grad():
    greedy_ppl, improve_ppl_percent, _, _ = evaluate(dummy_data, sorl_model, n=4, config=sorl_config)

print(f"Greedy trajectory perplexity: {greedy_ppl.item():.4f}")
print(f"Search improvement over greedy: {improve_ppl_percent.item():.2f}%")
print("✅ `evaluate` test passed.")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


--- Initializing Model and SORL Configuration ---
Model Initialized. Total vocabulary size: 642
SORL Config: SORLConfig(n=4, temperature=1.0, K=8, causal_rollout=False, budget=None, l=1, steps=4, max_t_search=32, start_ts=None, end_ts=None, abstract_budget=5, use_rhythmic_placeholders=True, use_spike_placeholders=False, curriculum_ratio=0.6, max_seq_len=None, use_fade_memory=False, min_keep=1024, train_dataset_path=None, val_dataset_path=None, train_batch_size=128, val_batch_size=128, train_iterations=1000, val_iterations=10, max_length=1024, learning_rate=0.001, log_interval=100)

Created dummy data with shape: torch.Size([2, 128])

--- Testing `sorl_search` ---
Original sequence length: 128
Sequence length after search (with abstractions): 143
Abstraction switch ratio: 1.00
✅ `sorl_search` test passed.

--- Testing Loss Computation ---
Per-token loss shape: torch.Size([2, 142])
Trajectory Loss (ssl_loss): 6.6709
Abstraction Loss (abs_loss): 6.3879
Total Loss: 13.0588
✅ Loss computati

In [1]:
# Load language modeling data inside & train on them ~ 

import torch
from transformers import AutoTokenizer
from model.model_sorl import SorlModelWrapper
from model.model_minimind import MiniMindConfig
from src.sorl import SORLConfig, sorl_search, compute_per_token_loss, compute_loss
from dataset.utils import get_data_loader

# Builds a small SORL model sized to the project tokenizer and checks that
# `get_data_loader` can yield one batch from the pre-training JSONL file.

# --- 1. Setup Model, Tokenizer, and Configuration ---
print("="*80)
print("--- Initializing Model, Tokenizer, and SORL Configuration ---")
print("="*80)

# Load the tokenizer that will be used for the dataset.
# This should match the tokenizer your model was trained with.
try:
    tokenizer = AutoTokenizer.from_pretrained('model/')
except Exception as e:
    print(f"Could not load tokenizer from 'model/'. Make sure tokenizer files are present. Error: {e}")
    # Everything below needs `tokenizer`; re-raise so the cell stops here with
    # the real cause instead of a later NameError. To experiment without the
    # project tokenizer, load another one explicitly (e.g. 'gpt2'), keeping in
    # mind that this changes the base vocabulary size.
    raise
print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")

# Initialize a SORL-wrapped MiniMind model for the test.
# The base vocab size should match the tokenizer's.
base_vocab_size = tokenizer.vocab_size
abstract_vocab_sizes = [128]
full_vocab_list = [base_vocab_size] + abstract_vocab_sizes

minimind_config = MiniMindConfig(
    hidden_size=64, # Using smaller dimensions for faster testing
    num_attention_heads=2,
    num_hidden_layers=2,
    intermediate_size=128,
    vocab_size=sum(full_vocab_list)
)
sorl_model = SorlModelWrapper(
    config=minimind_config,
    full_vocab_size_list=full_vocab_list,
    memory_span=1024
)
print(f"Model Initialized. Total vocabulary size: {sorl_model.model.config.vocab_size}\n")

# --- 2. Test the Data Loader ---
print("="*80)
print("--- Testing `get_data_loader` with real data ---")
print("="*80)

# Create a configuration for the SORL search algorithm, now including dataset paths
sorl_config = SORLConfig(
    n=4,
    temperature=1.0,
    K=8,
    l=1,
    steps=4,
    max_t_search=32,
    train_dataset_path='dataset/pretrain_hq.jsonl',
    train_batch_size=2,
    max_length=256, # Using a smaller sequence length for faster testing
    use_rhythmic_placeholders=True,
    use_spike_placeholders=False
)

# Create the data loader using the new utility function; path, batch size and
# maximum length all come from the config above.
train_loader = get_data_loader(
    dataset_path=sorl_config.train_dataset_path,
    tokenizer=tokenizer,
    batch_size=sorl_config.train_batch_size,
    max_length=sorl_config.max_length
)

# Fetch one batch of data. Each batch unpacks into three items; only X is
# inspected below.
X, Y, loss_mask = next(iter(train_loader))

print("Successfully loaded one batch of data.")
print(f"Batch shape (X): {X.shape}")
print(f"Data type: {X.dtype}")
print("✅ `get_data_loader` test passed.\n")



The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


--- Initializing Model, Tokenizer, and SORL Configuration ---
Tokenizer loaded. Vocab size: 6400
Model Initialized. Total vocabulary size: 6530

--- Testing `get_data_loader` with real data ---
Successfully loaded one batch of data.
Batch shape (X): torch.Size([2, 255])
Data type: torch.int64
✅ `get_data_loader` test passed.



In [7]:
from dataset.base import MemLoader

# Build the project's MemLoader over the binary pre-training file, with the
# device set to CPU.
dataset = MemLoader(
    filepath='dataset/pretrain_hq.bin',
    device="cpu",
)

# Batches are then drawn with `dataset.get_batch(batch_size=...)`; the author
# notes that this call returns immediately.

In [9]:
# Draw one batch of 10 samples from the MemLoader instance `dataset`.
# `get_batch` returns a pair, unpacked here as `data` and `mask`.
# NOTE(review): `mask` is presumably a loss/padding mask — confirm against MemLoader.
data, mask = dataset.get_batch(batch_size=10) # author's note: this call is now immediate

# TODO: the SORL training step on this batch is still missing from this cell.


In [1]:
import torch
from transformers import AutoTokenizer
from dataset.base import MemLoader
from model.model_sorl import SorlModelWrapper
from model.model_minimind import MiniMindConfig
from src.sorl import SORLConfig, sorl_search, compute_per_token_loss, compute_loss

# End-to-end check of one SORL optimisation step: load tokenizer and data,
# build the model, run the search, compute the loss and apply one Adam update.

# --- 1. Full Pipeline Initialization ---
print("--- Initializing training components ---")
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer to get vocab size and pad token id
tokenizer = AutoTokenizer.from_pretrained('model/')
pad_token_id = tokenizer.pad_token_id

# Initialize the high-performance memory-mapped data loader
dataset = MemLoader('dataset/pretrain_hq.bin', device=device)
print("MemLoader initialized.")

# Initialize the SORL-wrapped model
base_vocab_size = tokenizer.vocab_size
abstract_vocab_sizes = [128]
full_vocab_list = [base_vocab_size] + abstract_vocab_sizes
minimind_config = MiniMindConfig(
    hidden_size=256, num_attention_heads=4, num_hidden_layers=4,
    intermediate_size=512, vocab_size=sum(full_vocab_list)
)
model_kwargs = dict(
    config=minimind_config,
    full_vocab_size_list=full_vocab_list,
    memory_span=1024,
)
# Forward the tokenizer's pad id so the wrapper and the tokenizer agree on it.
# If the tokenizer defines none, the wrapper's own default is left in place.
if pad_token_id is not None:
    model_kwargs["pad_token_id"] = pad_token_id
# Move the freshly built wrapper onto the same device the data loader uses.
sorl_model = SorlModelWrapper.from_scratch(**model_kwargs).to(device)
print("SORL Model initialized.")

# Configure the SORL search algorithm
sorl_config = SORLConfig(
    n=4, temperature=1.0, K=8, l=1, steps=4, max_t_search=32,
    use_rhythmic_placeholders=True, use_spike_placeholders=False
)

# Set up the optimizer over the wrapped model's parameters
optimizer = torch.optim.Adam(sorl_model.model.parameters(), lr=1e-4)
print("--- Initialization Complete ---\n")


# --- 2. Perform a Single SORL Training Step ---
print("--- Running one SORL training step ---")
# Get a batch of data; the second element of the returned pair is not used here
data_batch, _ = dataset.get_batch(batch_size=4)
print(f"Fetched data batch of shape: {data_batch.shape}")

# a) SORL Search Step (run in no_grad context: the search only selects the
#    sequence to train on, gradients are taken in the forward pass below)
with torch.no_grad():
    search_data, switch_ratio = sorl_search(data_batch, sorl_model, sorl_config)
print(f"SORL search complete. New sequence shape: {search_data.shape}")

# b) Forward Pass: Compute per-token loss on the "improved" data
ppt = compute_per_token_loss(sorl_model, search_data)

# c) Compute final SORL loss (combining trajectory and abstraction losses)
ssl_loss, abs_loss = compute_loss(search_data, sorl_model, ppt)
total_loss = ssl_loss + abs_loss
print(f"Computed Loss -> Total: {total_loss.item():.4f} (SSL: {ssl_loss.item():.4f}, Abs: {abs_loss.item():.4f})")

# d) Backward Pass and Optimizer Step
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
print("Optimizer step complete (weights have been updated).")

print("\n--- ✅ Single training step finished! ---")

--- Initializing training components ---


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


MemLoader initialized.
SORL Model initialized.
--- Initialization Complete ---

--- Running one SORL training step ---
Fetched data batch of shape: torch.Size([4, 255])
SORL search complete. New sequence shape: torch.Size([4, 286])
Computed Loss -> Total: 17.7018 (SSL: 8.9489, Abs: 8.7529)
Optimizer step complete (weights have been updated).

--- ✅ Single training step finished! ---
