#### Self-organizing LLM wrapper

In [9]:
import torch
from model.model_sorl import SorlModelWrapper
from model.model_sorl import infer_level
from model.model_minimind import MiniMindConfig

full_vocab_list = [11, 50] # Base vocab + abstract vocabs
model = SorlModelWrapper.from_scratch(
    config=MiniMindConfig(vocab_size=sum(full_vocab_list)), # Config needs the total new vocab size
    full_vocab_size_list=full_vocab_list,
    memory_span=5,
    pad_token_id=0
)
# --- Generate text using the custom SORL logic ---

prompt = torch.tensor([[1, 2, 3]])
generated_sequence = model.generate(
    input_ids=prompt,
    max_new_tokens=50,
    temperature=0.0,
    top_k=50,
    force_abstraction_every_n=4  # Example: force an abstraction token every 10 steps
)

print("--- SORL Generation Results ---")
print("Base vocabulary size:", model.vocab_sizes[0].item())
print("Total vocabulary size:", model.model.config.vocab_size)
print("\nGenerated Sequence:", generated_sequence)


result = model.forward(prompt)
print("\n--- Forward propagation (sparse attention) ---")
print("result.logits.shape: ", result.logits.shape)


orig_tokens = torch.tensor([[1,2,3,61,2,4,1,61,3,4,2,61]])

levels = infer_level(orig_tokens, model.vocab_sizes, -1)
denoise_mask = torch.isin(orig_tokens, model.level_mask_tokens[1:])
denoise_levels = levels[denoise_mask]

new_tokens = model.denoise(orig_tokens, denoise_mask, denoise_levels, 0.0)
print("\n--- Denoising ---")
print(f"Generating 2 level-1 tokens in parallel: {orig_tokens[0].tolist()} --> {new_tokens[0].tolist()}")

--- SORL Generation Results ---
Base vocabulary size: 11
Total vocabulary size: 62

Generated Sequence: tensor([[41, 17, 41, 41, 17, 41, 41, 17, 41, 17, 53,  1,  1,  1, 53,  1,  1]])

--- Forward propagation (sparse attention) ---
result.logits.shape:  torch.Size([1, 3, 62])

--- Denoising ---
Generating 2 level-1 tokens in parallel: [1, 2, 3, 61, 2, 4, 1, 61, 3, 4, 2, 61] --> [1, 2, 3, 60, 2, 4, 1, 16, 3, 4, 2, 17]


#### Self-organizing Reinforcement Learning

In [12]:
import torch
from transformers import AutoTokenizer
from dataset.base import MemLoader
from model.model_sorl import SorlModelWrapper
from model.model_minimind import MiniMindConfig
from src.sorl import SORLConfig, sorl_search, compute_per_token_loss, compute_loss

# --- 1. Full Pipeline Initialization ---
print("--- Initializing training components ---")
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer to get vocab size and pad token id
tokenizer = AutoTokenizer.from_pretrained('model/')
pad_token_id = tokenizer.pad_token_id

# Initialize the high-performance memory-mapped data loader
dataset = MemLoader('dataset/pretrain_hq.bin', device=device)
print("MemLoader initialized.")

# Initialize the SORL-wrapped model
base_vocab_size = tokenizer.vocab_size
abstract_vocab_sizes = [128]
full_vocab_list = [base_vocab_size] + abstract_vocab_sizes
minimind_config = MiniMindConfig(
    hidden_size=256, num_attention_heads=4, num_hidden_layers=4,
    intermediate_size=512, vocab_size=sum(full_vocab_list)
)
# The .to(device) call will now work correctly
sorl_model = SorlModelWrapper.from_scratch(
    config=minimind_config,
    full_vocab_size_list=full_vocab_list,
    memory_span=1024,
    pad_token_id=0
).to(device)
print("SORL Model initialized.")

# Configure the SORL search algorithm
sorl_config = SORLConfig(
    n=4, temperature=1.0, K=8, l=1, steps=4, max_t_search=32,
    use_rhythmic_placeholders=True, use_spike_placeholders=False
)

# Set up the optimizer
optimizer = torch.optim.Adam(sorl_model.model.parameters(), lr=1e-4)
print("--- Initialization Complete ---\n")


# --- 2. Perform a Single SORL Training Step ---
print("--- Running one SORL training step ---")
# Get a batch of data instantly
data_batch, _ = dataset.get_batch(batch_size=4)
print(f"Fetched data batch of shape: {data_batch.shape}")

# a) SORL Search Step (run in no_grad context)
with torch.no_grad():
    search_data, switch_ratio = sorl_search(data_batch, sorl_model, sorl_config)
print(f"SORL search complete. New sequence shape: {search_data.shape}")

# b) Forward Pass: Compute per-token loss on the "improved" data
ppt = compute_per_token_loss(sorl_model, search_data)

# c) Compute final SORL loss (combining trajectory and abstraction losses)
ssl_loss, abs_loss = compute_loss(search_data, sorl_model, ppt)
total_loss = ssl_loss + abs_loss
print(f"Computed Loss -> Total: {total_loss.item():.4f} (SSL: {ssl_loss.item():.4f}, Abs: {abs_loss.item():.4f})")

# d) Backward Pass and Optimizer Step
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
print("Optimizer step complete (weights have been updated).")

print("\n--- ✅ Single training step finished! ---")

--- Initializing training components ---
MemLoader initialized.
SORL Model initialized.
--- Initialization Complete ---

--- Running one SORL training step ---
Fetched data batch of shape: torch.Size([4, 255])
SORL search complete. New sequence shape: torch.Size([4, 286])
Computed Loss -> Total: 17.5724 (SSL: 8.9874, Abs: 8.5850)
Optimizer step complete (weights have been updated).

--- ✅ Single training step finished! ---
