# Level 1: Tokenization & Data Pipeline

**Objective:** Fix bugs in the tokenization and data loading pipeline.

**Acceptance Criteria:**
- All tests in `tests/test_level1.py` pass
- Model trains without errors
- Training loss decreases: over 50 steps, the mean loss of the last 10 steps is more than 10% below the mean loss of the first 10 steps

**Time estimate:** 30-45 minutes

In [None]:
import os
import sys
import torch
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Make the current working directory importable (appended once, even if the
# cell is re-run).
repo_root = Path.cwd()
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

# Pick the compute device in order of preference: CUDA, then Apple MPS (only
# when this torch build exposes the backend), then CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(f"Device: {device}")

# Later cells read NANOCHAT_BASE_DIR to locate the tokenizer directory and the
# token file, so the directory is created up front.
cache_dir = os.path.join(repo_root, ".cache_level1")
os.environ["NANOCHAT_BASE_DIR"] = cache_dir
os.makedirs(cache_dir, exist_ok=True)

## Setup: Train Tokenizer

In [None]:
from nanochat.tokenizer import RustBPETokenizer

# The tokenizer lives under the notebook's cache directory.
tokenizer_dir = Path(os.environ["NANOCHAT_BASE_DIR"]) / "tokenizer"

if tokenizer_dir.exists():
    # Reuse the tokenizer saved by a previous run.
    tokenizer = RustBPETokenizer.from_directory(str(tokenizer_dir))
else:
    # First run: train a small tokenizer on a repeated sentence and save it.
    texts = ["The quick brown fox jumps over the lazy dog."] * 1000
    tokenizer = RustBPETokenizer.train_from_iterator(iter(texts), vocab_size=512)
    tokenizer.save(str(tokenizer_dir))

print(f"Tokenizer ready: {tokenizer.get_vocab_size()} tokens")

## Test 1: Tokenizer Encoding

In [None]:
# Encode the same text with and without a prepended BOS token and compare.
test_text = "Hello world"
bos_id = tokenizer.get_bos_token_id()

tokens_with_bos = tokenizer.encode(test_text, prepend="<|bos|>")
tokens_without = tokenizer.encode(test_text)

print(f"BOS token ID: {bos_id}")
print(f"With BOS: {tokens_with_bos[:5]}")
print(f"Without BOS: {tokens_without[:5]}")

# Acceptance test: the BOS id must come first ...
first_token = tokens_with_bos[0]
assert first_token == bos_id, f"FAIL: First token should be BOS ({bos_id}), got {tokens_with_bos[0]}"
# ... and prepending must add exactly one token.
extra_tokens = len(tokens_with_bos) - len(tokens_without)
assert extra_tokens == 1, "FAIL: BOS token not added"
print("✓ Test 1 passed")

## Test 2: Data Loading Pipeline

In [None]:
# Build a flat token stream: 100 copies of one sentence, each encoded with a
# prepended BOS token.
texts = ["The quick brown fox"] * 100
all_tokens = [
    token
    for text in texts
    for token in tokenizer.encode(text, prepend="<|bos|>")
]

# Write the stream to disk as raw uint16 values.
tokens_path = Path(os.environ["NANOCHAT_BASE_DIR"]) / "tokens.bin"
np.asarray(all_tokens, dtype=np.uint16).tofile(tokens_path)
print(f"Created test data: {len(all_tokens)} tokens")

In [None]:
# Test batch creation
from collections import deque

def create_batch(batch_size=2, seq_len=10):
    """Build one (inputs, targets) next-token batch from the start of the token file.

    Reads the first ``batch_size * seq_len + 1`` uint16 tokens from the
    module-level ``tokens_path`` and returns two ``(batch_size, seq_len)``
    tensors placed on the module-level ``device``. ``targets`` is the same
    token stream as ``inputs`` shifted by one position.

    Args:
        batch_size: Number of rows (B) in the batch.
        seq_len: Number of tokens (T) per row.

    Returns:
        Tuple ``(inputs, targets)``; ``inputs`` is int32 and ``targets`` is
        int64.

    Raises:
        ValueError: If ``batch_size`` or ``seq_len`` is negative, or if the
            token file holds fewer than ``batch_size * seq_len + 1`` tokens.
    """
    if batch_size < 0 or seq_len < 0:
        raise ValueError(
            f"batch_size and seq_len must be non-negative, got {batch_size} and {seq_len}"
        )

    B, T = batch_size, seq_len
    needed_tokens = B * T + 1

    # Read only the tokens this batch uses instead of loading the whole file
    # on every call.
    data = np.fromfile(tokens_path, dtype=np.uint16, count=needed_tokens)
    if data.size < needed_tokens:
        raise ValueError(
            f"Need {needed_tokens} tokens for a {B}x{T} batch, "
            f"but {tokens_path} only holds {data.size}"
        )

    # This mimics dataloader logic: one contiguous run of B*T+1 tokens, with
    # inputs dropping the last token and targets dropping the first.
    scratch = torch.from_numpy(data.astype(np.int64))
    inputs = scratch[:-1].view(B, T).to(device=device, dtype=torch.int32)
    targets = scratch[1:].view(B, T).to(device=device, dtype=torch.int64)
    return inputs, targets

try:
    inputs, targets = create_batch()
    print(f"Inputs: {inputs.shape}, dtype={inputs.dtype}")
    print(f"Targets: {targets.shape}, dtype={targets.dtype}")

    # Acceptance tests
    assert inputs.dtype == torch.int32, f"FAIL: inputs should be int32, got {inputs.dtype}"
    assert targets.dtype == torch.int64, f"FAIL: targets should be int64, got {targets.dtype}"

    # Verify autoregressive property: targets should be inputs shifted by 1
    inputs_flat = inputs.flatten().cpu()
    targets_flat = targets.flatten().cpu()
    assert not torch.equal(inputs_flat, targets_flat), "FAIL: targets should not equal inputs"
    assert torch.equal(inputs_flat[1:], targets_flat[:-1]), "FAIL: targets should be inputs shifted by 1"

    print("✓ Test 2 passed")
except Exception as e:
    print(f"✗ Test 2 failed: {e}")
    # Re-raise so a failure stops the notebook with a full traceback, as the
    # bare asserts in Tests 1 and 3 do, instead of silently continuing.
    raise

## Test 3: Training Smoke Test

In [None]:
from nanochat.gpt import GPT, GPTConfig

# Small model configuration for the training smoke test; the vocabulary size
# is taken from the tokenizer.
config = GPTConfig(
    sequence_len=64,
    vocab_size=tokenizer.get_vocab_size(),
    n_layer=2,
    n_head=2,
    n_kv_head=2,
    n_embd=64,
)

# Build the model, initialise its weights, then move it to the chosen device.
model = GPT(config)
model.init_weights()
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

param_count = sum(p.numel() for p in model.parameters())
print(f"Model: {param_count:,} parameters")

In [None]:
# Run 50 optimisation steps and record the loss of each one.
losses = []
for step in range(50):
    # Clear gradients left over from the previous step before computing new ones.
    optimizer.zero_grad()

    inputs, targets = create_batch(batch_size=4, seq_len=32)
    loss = model(inputs, targets)
    loss.backward()
    optimizer.step()

    losses.append(loss.item())
    if step % 10 == 0:
        print(f"Step {step}: loss = {loss.item():.4f}")

# Acceptance test: loss should decrease. Compare the mean of the first ten
# steps with the mean of the last ten.
first_window, last_window = losses[:10], losses[-10:]
initial_loss = np.mean(first_window)
final_loss = np.mean(last_window)
improvement = (initial_loss - final_loss) / initial_loss

print(f"\nInitial loss: {initial_loss:.4f}")
print(f"Final loss: {final_loss:.4f}")
print(f"Improvement: {improvement*100:.1f}%")

assert improvement > 0.1, f"FAIL: Loss should improve by >10%, got {improvement*100:.1f}%"
print("✓ Test 3 passed")

# Plot the per-step loss curve.
plt.plot(losses)
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()

## Summary

If every cell above ran without raising an error and printed its "✓ Test N passed" line, all tests passed and the tokenization and data loading pipeline is working correctly.