# 🐛 Nanochat Bug Hunt: Tokenizer & Data Loading

Welcome to the first debugging challenge! In this notebook, you'll find and fix three bugs:

1. **Tokenizer Bug**: BOS tokens aren't being prepended
2. **Data Type Bug**: Wrong tensor dtype causing CUDA errors
3. **Off-by-One Bug**: Model learns to predict current token instead of next

Let's start by setting up a tiny training run and seeing what goes wrong!

In [None]:
# Setup: imports and environment
import os
import sys
import torch
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Make the current working directory importable (the notebook expects the
# nanochat package to be reachable from here).
repo_root = Path.cwd()
repo_root_str = str(repo_root)
if repo_root_str not in sys.path:
    sys.path.append(repo_root_str)

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Point NANOCHAT_BASE_DIR at a throwaway cache directory and create it.
cache_dir = os.path.join(repo_root, ".cache_debug")
os.environ["NANOCHAT_BASE_DIR"] = cache_dir
os.makedirs(cache_dir, exist_ok=True)

## Step 1: Create a Tiny Dataset and Train a Small Tokenizer

First, let's create a minimal dataset and train a tokenizer to work with.

In [None]:
# Create a tiny text dataset: five sentences, repeated 100 times so the
# corpus is large enough for BPE training.
tiny_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "To be or not to be, that is the question.",
    "Hello world! This is a test of the tokenizer.",
    "Machine learning is fascinating and powerful.",
    "Python is a great programming language for AI.",
] * 100  # Repeat to get enough data for BPE

# Save as a simple text file, one sentence per line.
# The separator must be a real newline ('\n'). An escaped backslash-n would
# put the whole corpus on a single line, and any line-by-line reader would
# then see one giant string containing literal "\n" text.
data_path = Path(os.environ["NANOCHAT_BASE_DIR"]) / "tiny_data.txt"
with open(data_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(tiny_texts))

print(f"Created dataset with {len(tiny_texts)} lines")
print(f"First line: {tiny_texts[0]}")

In [None]:
# Train a tiny tokenizer
from nanochat.tokenizer import RustBPETokenizer

# Lazily stream the dataset file for tokenizer training
def text_iterator():
    """Yield each line of the file at data_path with surrounding whitespace stripped."""
    with open(data_path, 'r') as handle:
        yield from (raw_line.strip() for raw_line in handle)

# Fit a small (512-entry) vocabulary on the toy corpus
print("Training tokenizer...")
tokenizer = RustBPETokenizer.train_from_iterator(
    text_iterator(),
    vocab_size=512,
)
print(f"Tokenizer vocab size: {tokenizer.get_vocab_size()}")

# Persist the trained tokenizer under the cache directory so it can be
# reloaded from disk later
tokenizer_dir = Path(os.environ["NANOCHAT_BASE_DIR"], "tokenizer")
tokenizer.save(str(tokenizer_dir))

## Step 2: Test the Tokenizer - Find Bug #1

Let's test if the tokenizer correctly prepends BOS tokens when asked.

In [None]:
# Test tokenizer encoding with BOS token
test_text = "Hello world!"
bos_token_id = tokenizer.get_bos_token_id()
print(f"BOS token ID: {bos_token_id}")

# Encode without prepending
tokens_no_bos = tokenizer.encode(test_text)
print(f"\nWithout BOS: {tokens_no_bos}")
print(f"Decoded: {tokenizer.decode(tokens_no_bos)}")

# Encode WITH prepending BOS
tokens_with_bos = tokenizer.encode(test_text, prepend="<|bos|>")
print(f"\nWith BOS (should start with {bos_token_id}): {tokens_with_bos}")
print(f"Decoded: {tokenizer.decode(tokens_with_bos)}")

# 🐛 BUG CHECK: Does the token list start with BOS?
# An empty encoding is treated as "BOS missing" instead of raising IndexError.
bos_missing = len(tokens_with_bos) == 0 or tokens_with_bos[0] != bos_token_id
if bos_missing:
    print("\n❌ BUG FOUND! BOS token is not being prepended!")
    print("💡 Hint: Check the encode() method in tokenizer.py")
else:
    print("\n✅ BOS token correctly prepended!")

## Fix Bug #1: BOS Token Prepending

If you found the bug above, go fix it in `nanochat/tokenizer.py`!

Look for the `encode()` method in the `RustBPETokenizer` class. The prepending logic has been commented out!

In [None]:
# Re-execute nanochat/tokenizer.py so edits made since the first import take
# effect, then rebind the class name to the reloaded definition
import importlib
import nanochat.tokenizer
importlib.reload(nanochat.tokenizer)
from nanochat.tokenizer import RustBPETokenizer

# Rebuild the tokenizer object from the files saved earlier
tokenizer = RustBPETokenizer.from_directory(str(tokenizer_dir))

# Repeat the BOS check against the reloaded tokenizer
tokens_with_bos = tokenizer.encode(test_text, prepend="<|bos|>")
bos_is_first = tokens_with_bos[0] == bos_token_id
print(
    "✅ Bug #1 FIXED! BOS token now prepended correctly!"
    if bos_is_first
    else "❌ Bug #1 still present. Check your fix!"
)

## Step 3: Create a Tiny Model and Try Training - Find Bug #2

Now let's create a small model and attempt to train it. This will reveal the data type bug.

In [None]:
# Create a tiny GPT model
from nanochat.gpt import GPT, GPTConfig

# Deliberately tiny hyperparameters so the model builds and trains quickly
tiny_hparams = dict(
    sequence_len=64,
    vocab_size=tokenizer.get_vocab_size(),
    n_layer=2,      # two transformer layers
    n_head=2,       # two attention heads
    n_kv_head=2,    # equal to n_head, i.e. no MQA
    n_embd=64,      # small embedding width
)
config = GPTConfig(**tiny_hparams)

# Build the model, initialize its weights, and move it to the chosen device
model = GPT(config)
model.init_weights()
model = model.to(device)

param_count = sum(p.numel() for p in model.parameters())
print(f"Model created with {param_count:,} parameters")

In [None]:
# Create a simple data loader
# First, let's tokenize our data and save it
import pickle

# Encode every sentence with a leading BOS and flatten into one token stream
all_tokens = [
    token_id
    for sentence in tiny_texts
    for token_id in tokenizer.encode(sentence, prepend="<|bos|>")
]

# Dump the stream to disk as raw uint16 values
tokens_path = Path(os.environ["NANOCHAT_BASE_DIR"]) / "tokens.bin"
tokens_array = np.asarray(all_tokens, dtype=np.uint16)
tokens_array.tofile(tokens_path)

print(f"Total tokens: {len(all_tokens)}")
print(f"Saved to: {tokens_path}")

In [None]:
# Create a simple batch getter
def get_batch(batch_size=4, seq_len=64):
    """Build one (inputs, targets) training batch from the saved token file.

    This is the walkthrough version used to illustrate Bug #2. It performs
    the batching steps inline and does not call into nanochat.dataloader.

    Args:
        batch_size: Number of rows B in the returned tensors.
        seq_len: Number of tokens T per row.

    Returns:
        A tuple (inputs, targets) of tensors on `device`, each viewed as
        (batch_size, seq_len). inputs is int32 and targets is int64.
        targets is the same flat token stream as inputs, advanced by one
        position.

    Raises:
        Exception: Any error raised while reshaping or moving the tensors is
            reported with dtype diagnostics and then re-raised unchanged.
    """
    from collections import deque

    # Load tokens (stored on disk as raw uint16 values)
    data = np.fromfile(tokens_path, dtype=np.uint16)

    # Random starting positions, each leaving room for seq_len + 1 tokens
    ix = torch.randint(len(data) - seq_len - 1, (batch_size,))

    # Concatenate the sampled windows into one flat token stream
    token_buffer = deque()
    for i in ix:
        token_buffer.extend(data[i:i+seq_len+1].tolist())

    B, T = batch_size, seq_len
    needed_tokens = B * T + 1  # one extra token so targets can be shifted by one

    # This mimics the dataloader logic where the bug is
    tokens = [token_buffer.popleft() for _ in range(needed_tokens)]
    scratch = torch.tensor(tokens, dtype=torch.int64, pin_memory=(device == "cuda"))

    # BUG 2 (kept on purpose for the exercise): int64 used for inputs instead
    # of int32. scratch is already int64, so this conversion changes nothing.
    inputs_cpu = scratch[:-1].to(dtype=torch.int64)  # Should be int32!
    targets_cpu = scratch[1:]

    # Reshape to 2D and move to device
    try:
        inputs = inputs_cpu.view(B, T).to(device=device, dtype=torch.int32, non_blocking=True)
        targets = targets_cpu.view(B, T).to(device=device, dtype=torch.int64, non_blocking=True)
        return inputs, targets
    except Exception as e:
        print(f"❌ Error creating batch: {e}")
        print("💡 Hint: Check the dtype conversions in the dataloader!")
        print(f"Input CPU dtype: {inputs_cpu.dtype}")
        print(f"Target CPU dtype: {targets_cpu.dtype}")
        # Bare raise keeps the original traceback intact
        raise

In [None]:
# Try to get a batch - this should reveal Bug #2
# Only RuntimeError is handled here; any other exception raised by
# get_batch() propagates out of the cell.
try:
    inputs, targets = get_batch()
    print("✅ Batch created successfully!")
    print(f"Inputs shape: {inputs.shape}, dtype: {inputs.dtype}")
    print(f"Targets shape: {targets.shape}, dtype: {targets.dtype}")
except RuntimeError:
    print("\n🐛 BUG #2 DETECTED!")
    print("The error suggests a dtype mismatch.")
    print("\n💡 Fix hint: In dataloader.py, check the dtype of inputs_cpu")
    print("It's being set to int64 but then converted to int32, causing issues!")

## Fix Bug #2: Data Type Mismatch

Go to `nanochat/dataloader.py` and fix the dtype issue in the `tokenizing_distributed_data_loader` function.

Look for where `inputs_cpu` is created - it should use `torch.int32`, not `torch.int64`!

In [None]:
# After fixing, reload and test
# importlib.reload re-executes nanochat/dataloader.py so edits made since the
# first import take effect.
# NOTE(review): get_batch_fixed below performs its batching inline and never
# calls into nanochat.dataloader, so this reload does not change its behavior.
import nanochat.dataloader
importlib.reload(nanochat.dataloader)

# Batch getter with the dtype fix (int32 inputs) applied
def get_batch_fixed(batch_size=4, seq_len=64):
    """Build one (inputs, targets) batch from the token file, using int32 inputs.

    Args:
        batch_size: Number of rows in the returned tensors.
        seq_len: Number of tokens per row.

    Returns:
        A tuple (inputs, targets) of tensors on `device`, each viewed as
        (batch_size, seq_len). inputs is int32 and targets is int64.
        targets is the flat token stream advanced one position past inputs.
    """
    from collections import deque

    corpus = np.fromfile(tokens_path, dtype=np.uint16)
    starts = torch.randint(len(corpus) - seq_len - 1, (batch_size,))

    # Concatenate one (seq_len + 1)-token window per sampled start position
    stream = deque()
    for start in starts:
        window = corpus[start:start + seq_len + 1]
        stream.extend(window.tolist())

    # Take exactly B*T + 1 tokens: the extra one supplies the final target
    B, T = batch_size, seq_len
    flat = [stream.popleft() for _ in range(B * T + 1)]
    scratch = torch.tensor(flat, dtype=torch.int64, pin_memory=(device == "cuda"))

    # Inputs drop the last token and are converted to int32; targets drop the
    # first token, i.e. they are the inputs shifted by one position
    inputs_cpu = scratch[:-1].to(dtype=torch.int32)
    targets_cpu = scratch[1:]

    inputs = inputs_cpu.view(B, T).to(device=device, dtype=torch.int32, non_blocking=True)
    targets = targets_cpu.view(B, T).to(device=device, dtype=torch.int64, non_blocking=True)
    return inputs, targets

# Smoke-test the patched batch getter and report shapes/dtypes
try:
    inputs, targets = get_batch_fixed()
    print("✅ Bug #2 FIXED! Batch created successfully!")
    for label, tensor in (("Inputs", inputs), ("Targets", targets)):
        print(f"{label}: {tensor.shape}, dtype: {tensor.dtype}")
except Exception as e:
    print(f"❌ Still having issues: {e}")

## Step 4: Train the Model and Discover Bug #3

Now let's train the model briefly and see if it learns properly. Bug #3 will cause the model to learn the wrong pattern!

In [None]:
# Short training run: 100 AdamW steps, recording the loss at every step
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
losses = []

print("Training for 100 steps...")
for step in range(100):
    inputs, targets = get_batch_fixed()

    # Forward pass: calling the model with targets yields the loss
    loss = model(inputs, targets)

    # Clear old gradients, backpropagate, and update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    step_loss = loss.item()
    losses.append(step_loss)

    # Log every 20th step
    if not step % 20:
        print(f"Step {step}: loss = {step_loss:.4f}")

# Plot the loss curve
plt.plot(losses)
plt.title('Training Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.show()

In [None]:
# Now let's test what the model learned - this will reveal Bug #3!
model.eval()

# Create a test sequence
test_tokens = tokenizer.encode("The quick", prepend="<|bos|>")
print(f"Input tokens: {test_tokens}")
print(f"Input text: '{tokenizer.decode(test_tokens)}'")

# Get model predictions (no gradients needed for inference)
with torch.no_grad():
    input_tensor = torch.tensor([test_tokens], dtype=torch.int32).to(device)
    logits = model(input_tensor)

    # Greedy choice: highest-scoring token id at every position of row 0
    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_ids = predicted_ids[0].cpu().tolist()

print(f"\nPredicted tokens: {predicted_ids}")
print(f"Predicted text: '{tokenizer.decode(predicted_ids)}'")

# Check if predictions match inputs (they shouldn't!)
# If the prediction at position i equals the input token at position i, the
# model has learned to echo the current token rather than predict the next.
if predicted_ids[:len(test_tokens)-1] == test_tokens[:-1]:
    print("\n❌ BUG #3 DETECTED! Model is predicting current token instead of next token!")
    print("💡 This means targets = inputs, not inputs shifted by 1")
    print("💡 Check the dataloader where targets are created from scratch")
else:
    print("\n✅ Model correctly predicting next tokens!")

## Fix Bug #3: Off-by-One Error

The model is learning to predict the current token instead of the next token!

Go to `nanochat/dataloader.py` and fix the targets creation:
- Currently: `targets_cpu = scratch[:-1]` (WRONG - same as inputs!)
- Should be: `targets_cpu = scratch[1:]` (shifted by 1)

In [None]:
# Final test with all fixes
# Re-execute nanochat/dataloader.py so the latest edits to it are loaded.
# NOTE(review): get_batch_fully_fixed below does its batching inline and does
# not call into nanochat.dataloader, so it is unaffected by this reload.
importlib.reload(nanochat.dataloader)

# Batch getter with both the dtype fix and the one-position target shift
def get_batch_fully_fixed(batch_size=4, seq_len=64):
    """Build one (inputs, targets) batch with int32 inputs and shifted targets.

    Args:
        batch_size: Number of rows in the returned tensors.
        seq_len: Number of tokens per row.

    Returns:
        A tuple (inputs, targets) of tensors on `device`, each viewed as
        (batch_size, seq_len). inputs is int32 and targets is int64.
        targets is the flat token stream advanced one position past inputs.
    """
    from collections import deque

    token_data = np.fromfile(tokens_path, dtype=np.uint16)
    offsets = torch.randint(len(token_data) - seq_len - 1, (batch_size,))

    # Queue up one (seq_len + 1)-token slice per random offset
    pending = deque()
    for offset in offsets:
        pending.extend(token_data[offset:offset + seq_len + 1].tolist())

    B, T = batch_size, seq_len
    total = B * T + 1  # the extra token provides the last target
    drained = [pending.popleft() for _ in range(total)]
    scratch = torch.tensor(drained, dtype=torch.int64, pin_memory=(device == "cuda"))

    # Inputs: all but the last token, as int32
    inputs_cpu = scratch[:-1].to(dtype=torch.int32)
    # Targets: all but the first token, i.e. shifted by one position
    targets_cpu = scratch[1:]

    inputs = inputs_cpu.view(B, T).to(device=device, dtype=torch.int32, non_blocking=True)
    targets = targets_cpu.view(B, T).to(device=device, dtype=torch.int64, non_blocking=True)
    return inputs, targets

# Verify the fix on a single short row: target[i] must equal input[i + 1]
inputs, targets = get_batch_fully_fixed(batch_size=1, seq_len=10)

# Convert each row to a plain list once and reuse it for printing and comparing
input_ids = inputs[0].cpu().tolist()
target_ids = targets[0].cpu().tolist()

print("Input tokens:", input_ids)
print("Target tokens:", target_ids)
print("\nTargets should be inputs shifted by 1:")
print("✅ Correct!" if input_ids[1:] == target_ids[:-1] else "❌ Still wrong!")

In [None]:
# Retrain with all fixes applied
print("🎉 All bugs fixed! Let's train a working model!\n")

# Reinitialize model so training starts from fresh weights, with a fresh
# optimizer bound to the new parameters
model = GPT(config)
model.init_weights()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

losses = []
for step in range(200):
    inputs, targets = get_batch_fully_fixed()
    loss = model(inputs, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    losses.append(loss.item())

    # Log every 40th step
    if step % 40 == 0:
        print(f"Step {step}: loss = {loss.item():.4f}")

plt.plot(losses)
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training Loss (All Bugs Fixed)')
plt.show()

print("\n✅ Success! The model is now training correctly!")

In [None]:
# Final generation test
from nanochat.engine import Engine

# Switch to eval mode and wrap the model and tokenizer in an Engine
model.eval()
engine = Engine(model, tokenizer)

# Generate some text
prompt = "The quick"
prompt_tokens = tokenizer.encode(prompt, prepend="<|bos|>")

print(f"Prompt: '{prompt}'")
print("Generating...\n")

# generate_batch returns a pair; only the first element (the generated token
# sequences) is used here
generated, _ = engine.generate_batch(prompt_tokens, num_samples=1, max_tokens=20, temperature=0.8)
generated_text = tokenizer.decode(generated[0])

print(f"Generated: '{generated_text}'")
print("\n🎉 Congratulations! You've fixed all three bugs!")
print("\nSummary of fixes:")
print("1. ✅ BOS token prepending in tokenizer.encode()")
print("2. ✅ Data type mismatch (int64 → int32) in dataloader")
print("3. ✅ Off-by-one error in target creation")