# Day 5: Tokenizer and Training Tutorial

이 노트북에서는 토크나이저 구현, 학습 루프 구성, 그리고 텍스트 생성을 실습합니다.

In [None]:
# 필요한 라이브러리 import
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), '../../..'))

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# 우리가 구현한 모듈들
from core.tokenizer import CharacterTokenizer, SimpleBPETokenizer, DataLoader
from core.training import (
    cross_entropy_loss, perplexity, 
    SGD, Adam, TextGenerator,
    gradient_clipping
)

## Part 1: Character-level Tokenizer

가장 간단한 토크나이저부터 시작합니다.

In [None]:
# Small corpus that the later cells of this tutorial reuse
sample_text = """
The quick brown fox jumps over the lazy dog.
A journey of a thousand miles begins with a single step.
To be or not to be, that is the question.
"""

# Create the character tokenizer and fit it on the corpus
char_tokenizer = CharacterTokenizer()
char_tokenizer.fit(sample_text)

print(f"Vocabulary size: {char_tokenizer.vocab_size}")

# Peek at the first 20 keys of the character -> id mapping
vocab_preview = list(char_tokenizer.char_to_id.keys())[:20]
print(f"First 20 characters in vocab: {vocab_preview}")

In [None]:
# Encoding check: text -> token ids
test_sentence = "The quick fox"
encoded = char_tokenizer.encode(test_sentence)
print(f"Original: {test_sentence}", f"Encoded: {encoded}", sep="\n")

# Decoding check: token ids -> text
decoded = char_tokenizer.decode(encoded)
print(f"Decoded: {decoded}")

# Same sentence again, this time asking the tokenizer to add its special tokens
encoded_special = char_tokenizer.encode(test_sentence, add_special_tokens=True)
print(f"\nWith special tokens: {encoded_special}")
decoded_special = char_tokenizer.decode(encoded_special, skip_special_tokens=False)
print(f"Decoded: {decoded_special}")

## Part 2: BPE Tokenizer

Byte Pair Encoding을 구현하여 더 효율적인 토큰화를 수행합니다.

In [None]:
# Create the BPE tokenizer and fit it on the corpus with 50 merges requested
bpe_tokenizer = SimpleBPETokenizer(vocab_size=256)
bpe_tokenizer.fit(sample_text, num_merges=50)

# Inspect the first ten entries of the learned merge list
print("First 10 merges:")
for rank, (left, right) in enumerate(bpe_tokenizer.merges[:10], start=1):
    print(f"{rank}. '{left}' + '{right}' -> '{left}{right}'")

In [None]:
# Compare how many tokens each tokenizer produces for the same text
test_text = "The quick brown fox"

char_encoded = char_tokenizer.encode(test_text)
bpe_encoded = bpe_tokenizer.encode(test_text)

num_char_tokens = len(char_encoded)
num_bpe_tokens = len(bpe_encoded)

print(f"Original text: {test_text}")
print(f"Character tokens: {num_char_tokens} tokens")
print(f"BPE tokens: {num_bpe_tokens} tokens")
# A ratio above 1 means BPE needed fewer tokens than the character tokenizer
print(f"\nCompression ratio: {num_char_tokens / num_bpe_tokens:.2f}x")

## Part 3: DataLoader 구현

학습을 위한 배치 데이터 준비

In [None]:
# Batch geometry used by the demo loader
batch_size, seq_length = 2, 10

# Build the DataLoader over the corpus, tokenized with the character tokenizer
dataloader = DataLoader(
    tokenizer=char_tokenizer,
    text=sample_text,
    seq_length=seq_length,
    batch_size=batch_size,
)

print(f"Number of tokens: {dataloader.num_tokens}")
print(f"Number of batches: {dataloader.num_batches}")

In [None]:
# Fetch batch 0 and look at its contents
inputs, targets = dataloader.get_batch(0)

input_shape = np.array(inputs).shape
target_shape = np.array(targets).shape

print("First batch:")
print(f"Input shape: {input_shape}")
print(f"Target shape: {target_shape}")

# Decode the first sequence of the batch back to text for a visual check
first_input, first_target = inputs[0], targets[0]
print("\nFirst sequence:")
print(f"Input: {char_tokenizer.decode(first_input)}")
print(f"Target: {char_tokenizer.decode(first_target)}")
print("\nNote: Target is input shifted by 1 position")

## Part 4: Loss Functions

Cross-entropy loss와 perplexity 계산

In [None]:
# Fake predictions so the loss functions can be exercised without a model
vocab_size = char_tokenizer.vocab_size
batch_size, seq_length = 2, 10
logits_shape = (batch_size, seq_length, vocab_size)

# Standard-normal logits, one score per vocabulary entry and position
logits = np.random.randn(*logits_shape)

# Uniformly random target ids in [0, vocab_size)
targets = np.random.randint(0, vocab_size, size=logits_shape[:2])

# Loss first, then the perplexity derived from it
loss = cross_entropy_loss(logits, targets)
ppl = perplexity(loss)

print(f"Cross-entropy loss: {loss:.4f}")
print(f"Perplexity: {ppl:.2f}")
# Reference point: a perfectly uniform prediction scores exactly log(vocab_size);
# random logits should land close to (typically a little above) that value.
print(f"\nExpected random loss: {np.log(vocab_size):.4f}")
print(f"Expected random perplexity: {vocab_size:.2f}")

## Part 5: Optimizers

SGD와 Adam optimizer 비교

In [None]:
# Minimal parameter container for the optimizer demo
class SimpleParam:
    """Hold a float array (``data``) and a gradient slot (``grad``)."""

    def __init__(self, value):
        # No gradient exists until one is assigned from outside
        self.grad = None
        # Always store a fresh float64 copy of the given value
        self.data = np.array(value, dtype=np.float64)

# Create two identical parameters so both optimizers start from the same point
param_sgd = SimpleParam([1.0, 2.0, 3.0])
param_adam = SimpleParam([1.0, 2.0, 3.0])

# One optimizer per parameter, same learning rate for both
sgd = SGD([param_sgd], learning_rate=0.1)
adam = Adam([param_adam], learning_rate=0.1)

# Record the trajectory of each parameter, starting with the initial value
sgd_history = [param_sgd.data.copy()]
adam_history = [param_adam.data.copy()]

for step in range(10):
    # The gradient of 0.5 * ||x||^2 is x itself, so each update pulls the
    # parameter toward zero. Use a copy: assigning `data` directly would make
    # `grad` and `data` the same array, and any in-place gradient operation
    # (clipping, zeroing, in-place updates) would then corrupt the parameter.
    param_sgd.grad = param_sgd.data.copy()
    param_adam.grad = param_adam.data.copy()

    # Apply one update with each optimizer
    sgd.step()
    adam.step()

    # Snapshot the updated values (copy, since `data` may be modified later)
    sgd_history.append(param_sgd.data.copy())
    adam_history.append(param_adam.data.copy())

    # Clear gradients before the next step
    sgd.zero_grad()
    adam.zero_grad()

In [None]:
# Turn the recorded snapshots into (steps + 1, n_params) arrays for plotting
sgd_history = np.array(sgd_history)
adam_history = np.array(adam_history)
histories = (('SGD', sgd_history), ('Adam', adam_history))

plt.figure(figsize=(12, 4))

# Left panel: trajectory of the first parameter component
plt.subplot(1, 2, 1)
for label, history in histories:
    plt.plot(history[:, 0], label=label)
plt.xlabel('Step')
plt.ylabel('Parameter Value')
plt.title('Optimization Comparison')
plt.legend()
plt.grid(True)

# Right panel: L2 norm of the whole parameter vector at every step
plt.subplot(1, 2, 2)
for label, history in histories:
    plt.plot(np.linalg.norm(history, axis=1), label=label)
plt.xlabel('Step')
plt.ylabel('L2 Norm')
plt.title('Parameter Norm Over Time')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

## Part 6: Text Generation Strategies

다양한 텍스트 생성 전략 비교

In [None]:
# Dummy model for demonstration
class DummyModel:
    """Stand-in model that returns random logits, nudged toward 'T' and 'h'.

    Nothing is learned here; the class only provides a ``forward`` method so
    the text generator can be demonstrated. It reads the notebook-level
    ``char_tokenizer`` to find the ids of the characters it boosts.
    """

    # (character, additive logit bonus) pairs that make sampling less uniform
    _BOOSTED_CHARS = (('T', 1), ('h', 0.5))

    def __init__(self, vocab_size):
        self.vocab_size = vocab_size

    def forward(self, input_ids):
        """Return random logits of shape (batch, seq_len, vocab_size).

        Args:
            input_ids: Token ids as a list, nested list or array. A 1-D input
                is treated as a single sequence (batch size 1); for 2-D input
                the first axis is the batch and the second the sequence.

        Returns:
            A float array of standard-normal logits with the boosted
            characters' columns shifted upward.
        """
        # Convert once so lists and arrays are handled alike. The previous
        # version read `.shape` from the raw argument and failed on nested lists.
        ids = np.atleast_2d(np.asarray(input_ids))
        batch_size, seq_len = ids.shape[:2]

        logits = np.random.randn(batch_size, seq_len, self.vocab_size)

        # Boost only characters that exist in the tokenizer's vocabulary
        char_to_id = char_tokenizer.char_to_id
        for char, bonus in self._BOOSTED_CHARS:
            if char in char_to_id:
                logits[:, :, char_to_id[char]] += bonus

        return logits

# Instantiate the dummy model and wrap it in a text generator
model = DummyModel(vocab_size=char_tokenizer.vocab_size)
generator = TextGenerator(model, char_tokenizer)

In [None]:
# Run the same prompt through several decoding configurations
prompt = "The "

print("Different generation strategies:\n")

# (printed label, sampling keyword arguments) in the order they are shown
strategies = [
    ("Greedy", {"temperature": 0}),        # temperature=0 is the greedy setting
    ("Temp=0.5", {"temperature": 0.5}),    # low temperature
    ("Temp=1.0", {"temperature": 1.0}),    # normal temperature
    ("Temp=2.0", {"temperature": 2.0}),    # high temperature
    ("Top-k=5", {"top_k": 5}),             # top-k sampling
    ("Top-p=0.9", {"top_p": 0.9}),         # top-p sampling
]

for label, sampling_kwargs in strategies:
    generated = generator.generate(prompt, max_length=20, **sampling_kwargs)
    print(f"{label}: {generated}")

## Part 7: Mini Language Model Training

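작은 언어 모델을 실제로 학습시켜 봅시다. 단, 이 예제의 `backward`는 실제 gradient 대신 무작위 더미 gradient를 사용하므로, 학습 루프의 구조를 보여 주는 것이 목적이며 loss가 실제로 감소하지는 않을 수 있습니다.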

In [None]:
# Mini LM for character-level modeling
class _Parameter:
    """Trainable array: the values live in ``data``, the gradient in ``grad``.

    A plain ``numpy.ndarray`` cannot carry a ``.grad`` attribute, so the model
    weights are wrapped in this holder. It has the same ``data``/``grad``
    attributes as the ``SimpleParam`` objects handed to SGD/Adam earlier in
    this notebook (presumably what those optimizers read; confirm against
    ``core.training``).
    """

    def __init__(self, data):
        self.data = np.asarray(data, dtype=float)
        self.grad = None

    @property
    def shape(self):
        """Shape of the underlying value array."""
        return self.data.shape


class MiniLM:
    """Tiny model: an embedding lookup followed by a linear output projection.

    Attributes:
        vocab_size: Number of token ids the model knows.
        hidden_size: Width of the embedding vectors.
        embedding: Parameter of shape (vocab_size, hidden_size).
        output: Parameter of shape (hidden_size, vocab_size).
    """

    def __init__(self, vocab_size, hidden_size=64):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # Small random initial weights, embedding drawn first, then output
        self.embedding = _Parameter(np.random.randn(vocab_size, hidden_size) * 0.01)
        self.output = _Parameter(np.random.randn(hidden_size, vocab_size) * 0.01)

    def forward(self, input_ids):
        """Compute logits of shape (batch, seq_len, vocab_size).

        Args:
            input_ids: Integer token ids as a list or array. A 1-D input is
                treated as a single sequence (batch size 1).

        Returns:
            Float array of logits, one row of vocabulary scores per position.
        """
        ids = np.asarray(input_ids)
        if ids.ndim == 1:
            # Promote a single sequence to a batch of one
            ids = ids[np.newaxis, :]

        # Integer-array indexing performs the embedding lookup for every
        # position at once: (batch, seq_len) -> (batch, seq_len, hidden_size)
        embedded = self.embedding.data[ids]

        # Output projection: (batch, seq_len, hidden) @ (hidden, vocab)
        return embedded @ self.output.data

    def parameters(self):
        """Return the trainable parameters as a list [embedding, output]."""
        return [self.embedding, self.output]

    def backward(self, loss):
        """Attach placeholder gradients to both parameters.

        Simplified for the tutorial: ``loss`` is ignored and the gradients are
        small random values, so optimizer steps exercise the training loop
        without actually minimizing the loss.
        """
        self.embedding.grad = np.random.randn(*self.embedding.shape) * 0.01
        self.output.grad = np.random.randn(*self.output.shape) * 0.01

In [None]:
# Build the model and an Adam optimizer over its two parameters
mini_lm = MiniLM(char_tokenizer.vocab_size, hidden_size=32)
optimizer = Adam(mini_lm.parameters(), learning_rate=0.01)

# Average loss of every epoch, kept for plotting afterwards
losses = []
print("Training mini language model...")

for epoch_number in range(1, 6):
    running_loss = 0
    num_batches = 0

    for inputs, targets in dataloader:
        inputs = np.array(inputs)
        targets = np.array(targets)

        # Forward pass and loss
        logits = mini_lm.forward(inputs)
        loss = cross_entropy_loss(logits, targets)

        # Backward pass (simplified inside the model)
        mini_lm.backward(loss)

        # Clip gradients before the update
        gradient_clipping(mini_lm.parameters(), max_norm=1.0)

        # Apply the update, then reset gradients for the next batch
        optimizer.step()
        optimizer.zero_grad()

        running_loss += loss
        num_batches += 1

    # Guard against an empty loader so the division cannot fail
    avg_loss = running_loss / num_batches if num_batches else 0
    losses.append(avg_loss)
    print(f"Epoch {epoch_number}: Loss = {avg_loss:.4f}, Perplexity = {perplexity(avg_loss):.2f}")

In [None]:
# Plot the per-epoch loss and the perplexity derived from it, side by side
def _draw_panel(position, values, ylabel, title):
    """Draw one curve in the given slot of a 1x2 subplot grid."""
    plt.subplot(1, 2, position)
    plt.plot(values)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)


plt.figure(figsize=(10, 4))

_draw_panel(1, losses, 'Loss', 'Training Loss')
_draw_panel(
    2,
    [perplexity(epoch_loss_value) for epoch_loss_value in losses],
    'Perplexity',
    'Training Perplexity',
)

plt.tight_layout()
plt.show()

In [None]:
# Sample continuations from the mini LM for a few prompts
trained_generator = TextGenerator(mini_lm, char_tokenizer)

prompts = ["The ", "A ", "To "]
# Decoding settings shared by every prompt
generation_options = {"max_length": 30, "temperature": 0.8, "top_k": 10}

print("Text generation with trained model:\n")
for prompt in prompts:
    generated = trained_generator.generate(prompt, **generation_options)
    print(f"Prompt: '{prompt}' -> {generated}")

## Summary

이 튜토리얼에서 다룬 내용:

1. **Tokenization**
   - Character-level tokenizer
   - BPE tokenizer
   - Vocabulary 관리

2. **Training Components**
   - DataLoader
   - Loss functions
   - Optimizers (SGD, Adam)

3. **Text Generation**
   - Temperature sampling
   - Top-k sampling
   - Top-p sampling

4. **Mini LM Training**
   - Training loop
   - Gradient clipping
   - Loss monitoring

## Next Steps

- 더 큰 데이터셋으로 학습
- 실제 Transformer 모델과 연결
- Advanced tokenization (SentencePiece)
- Pre-training strategies