# Notebook 2: Entrenamiento del Transformer NMT

Este notebook implementa el entrenamiento completo del modelo Transformer para traducción español-inglés.

## Contenido
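1. Preparación de datos (Tatoeba)
2. Configuración del modelo
3. Configuración de entrenamiento (optimizador, scheduler y función de pérdida)
4. Loop de entrenamiento (con early stopping y checkpoint del mejor modelo)
5. Visualización de resultados
6. Guardado de vocabularios y configuración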

In [None]:
import sys
sys.path.append('..')

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.cuda.amp import GradScaler, autocast
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import time
from tqdm.auto import tqdm

from src.models.transformer import TransformerConfig, Transformer
from src.data import prepare_data, Vocabulary
from src.train import Trainer, WarmupCosineScheduler
from src.utils import set_seed, get_device, count_parameters, save_checkpoint, load_checkpoint

# Plot styling applied to the figures drawn in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Seed the random number generators through the project helper
set_seed(42)

# Ask the project helper for the compute device and report it
device = get_device()
print(f"Usando device: {device}")

## 1. Preparación de Datos

Vamos a descargar y preparar el dataset Tatoeba de español-inglés.

In [None]:
# Data settings. Every key matches a keyword argument of prepare_data,
# so the dict can be unpacked straight into the call below.
data_config = {
    'lang_pair': 'es-en',
    'max_samples': 20000,  # capped to keep training fast
    'max_length': 50,
    'min_freq': 2,
    'batch_size': 64,
    'num_workers': 4
}

print("Preparando datos...")
for label, key in (
    ("Par de idiomas", 'lang_pair'),
    ("Máximo de muestras", 'max_samples'),
    ("Longitud máxima", 'max_length'),
    ("Batch size", 'batch_size'),
):
    print(f"  - {label}: {data_config[key]}")

# Build the data loaders and both vocabularies
data_loaders, vocab_src, vocab_tgt = prepare_data(**data_config, seed=42)

# One loader per split
train_loader, val_loader, test_loader = (
    data_loaders[split] for split in ('train', 'val', 'test')
)

print("\nEstadísticas de datos:")
print(f"  - Vocabulario source: {len(vocab_src)} tokens")
print(f"  - Vocabulario target: {len(vocab_tgt)} tokens")
for split_name, loader in (
    ("Train", train_loader),
    ("Val", val_loader),
    ("Test", test_loader),
):
    print(f"  - {split_name} batches: {len(loader)}")

In [None]:
# Pull a single batch from the training loader as a sanity check
sample_batch = next(iter(train_loader))
src_batch, tgt_batch = sample_batch

print(f"Forma del batch source: {src_batch.shape}  # (batch_size, seq_len)")
print(f"Forma del batch target: {tgt_batch.shape}")

# Take the first example of the batch on each side
src_sample, tgt_sample = src_batch[0], tgt_batch[0]

# Map the token ids back to tokens and join them into readable text
src_tokens = vocab_src.decode(src_sample.tolist())
src_text = ' '.join(src_tokens)
tgt_tokens = vocab_tgt.decode(tgt_sample.tolist())
tgt_text = ' '.join(tgt_tokens)

print("\nEjemplo de par de traducción:")
print(f"  Source (es): {src_text}")
print(f"  Target (en): {tgt_text}")

## 2. Configuración del Modelo

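Configuramos un Transformer de tamaño reducido (d_model=256, 4 capas de encoder y 4 de decoder), adecuado para un entrenamiento rápido con el subconjunto de datos preparado en la sección anterior.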

In [None]:
# Model hyperparameters.
# NOTE(review): pad_idx comes from the source vocabulary, while the loss
# criterion defined later in this notebook ignores vocab_tgt.pad_idx.
# Presumably both vocabularies share the same padding index - confirm.
model_config = TransformerConfig(
    vocab_size_src=len(vocab_src),
    vocab_size_tgt=len(vocab_tgt),
    d_model=256,
    n_heads=8,
    num_encoder_layers=4,
    num_decoder_layers=4,
    dim_feedforward=1024,
    dropout=0.1,
    max_seq_len=data_config['max_length'],
    pos_encoding_type='sinusoidal',  # may be 'sinusoidal', 'rope' or 'alibi'
    pad_idx=vocab_src.pad_idx
)

# Echo the main hyperparameters: (printed label, config attribute)
print("Configuración del modelo:")
for label, attr in (
    ('d_model', 'd_model'),
    ('n_heads', 'n_heads'),
    ('encoder_layers', 'num_encoder_layers'),
    ('decoder_layers', 'num_decoder_layers'),
    ('dim_feedforward', 'dim_feedforward'),
    ('dropout', 'dropout'),
    ('pos_encoding', 'pos_encoding_type'),
):
    print(f"  - {label}: {getattr(model_config, attr)}")

# Instantiate the model and move it to the selected device
model = Transformer(model_config).to(device)

# Parameter count and size estimate at 4 bytes per parameter (float32)
total_params = count_parameters(model)
size_mb = total_params * 4 / 1024**2
print(f"\nParámetros totales: {total_params:,}")
print(f"Tamaño estimado: {size_mb:.2f} MB (float32)")

In [None]:
# Print the header followed by the model's string representation
print("\nArquitectura del modelo:\n", model, sep="\n")

## 3. Configuración de Entrenamiento

Configuramos el optimizador, scheduler y criterio de pérdida.

In [None]:
# Training hyperparameters, read by the optimizer, scheduler, loss,
# trainer and training-loop cells of this notebook
train_config = {
    'epochs': 30,
    'lr': 1e-4,
    'betas': (0.9, 0.98),
    'eps': 1e-9,
    'weight_decay': 0.01,
    'warmup_steps': 1000,
    'min_lr': 1e-6,
    'label_smoothing': 0.1,
    'grad_clip': 1.0,
    'use_amp': True,  # mixed precision
    'patience': 5,  # early stopping
    'checkpoint_dir': '../checkpoints',
    'log_interval': 100
}

# Echo every setting, one per line, in declaration order
print("Configuración de entrenamiento:")
print("\n".join(
    f"  - {name}: {setting}" for name, setting in train_config.items()
))

# Make sure the checkpoint directory exists before training starts
Path(train_config['checkpoint_dir']).mkdir(parents=True, exist_ok=True)

In [None]:
# Optimizer.
# NOTE(review): torch.optim.Adam applies weight_decay as an L2 penalty added
# to the gradient; AdamW is the decoupled variant - confirm which is intended.
optimizer = Adam(
    model.parameters(),
    lr=train_config['lr'],
    betas=train_config['betas'],
    eps=train_config['eps'],
    weight_decay=train_config['weight_decay']
)

# Learning-rate scheduler (warmup + cosine decay) spanning every optimizer
# step of the run: batches per epoch times number of epochs
total_steps = len(train_loader) * train_config['epochs']
scheduler = WarmupCosineScheduler(
    optimizer,
    warmup_steps=train_config['warmup_steps'],
    total_steps=total_steps,
    min_lr=train_config['min_lr']
)

# Loss criterion with label smoothing; target padding positions are ignored
criterion = nn.CrossEntropyLoss(
    ignore_index=vocab_tgt.pad_idx,
    label_smoothing=train_config['label_smoothing']
)

# GradScaler for mixed precision. torch.cuda.amp.GradScaler only scales on
# CUDA; building it enabled on a machine without CUDA emits a warning and the
# scaler disables itself. Passing `enabled` explicitly gives the same
# effective behaviour without the warning and still hands the Trainer a
# scaler object whenever AMP was requested.
scaler = (
    GradScaler(enabled=torch.cuda.is_available())
    if train_config['use_amp']
    else None
)

print(f"\nTotal steps: {total_steps:,}")
print(f"Warmup steps: {train_config['warmup_steps']:,}")
print(f"Steps per epoch: {len(train_loader):,}")

### Visualización del Learning Rate Schedule

In [None]:
# Simulate the learning-rate schedule: linear warmup from 0 up to the base
# LR, then cosine decay down to min_lr.
# NOTE(review): this curve is re-derived from train_config and is assumed to
# mirror WarmupCosineScheduler - confirm against its implementation.
warmup_steps = train_config['warmup_steps']
base_lr = train_config['lr']
min_lr = train_config['min_lr']

lrs = []
for step in range(total_steps):
    if step < warmup_steps:
        lr = base_lr * (step / warmup_steps)
    else:
        # Fraction of the decay phase completed so far. This branch only runs
        # when total_steps > warmup_steps, so the divisor is never zero.
        progress = (step - warmup_steps) / (total_steps - warmup_steps)
        lr = min_lr + (base_lr - min_lr) * \
             0.5 * (1 + np.cos(np.pi * progress))
    lrs.append(lr)

plt.figure(figsize=(12, 5))
plt.plot(lrs)
plt.axvline(x=warmup_steps, color='red', linestyle='--',
            linewidth=2, label='Fin warmup')
plt.xlabel('Training Step')
plt.ylabel('Learning Rate')
plt.title('Learning Rate Schedule (Warmup + Cosine Decay)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.yscale('log')
plt.tight_layout()
plt.show()

if lrs:
    # Clamp the index: on short runs (total_steps <= warmup_steps) the warmup
    # never finishes, and lrs[warmup_steps] would raise IndexError.
    peak_idx = min(warmup_steps, len(lrs) - 1)
    print(f"LR inicial: {lrs[0]:.2e}")
    print(f"LR máximo (después de warmup): {lrs[peak_idx]:.2e}")
    print(f"LR final: {lrs[-1]:.2e}")
else:
    # total_steps == 0: there is nothing to index into
    print("No hay pasos de entrenamiento que simular.")

## 4. Loop de Entrenamiento

Implementamos el entrenamiento con las siguientes características:
- Mixed Precision (AMP) para velocidad
- Gradient clipping para estabilidad
- Early stopping
- Checkpointing del mejor modelo

In [None]:
# Hand the model, loss, optimizer, scheduler, device, AMP scaler and
# gradient-clipping threshold to the project Trainer
trainer = Trainer(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
    scaler=scaler,
    grad_clip=train_config['grad_clip'],
)

print("Trainer inicializado.")
for label, value in (
    ("Device", device),
    ("Mixed precision", train_config['use_amp']),
    ("Gradient clipping", train_config['grad_clip']),
):
    print(f"  - {label}: {value}")

In [None]:
# Full training run: per-epoch training and validation, metric history,
# checkpointing of the best model and patience-based early stopping.
history = {
    metric: []
    for metric in ('train_loss', 'val_loss', 'train_ppl', 'val_ppl', 'lr')
}

best_val_loss = float('inf')
patience_counter = 0

print("\nIniciando entrenamiento...\n")

for epoch in range(train_config['epochs']):
    epoch_start = time.time()

    # Training pass; perplexity is derived as exp(loss)
    train_loss = trainer.train_epoch(train_loader, epoch)
    train_ppl = np.exp(train_loss)

    # Validation pass
    val_loss = trainer.validate(val_loader)
    val_ppl = np.exp(val_loss)

    # Learning rate of the first parameter group after this epoch
    current_lr = optimizer.param_groups[0]['lr']

    # Record this epoch's metrics
    for metric, value in (
        ('train_loss', train_loss),
        ('val_loss', val_loss),
        ('train_ppl', train_ppl),
        ('val_ppl', val_ppl),
        ('lr', current_lr),
    ):
        history[metric].append(value)

    epoch_time = time.time() - epoch_start

    # One-line epoch summary
    print(" - ".join([
        f"Epoch {epoch+1}/{train_config['epochs']} ({epoch_time:.1f}s)",
        f"Train Loss: {train_loss:.4f} (PPL: {train_ppl:.2f})",
        f"Val Loss: {val_loss:.4f} (PPL: {val_ppl:.2f})",
        f"LR: {current_lr:.2e}",
    ]))

    if not (val_loss < best_val_loss):
        # No improvement this epoch: one step closer to early stopping
        patience_counter += 1
        print(f"  -> No mejora ({patience_counter}/{train_config['patience']})")
    else:
        # New best validation loss: reset patience and checkpoint the model
        best_val_loss = val_loss
        patience_counter = 0

        checkpoint_path = Path(train_config['checkpoint_dir']) / 'best_model.pt'
        save_checkpoint(
            model=model,
            optimizer=optimizer,
            epoch=epoch,
            step=trainer.global_step,
            loss=val_loss,
            config=model_config,
            path=checkpoint_path,
        )
        print(f"  -> Mejor modelo guardado (val_loss: {val_loss:.4f})")

    # Stop once validation has failed to improve `patience` epochs in a row
    if patience_counter >= train_config['patience']:
        print(f"\nEarly stopping después de {epoch+1} epochs.")
        break

    print()

print("\nEntrenamiento completado!")
print(f"Mejor val_loss: {best_val_loss:.4f} (PPL: {np.exp(best_val_loss):.2f})")

## 5. Visualización de Resultados

In [None]:
# Training curves: loss, perplexity and learning rate per epoch
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Loss and perplexity panels share the same layout (train vs. validation)
for ax, metric, ylabel, title in (
    (axes[0], 'loss', 'Loss', 'Training and Validation Loss'),
    (axes[1], 'ppl', 'Perplexity', 'Training and Validation Perplexity'),
):
    ax.plot(history[f'train_{metric}'], label='Train', marker='o', markersize=4)
    ax.plot(history[f'val_{metric}'], label='Validation', marker='s', markersize=4)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

# Learning rate recorded at the end of each epoch, on a log axis
axes[2].plot(history['lr'], marker='o', markersize=4, color='green')
axes[2].set_xlabel('Epoch')
axes[2].set_ylabel('Learning Rate')
axes[2].set_title('Learning Rate Schedule')
axes[2].grid(True, alpha=0.3)
axes[2].set_yscale('log')

plt.tight_layout()

# savefig does not create missing directories, and nothing else in this
# notebook creates ../outputs, so ensure it exists before writing the figure.
Path('../outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('../outputs/training_curves.png', dpi=150, bbox_inches='tight')
plt.show()

print("Gráficas guardadas en ../outputs/training_curves.png")

In [None]:
# Final summary of the run, read from the recorded history
rule = "=" * 80
train_losses, val_losses = history['train_loss'], history['val_loss']
train_ppls, val_ppls = history['train_ppl'], history['val_ppl']
lr_trace = history['lr']

print("\n" + rule)
print("ESTADÍSTICAS FINALES")
print(rule)
print(f"\nÉpocas entrenadas: {len(train_losses)}")

# Loss: best and last value for each split
print("\nPérdida:")
print(f"  - Mejor train loss: {min(train_losses):.4f}")
print(f"  - Mejor val loss: {min(val_losses):.4f}")
print(f"  - Pérdida final train: {train_losses[-1]:.4f}")
print(f"  - Pérdida final val: {val_losses[-1]:.4f}")

# Perplexity: best and last value for each split
print("\nPerplejidad:")
print(f"  - Mejor train PPL: {min(train_ppls):.2f}")
print(f"  - Mejor val PPL: {min(val_ppls):.2f}")
print(f"  - PPL final train: {train_ppls[-1]:.2f}")
print(f"  - PPL final val: {val_ppls[-1]:.2f}")

# Learning rate at the first and last recorded epoch
print("\nLearning rate:")
print(f"  - LR inicial: {lr_trace[0]:.2e}")
print(f"  - LR final: {lr_trace[-1]:.2e}")

# Where the best checkpoint lives and the validation score it achieved
print("\nCheckpoint:")
print(f"  - Mejor modelo: {train_config['checkpoint_dir']}/best_model.pt")
print(f"  - Val loss: {best_val_loss:.4f}")
print(f"  - Val PPL: {np.exp(best_val_loss):.2f}")

## 6. Guardar Vocabularios y Configuración

In [None]:
import json

# Neither output directory is created elsewhere in this notebook, so make
# sure both exist before anything is written into them.
Path('../data/processed').mkdir(parents=True, exist_ok=True)
Path('../outputs').mkdir(parents=True, exist_ok=True)

# Persist both vocabularies
vocab_src.save('../data/processed/vocab_src.pkl')
vocab_tgt.save('../data/processed/vocab_tgt.pkl')

print("Vocabularios guardados:")
print(f"  - Source: ../data/processed/vocab_src.pkl")
print(f"  - Target: ../data/processed/vocab_tgt.pkl")

# Snapshot of the whole run configuration plus headline results.
# 'data' and 'training' are copies, so building this report never mutates
# data_config or train_config.
full_config = {
    'model': {
        'vocab_size_src': len(vocab_src),
        'vocab_size_tgt': len(vocab_tgt),
        'd_model': model_config.d_model,
        'n_heads': model_config.n_heads,
        'num_encoder_layers': model_config.num_encoder_layers,
        'num_decoder_layers': model_config.num_decoder_layers,
        'dim_feedforward': model_config.dim_feedforward,
        'dropout': model_config.dropout,
        'max_seq_len': model_config.max_seq_len,
        'pos_encoding_type': model_config.pos_encoding_type
    },
    'data': dict(data_config),
    'training': {
        **train_config,
        # Stringify in case the directory was given as a Path object
        'checkpoint_dir': str(train_config['checkpoint_dir']),
    },
    'results': {
        # Cast to builtin numbers: json cannot serialise numpy scalars other
        # than float64, nor tensor scalars, should the trainer or
        # count_parameters return one.
        'best_val_loss': float(best_val_loss),
        'best_val_ppl': float(np.exp(best_val_loss)),
        'total_epochs': len(history['train_loss']),
        'total_params': int(total_params)
    }
}

with open('../outputs/training_config.json', 'w', encoding='utf-8') as f:
    json.dump(full_config, f, indent=2)

print("\nConfiguración guardada en ../outputs/training_config.json")

## Resumen

En este notebook hemos:

1. Preparado el dataset Tatoeba español-inglés
2. Configurado el modelo Transformer con 4 capas encoder/decoder
3. Implementado entrenamiento con:
   - Mixed Precision (AMP) para eficiencia
   - Warmup + Cosine LR schedule
   - Label smoothing
   - Gradient clipping
   - Early stopping
4. Visualizado curvas de entrenamiento
5. Guardado el mejor modelo, los vocabularios y la configuración

El modelo está listo para evaluación y generación de traducciones en los siguientes notebooks.