# 04 - Transformers (BERT-based Models)

**Notebook de referência para modelos Transformer encoder-only**

Este notebook demonstra o pipeline completo para fine-tuning de modelos BERT-like:
- BERTimbau (PT-BR)
- mDeBERTa-v3 (multilingual)
- DistilBERT (compacto)
- XLM-RoBERTa (multilingual)
- ModernBERT (Flash Attention + RoPE)
- BioBERTpt (domínio médico)

---

## Configuração Kaggle

| Modelo | Kaggle Input | Tamanho |
|--------|--------------|----------|
| BERTimbau base | `neuralmind/bert-base-portuguese-cased` | ~440MB |
| BERTimbau large | `neuralmind/bert-large-portuguese-cased` | ~1.3GB |
| mDeBERTa-v3 | `microsoft/mdeberta-v3-base` | ~560MB |
| DistilBERT | `distilbert-base-multilingual-cased` | ~280MB |
| XLM-RoBERTa | `xlm-roberta-large` | ~1.4GB |
| ModernBERT | `answerdotai/ModernBERT-base` | ~450MB |
| BioBERTpt | `pucpr/biobertpt-all` | ~440MB |

**Settings:**
- Internet → **OFF**
- Accelerator → **GPU T4 x2**
- Add Data → Models → escolher modelo acima

In [None]:
# =============================================================================
# SETUP E IMPORTS
# =============================================================================
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# SETTINGS
# =============================================================================
# Hyperparameters and sizes shared by the cells below.
SEED = 42
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 5
LR = 2e-5
NUM_CLASSES = 7

# Model selection: MODEL_NAME must be one of the keys of MODEL_CONFIGS.
MODEL_NAME = 'bertimbau-base'  # <-- CHANGE HERE

# Short model key -> local directory of the model files plus a description
# that is printed when the directory is found.
MODEL_CONFIGS = {
    'bertimbau-base': dict(
        path='/kaggle/input/bert-base-portuguese-cased',
        description='BERTimbau base - melhor modelo PT-BR',
    ),
    'bertimbau-large': dict(
        path='/kaggle/input/bert-large-portuguese-cased',
        description='BERTimbau large - mais parâmetros',
    ),
    'mdeberta': dict(
        path='/kaggle/input/mdeberta-v3-base',
        description='mDeBERTa-v3 - disentangled attention',
    ),
    'distilbert': dict(
        path='/kaggle/input/distilbert-base-multilingual-cased',
        description='DistilBERT - 40% menor, 60% mais rápido',
    ),
    'xlm-roberta': dict(
        path='/kaggle/input/xlm-roberta-large',
        description='XLM-RoBERTa large - multilingual robusto',
    ),
    'modernbert': dict(
        path='/kaggle/input/modernbert-base',
        description='ModernBERT - Flash Attention + RoPE',
    ),
    'biobertpt': dict(
        path='/kaggle/input/biobertpt-all',
        description='BioBERTpt - domínio biomédico PT-BR',
    ),
}

config = MODEL_CONFIGS[MODEL_NAME]
MODEL_PATH = config['path']

# Report whether the selected model directory exists. A missing directory only
# produces a warning here; execution continues either way.
if os.path.exists(MODEL_PATH):
    print(f"✓ Modelo: {MODEL_NAME}")
    print(f"✓ Path: {MODEL_PATH}")
    print(f"✓ {config['description']}")
else:
    print(f"⚠️ Modelo '{MODEL_NAME}' não encontrado em {MODEL_PATH}")
    print("Adicione o modelo como Input no Kaggle!")

# Directory holding the competition CSV files.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'\nDevice: {device}')

# Seed the NumPy and PyTorch random number generators.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# =============================================================================
# LOAD DATA
# =============================================================================
# Read train.csv and test.csv from DATA_DIR into DataFrames.
train_df, test_df = (
    pd.read_csv(f'{DATA_DIR}/{split_name}.csv')
    for split_name in ('train', 'test')
)

print(f'Train: {train_df.shape}')
print(f'Test: {test_df.shape}')

# Number of training rows per value of the `target` column, ordered by value.
class_counts = train_df['target'].value_counts().sort_index()
print('\nDistribuição das classes:')
print(class_counts)

In [None]:
# =============================================================================
# TRAIN/VALIDATION SPLIT
# =============================================================================
# Hold out 10% of the training rows for validation, stratifying on the
# `target` column and fixing the random state for reproducibility.
all_reports = train_df['report'].tolist()
all_targets = train_df['target'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_reports,
    all_targets,
    test_size=0.1,
    stratify=train_df['target'],
    random_state=SEED,
)

print(f'Train: {len(train_texts)}')
print(f'Val: {len(val_texts)}')

In [None]:
# =============================================================================
# DATASET CLASS
# =============================================================================
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable sequence of texts; each element is converted with
            ``str()`` before tokenization.
        labels: Optional sequence of integer class labels aligned with
            ``texts``. When ``None``, items carry no ``'labels'`` key.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')``. It
            must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors whose leading dimension is the
            batch dimension (size 1 for a single text).
        max_length: Length every encoded sequence is truncated/padded to.

    Raises:
        ValueError: If ``labels`` is given and its length differs from the
            length of ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=256):
        # Fail early instead of hitting an IndexError (or silently ignoring
        # extra labels) in the middle of training.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )
        # Drop only the leading batch dimension. A bare ``squeeze()`` would
        # also remove the sequence dimension when it has size 1, yielding a
        # 0-d tensor that cannot be batched as a sequence.
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load the tokenizer from the selected model directory
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
print(f'Tokenizer carregado: {tokenizer.__class__.__name__}')

# Build the datasets; the test set has no labels
train_ds = TextDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
val_ds = TextDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
test_ds = TextDataset(
    texts=test_df['report'].tolist(),
    labels=None,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

print(f'Train dataset: {len(train_ds)}')
print(f'Val dataset: {len(val_ds)}')
print(f'Test dataset: {len(test_ds)}')

## Opção 1: Trainer API (Recomendado)

Usar HuggingFace Trainer para treinamento simplificado.

In [None]:
# =============================================================================
# LOAD MODEL
# =============================================================================
# Load the checkpoint with a freshly initialised classification head of
# NUM_CLASSES outputs.
#
# `trust_remote_code` is deliberately NOT enabled: it allows Python code found
# in the checkpoint directory to be executed, and loading through the Auto
# class should not require it for the architectures listed in MODEL_CONFIGS.
# NOTE(review): ModernBERT needs a transformers release that ships the
# architecture natively - confirm the installed version if that model is used.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=NUM_CLASSES,
)

print(f'Modelo carregado: {model.__class__.__name__}')
print(f'Parâmetros: {sum(p.numel() for p in model.parameters()):,}')

In [None]:
# =============================================================================
# MÉTRICAS
# =============================================================================
def compute_metrics(eval_pred):
    """Return the macro-averaged F1 score of an evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, reduced
            with an argmax over axis 1) and ``label_ids`` (reference labels).

    Returns:
        A dict with the single key ``'f1_macro'``.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    macro_f1 = f1_score(eval_pred.label_ids, predicted_classes, average='macro')
    return dict(f1_macro=macro_f1)

In [None]:
# =============================================================================
# TRAINING ARGUMENTS
# =============================================================================
# The per-epoch evaluation setting was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old keyword is
# rejected by recent versions. Pick whichever name the installed
# TrainingArguments dataclass declares so this cell runs on both.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='/kaggle/working/model',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    learning_rate=LR,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # Mixed precision only when a GPU is available.
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical because the best checkpoint is reloaded at the end.
    **{_eval_strategy_key: 'epoch'},
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    report_to='none',
    seed=SEED,
    logging_steps=50,
)

In [None]:
# =============================================================================
# TRAINER
# =============================================================================
# Stop training once the monitored metric has not improved for 2 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer = Trainer(**trainer_kwargs)

# Train
print(f'\nIniciando treinamento com {MODEL_NAME}...')
trainer.train()

In [None]:
# =============================================================================
# EVALUATION
# =============================================================================
# Run the trainer's evaluation and report the macro F1 it returns.
val_results = trainer.evaluate()
val_f1_macro = val_results['eval_f1_macro']

print("\nResultados Validação:")
print(f"F1-Macro: {val_f1_macro:.4f}")

In [None]:
# =============================================================================
# PREDICTION
# =============================================================================
# Score the test set and keep, for each row, the index of the highest score.
test_preds = trainer.predict(test_ds)
test_scores = test_preds.predictions
predictions = np.argmax(test_scores, axis=1)

print(f'Predições geradas: {len(predictions)}')

In [None]:
# =============================================================================
# SUBMISSION
# =============================================================================
# Pair every test ID with its predicted class and write the result as CSV
# (without the index column).
submission = test_df[['ID']].assign(target=predictions)
submission.to_csv('submission.csv', index=False)

print('✅ submission.csv criado!')

prediction_counts = submission['target'].value_counts().sort_index()
print('\nDistribuição das predições:')
print(prediction_counts)

---

## Opção 2: Training Loop Manual

Para mais controle sobre o treinamento (ex: Focal Loss, custom schedulers).

In [None]:
# =============================================================================
# MANUAL TRAINING LOOP (OPTIONAL)
# =============================================================================
# Remove the surrounding triple quotes to use this instead of the Trainer.
# The loop optimizes `criterion` directly (so replacing it with another loss
# takes effect), keeps the weights of the best validation epoch, and ends by
# producing `predictions` for the test set.

'''
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm

# DataLoaders
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

# Optimizer and scheduler (warmup over the first 10% of the steps)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

# Loss: any nn.Module called as criterion(logits, targets) can be used here
# (e.g. a focal loss for class imbalance).
criterion = nn.CrossEntropyLoss()

model.to(device)
# Start below any reachable F1 so the first epoch always saves a checkpoint.
best_f1 = -1.0

for epoch in range(EPOCHS):
    # Training
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Compute the loss from the logits so that `criterion` is the loss
        # that is actually optimized.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    # Validation (local names chosen so the global `val_labels` from the
    # train/validation split is not overwritten)
    model.eval()
    epoch_preds, epoch_targets = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=-1).cpu().numpy()

            epoch_preds.extend(preds)
            epoch_targets.extend(batch['labels'].numpy())

    val_f1 = f1_score(epoch_targets, epoch_preds, average='macro')
    print(f'Epoch {epoch+1} - Loss: {train_loss/len(train_loader):.4f} - Val F1: {val_f1:.4f}')

    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), 'best_model.pt')

print(f'\nMelhor F1: {best_f1:.4f}')

# Restore the best epoch's weights before predicting on the test set
model.load_state_dict(torch.load('best_model.pt', map_location=device))
model.eval()
test_pred_batches = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        test_pred_batches.append(outputs.logits.argmax(dim=-1).cpu().numpy())

predictions = np.concatenate(test_pred_batches)
'''

---

## Técnicas Avançadas

### Focal Loss (para desbalanceamento)

```python
class FocalLoss(nn.Module):
    """Focal loss for multi-class classification on raw logits.

    Each sample's cross-entropy is scaled by ``alpha * (1 - pt) ** gamma``,
    where ``pt = exp(-cross_entropy)``, and the scaled losses are averaged.

    Args:
        alpha: Scalar factor applied to every sample's loss.
        gamma: Exponent of the ``(1 - pt)`` modulating factor; ``0`` reduces
            the loss to ``alpha`` times the mean cross-entropy.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss of ``inputs`` (logits) vs ``targets``."""
        # `nn.functional` is used instead of the alias `F`, which is never
        # imported in this notebook.
        ce_loss = nn.functional.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()
```

### LoRA (fine-tuning eficiente)
Ver notebook `submit/transformers/submit_bertimbau_lora_offline.ipynb`

### Mean Pooling (vs CLS token)

```python
def mean_pooling(last_hidden_state, attention_mask):
    """Average ``last_hidden_state`` over dim 1, weighted by the mask.

    The mask gets a trailing dimension and is cast to float so it broadcasts
    against ``last_hidden_state``. The mask total is clamped to at least
    ``1e-9`` so an all-zero mask does not divide by zero.
    """
    weights = attention_mask.unsqueeze(-1).float()
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    weighted_sum = (last_hidden_state * weights).sum(dim=1)
    return weighted_sum / weight_total
```