# SPR 2026 - Custom Transformer Encoder

**Transformer customizado treinado do zero**

- ✅ 4 encoder layers, 8 heads
- ✅ Positional encoding + Self-attention extra
- ✅ Tempo esperado: ~10-15 min

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Settings → Accelerator → **GPU T4 x2**
3. Add Data → Models → `neuralmind/bert-base-portuguese-cased` (apenas tokenizer)
4. **IMPORTANTE:** Execute "Run All" após commit
---

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import math
import warnings
warnings.filterwarnings('ignore')

# Training hyperparameters
SEED = 42
MAX_LEN = 256        # token length every report is padded/truncated to
BATCH_SIZE = 32
EPOCHS = 10
LR = 1e-4            # also used as the peak learning rate of the scheduler
NUM_CLASSES = 7

# Encoder architecture
D_MODEL = 256        # embedding / hidden width
N_HEADS = 8          # attention heads per layer
N_ENCODER_LAYERS = 4
D_FF = 512           # feed-forward width inside each encoder layer
DROPOUT = 0.1

# Kaggle input locations (competition data and the tokenizer files)
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
TOKENIZER_PATH = '/kaggle/input/bert-base-portuguese-cased'

# Use the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

# Seed the PyTorch and NumPy RNGs
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# Load the competition tables from DATA_DIR
train_csv = f'{DATA_DIR}/train.csv'
test_csv = f'{DATA_DIR}/test.csv'

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Quick sanity check on the table sizes
print(f'Train: {train_df.shape}')
print(f'Test: {test_df.shape}')

In [None]:
# Train/validation split: 15% held out, stratified on the target column
# so both parts keep the same class proportions.
report_array = train_df['report'].values
target_array = train_df['target'].values

train_texts, val_texts, train_labels, val_labels = train_test_split(
    report_array,
    target_array,
    test_size=0.15,
    random_state=SEED,
    stratify=train_df['target'],
)

print(f'Train: {len(train_texts)}, Val: {len(val_texts)}')

In [None]:
# Tokenizer: only the vocabulary/tokenization rules are loaded from
# TOKENIZER_PATH; the model itself is defined and trained in this notebook.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Size the embedding table with len(tokenizer) rather than
# tokenizer.vocab_size: vocab_size reports only the base vocabulary, while
# len() also counts added tokens. Any id the tokenizer can emit must be a
# valid embedding index, otherwise the lookup fails with an index error.
VOCAB_SIZE = len(tokenizer)
PAD_IDX = tokenizer.pad_token_id

print(f'Vocab size: {VOCAB_SIZE}')

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes one report per item, on access.

    Args:
        texts: indexable collection of report strings.
        labels: optional indexable collection of integer class ids, aligned
            with ``texts``. When omitted, items carry no ``'labels'`` entry.

    Each item is a dict with ``'input_ids'`` and ``'attention_mask'``
    (padded/truncated to ``MAX_LEN`` by the module-level ``tokenizer``) and,
    when labels were given, a ``'labels'`` long tensor.
    """

    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=MAX_LEN,
            return_tensors='pt',
        )

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # drop it so the DataLoader can stack items into a batch itself.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }

        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# One dataset per split; the test split has no labels.
train_dataset = TextDataset(texts=train_texts, labels=train_labels)
val_dataset = TextDataset(texts=val_texts, labels=val_labels)
test_dataset = TextDataset(texts=test_df['report'].values)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f'Batches - Train: {len(train_loader)}, Val: {len(val_loader)}, Test: {len(test_loader)}')

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence, then apply dropout.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``, with frequencies decaying geometrically from 1 to
    roughly 1/10000 across the feature axis.

    Args:
        d_model: feature width of the inputs. Odd values are supported; the
            last sine column then has no cosine partner.
        max_len: number of positions precomputed. Inputs passed to
            ``forward`` must not be longer than this.
        dropout: dropout probability applied after the encodings are added.
    """

    def __init__(self, d_model, max_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so trim the angles to fit; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Stored as a buffer with a leading axis of size 1 so it broadcasts
        # over the batch, follows .to(device), and is not a trained parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe)``; ``x`` is indexed as (batch, seq, d_model)."""
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


class ExtraSelfAttention(nn.Module):
    """One multi-head self-attention step with a residual link and LayerNorm.

    Args:
        d_model: feature width of the inputs.
        n_heads: number of attention heads.
        dropout: dropout probability, used both inside the attention module
            and on its output before the residual addition.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(
            d_model, n_heads, dropout=dropout, batch_first=True
        )
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Attend ``x`` to itself.

        Args:
            x: batch-first input sequence.
            mask: optional key padding mask, forwarded as
                ``key_padding_mask``.

        Returns:
            Tuple ``(normalized_output, attention_weights)``.
        """
        attended, attn_weights = self.self_attn(x, x, x, key_padding_mask=mask)
        residual = x + self.dropout(attended)
        return self.norm(residual), attn_weights


class TransformerEncoderClassifier(nn.Module):
    """Transformer-encoder text classifier trained from scratch.

    Pipeline: token embedding (scaled by ``sqrt(d_model)``) -> positional
    encoding -> stack of ``nn.TransformerEncoderLayer`` -> one extra
    self-attention block -> masked mean pooling over the sequence -> MLP head.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding / hidden width.
        n_heads: attention heads, for the encoder layers and the extra block.
        n_encoder_layers: number of stacked encoder layers.
        d_ff: feed-forward width inside each encoder layer.
        num_classes: number of output logits.
        max_len: number of positions precomputed by the positional encoding.
        dropout: dropout probability used throughout.
        pad_idx: token id of the padding token.
    """

    def __init__(self, vocab_size, d_model=256, n_heads=8, n_encoder_layers=4,
                 d_ff=512, num_classes=7, max_len=256, dropout=0.1, pad_idx=0):
        super().__init__()

        self.d_model = d_model
        self.pad_idx = pad_idx

        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        self.pos_encoding = PositionalEncoding(d_model, max_len, dropout)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation='gelu',
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_encoder_layers)

        self.extra_attention = ExtraSelfAttention(d_model, n_heads, dropout)

        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes)
        )

        self._init_weights()

    def _init_weights(self):
        """Xavier-initialize every matrix-shaped parameter.

        The loop also reaches the embedding matrix and would overwrite the
        all-zero row that ``nn.Embedding`` sets up for ``padding_idx``. That
        row receives no gradient, so it would stay at its random values for
        the whole run; it is therefore zeroed again after the init.
        """
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        pad = self.embedding.padding_idx
        if pad is not None:
            with torch.no_grad():
                self.embedding.weight[pad].zero_()

    def forward(self, input_ids, attention_mask=None):
        """Compute class logits for a batch of token id sequences.

        Args:
            input_ids: integer token ids, indexed as (batch, seq).
            attention_mask: optional mask with the same layout, non-zero for
                real tokens and 0 for padding. When given, padding is ignored
                as attention keys and excluded from the pooled mean.

        Returns:
            Logits with ``num_classes`` entries per batch item.
        """
        x = self.embedding(input_ids) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)

        # PyTorch key padding masks use True for positions to IGNORE, the
        # opposite of the tokenizer's attention_mask convention.
        padding_mask = (attention_mask == 0) if attention_mask is not None else None

        x = self.transformer_encoder(x, src_key_padding_mask=padding_mask)
        x, _ = self.extra_attention(x, mask=padding_mask)

        if attention_mask is not None:
            # Mean over real tokens only; the clamp guards against division
            # by zero for a row that is entirely padding.
            mask_expanded = attention_mask.unsqueeze(-1).float()
            x = (x * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1).clamp(min=1e-9)
        else:
            x = x.mean(dim=1)

        return self.classifier(x)

# Collect the constructor arguments from the notebook-level settings, then
# build the classifier and move it to the selected device.
model_config = dict(
    vocab_size=VOCAB_SIZE,
    d_model=D_MODEL,
    n_heads=N_HEADS,
    n_encoder_layers=N_ENCODER_LAYERS,
    d_ff=D_FF,
    num_classes=NUM_CLASSES,
    max_len=MAX_LEN,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)
model = TransformerEncoderClassifier(**model_config)
model = model.to(device)

# Report the parameter count (every parameter, trainable or not).
total_params = sum(param.numel() for param in model.parameters())
print(f'Total params: {total_params:,}')

In [None]:
# Inverse-frequency class weights to counter class imbalance.
# Counts are reindexed onto 0..NUM_CLASSES-1 so the weight vector always has
# NUM_CLASSES entries in class-id order, even if some class never occurs in
# train.csv (plain value_counts() would yield a shorter, misaligned vector).
class_counts = (
    train_df['target']
    .value_counts()
    .reindex(range(NUM_CLASSES), fill_value=0)
    .to_numpy(dtype=float)
)

# Classes with no samples get weight 0 instead of a division by zero.
inv_freq = np.zeros(NUM_CLASSES, dtype=float)
seen = class_counts > 0
inv_freq[seen] = 1.0 / class_counts[seen]

# Rescale so the weights sum to NUM_CLASSES.
class_weights = inv_freq / inv_freq.sum() * NUM_CLASSES
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)
# One-cycle schedule stepped once per batch: it needs the total number of
# optimizer steps, i.e. batches per epoch times epochs.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=LR, steps_per_epoch=len(train_loader), epochs=EPOCHS
)

In [None]:
def train_epoch(model, loader, criterion, optimizer, scheduler):
    """Run one training pass over ``loader``.

    For every batch: forward pass, loss, backward pass, gradient-norm
    clipping at 1.0, optimizer step, scheduler step.

    Returns:
        Tuple ``(mean_batch_loss, macro_f1)`` computed over the whole pass.
    """
    model.train()
    running_loss = 0
    epoch_preds, epoch_targets = [], []

    for batch in tqdm(loader, desc='Training'):
        # Move the three tensors of the batch to the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the schedule advances per batch, not per epoch

        running_loss += loss.item()
        epoch_preds.extend(logits.argmax(dim=-1).cpu().numpy())
        epoch_targets.extend(labels.cpu().numpy())

    mean_loss = running_loss / len(loader)
    macro_f1 = f1_score(epoch_targets, epoch_preds, average='macro')
    return mean_loss, macro_f1


@torch.no_grad()
def evaluate(model, loader):
    """Return the macro-F1 of ``model`` on a labelled ``loader``.

    The model is switched to eval mode and the whole pass runs without
    gradient tracking.
    """
    model.eval()
    all_preds, all_targets = [], []

    for batch in tqdm(loader, desc='Evaluating'):
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        logits = model(input_ids, attention_mask)
        all_preds.extend(logits.argmax(dim=-1).cpu().numpy())
        all_targets.extend(labels.cpu().numpy())

    return f1_score(all_targets, all_preds, average='macro')

In [None]:
# Best validation macro-F1 seen so far; a checkpoint is written whenever
# an epoch beats it.
best_f1 = 0

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch+1}/{EPOCHS}')

    train_loss, train_f1 = train_epoch(model, train_loader, criterion, optimizer, scheduler)
    val_f1 = evaluate(model, val_loader)

    print(f'Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f} | Val F1: {val_f1:.4f}')

    # No improvement on validation: keep the previous checkpoint.
    if not val_f1 > best_f1:
        continue

    best_f1 = val_f1
    torch.save(model.state_dict(), 'best_model.pt')
    print(f'Novo melhor modelo! F1: {best_f1:.4f}')

print(f'\nMelhor F1: {best_f1:.4f}')

In [None]:
# Load the best checkpoint and generate predictions for the test set.
# map_location=device lets the checkpoint load even when it was saved from a
# CUDA model and this cell runs on a machine without a GPU.
model.load_state_dict(torch.load('best_model.pt', map_location=device))
model.eval()

predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Predicting'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        logits = model(input_ids, attention_mask)
        # Predicted class = index of the largest logit.
        preds = logits.argmax(dim=-1).cpu().numpy()
        predictions.extend(preds)

print(f'Total predições: {len(predictions)}')

In [None]:
# Build the submission table: one predicted class per test ID, in test.csv order.
submission_columns = {
    'ID': test_df['ID'],
    'target': predictions,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv('submission.csv', index=False)
print('submission.csv criado!')

# Show how many test reports were assigned to each class.
target_distribution = submission['target'].value_counts().sort_index()
print(target_distribution)