# SPR 2026 - BERTimbau-Large + LoRA (Offline)

**Fine-tuning eficiente com LoRA implementado manualmente (SEM peft)**

- ✅ LoRA (r=16, alpha=32) implementado do zero
- ✅ **100% OFFLINE** - não precisa de pip install
- ✅ Menor uso de memória (~10% dos parâmetros treináveis)
- ✅ Tempo esperado: ~15-20 min

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF** ✅
2. Settings → Accelerator → **GPU T4 x2**
3. Add Data → Models → `neuralmind/bert-large-portuguese-cased`
4. **IMPORTANTE:** Execute "Run All" após commit
---

In [None]:
# ===== SPR 2026 - BERTIMBAU-LARGE + LORA (OFFLINE - NO PEFT) =====

# ==== SETUP AND IMPORTS ====
# Progress marker for step 1 of 8 (the printed text is kept in Portuguese on purpose).
print("[1/8] Configurando ambiente...")
import os
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Training hyperparameters
SEED = 42
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 5
LR = 3e-4
NUM_CLASSES = 7

# LoRA hyperparameters
LORA_R = 16          # rank of the low-rank update
LORA_ALPHA = 32      # scaling numerator
LORA_DROPOUT = 0.1

DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Candidate locations of the pretrained model; the first one that exists wins.
MODEL_PATHS = [
    '/kaggle/input/bert-large-portuguese-cased',
    '/kaggle/input/bertimbau-large-portuguese-cased',
    '/kaggle/input/neuralmind-bert-large-portuguese-cased',
]

MODEL_PATH = next(
    (candidate for candidate in MODEL_PATHS if os.path.exists(candidate)),
    None,
)

if MODEL_PATH is None:
    # Show what is actually mounted so the user can spot a wrong dataset name.
    print("\n⚠️ Modelo não encontrado. Datasets disponíveis:")
    for item in os.listdir('/kaggle/input'):
        print(f"  - {item}")
    raise FileNotFoundError("Adicione neuralmind/bert-large-portuguese-cased!")

# Seed the torch and NumPy RNGs for reproducibility.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Device: {device}')
print(f'Modelo: {MODEL_PATH}')

In [None]:
# ==== LORA IMPLEMENTATION (NO EXTERNAL LIBRARY) ====
# Progress marker for step 2 of 8 (the printed text is kept in Portuguese on purpose).
print("[2/8] Implementando LoRA...")

class LoRALayer(nn.Module):
    """Wrap a linear layer with a trainable low-rank (LoRA) update.

    LoRA: Low-Rank Adaptation of Large Language Models
    Paper: https://arxiv.org/abs/2106.09685

    The effective weight is W' = W + (alpha / r) * B @ A, with
    B of shape (out_features, r) and A of shape (r, in_features).
    The wrapped layer's own parameters are frozen; only A and B train.
    B starts at zero, so right after construction the wrapper reproduces
    the wrapped layer's output exactly.

    Args:
        original_layer: layer exposing ``in_features`` / ``out_features``
            (e.g. ``nn.Linear``). Its parameters are frozen in place.
        r: rank of the update; must be positive.
        alpha: scaling numerator; the update is multiplied by ``alpha / r``.
        dropout: dropout probability applied to the input of the LoRA path.

    Raises:
        ValueError: if ``r`` is not positive.
    """

    def __init__(self, original_layer, r=16, alpha=32, dropout=0.1):
        super().__init__()
        if r <= 0:
            # Fail early with a clear message instead of a ZeroDivisionError
            # (r == 0) or an obscure tensor-shape error (r < 0).
            raise ValueError(f"LoRA rank r must be positive, got {r}")

        self.original_layer = original_layer
        self.r = r
        self.alpha = alpha
        self.scaling = alpha / r

        in_features = original_layer.in_features
        out_features = original_layer.out_features

        # Create A and B with the wrapped layer's dtype and device, so the
        # adapter also works when the layer is not float32 on CPU (e.g. the
        # model was already cast or moved before LoRA was applied).
        weight = getattr(original_layer, 'weight', None)
        factory = {}
        if weight is not None and weight.is_floating_point():
            factory = {'dtype': weight.dtype, 'device': weight.device}

        # A projects down to the rank-r space, B projects back up.
        self.lora_A = nn.Parameter(torch.empty(r, in_features, **factory))
        self.lora_B = nn.Parameter(torch.zeros(out_features, r, **factory))

        # A gets Kaiming-uniform init; B stays zero so the initial update is zero.
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))

        self.dropout = nn.Dropout(dropout)

        # Freeze the wrapped layer's parameters.
        for param in self.original_layer.parameters():
            param.requires_grad = False

    def forward(self, x):
        """Return ``original_layer(x)`` plus the scaled low-rank update."""
        base_output = self.original_layer(x)

        # LoRA path: x -> dropout -> A -> B -> scale
        low_rank = F.linear(self.dropout(x), self.lora_A)   # x @ A.T
        update = F.linear(low_rank, self.lora_B)            # (x @ A.T) @ B.T

        return base_output + update * self.scaling


def apply_lora_to_model(model, r=16, alpha=32, dropout=0.1,
                        target_modules=('query', 'key', 'value'), freeze_base=True):
    """Replace the targeted ``nn.Linear`` layers of ``model`` with LoRA wrappers.

    A linear layer is targeted when any entry of ``target_modules`` is a
    substring of its dotted module name. The model is modified in place.

    Args:
        model: module whose sub-layers are wrapped.
        r: LoRA rank passed to each ``LoRALayer``.
        alpha: LoRA scaling numerator passed to each ``LoRALayer``.
        dropout: dropout probability passed to each ``LoRALayer``.
        target_modules: name fragments selecting layers to wrap; a single
            string is treated as one fragment.
        freeze_base: when True (default), every existing parameter of
            ``model`` is frozen before wrapping, so only the LoRA matrices
            (and anything added to the model afterwards) remain trainable.
            Pass False to freeze only the wrapped layers.

    Returns:
        List of the dotted names of the layers that were wrapped.
    """
    if isinstance(target_modules, str):
        # A bare string would otherwise be iterated character by character.
        target_modules = (target_modules,)

    if freeze_base:
        # LoRA trains only the low-rank matrices; without this, every
        # non-targeted layer would still be fully fine-tuned.
        for param in model.parameters():
            param.requires_grad = False

    # Snapshot the matching names first so the module tree is not mutated
    # while named_modules() is still iterating over it.
    candidate_names = [
        name
        for name, module in model.named_modules()
        if name
        and isinstance(module, nn.Linear)
        and any(target in name for target in target_modules)
    ]

    lora_layers = []
    for name in candidate_names:
        # Walk down to the parent module that owns the attribute.
        *parent_path, attr_name = name.split('.')
        parent = model
        for part in parent_path:
            parent = getattr(parent, part)

        if isinstance(parent, LoRALayer):
            # This linear is already the inner layer of a LoRA wrapper
            # (the function was applied before); do not wrap it again.
            continue

        wrapped = LoRALayer(getattr(parent, attr_name), r=r, alpha=alpha, dropout=dropout)
        setattr(parent, attr_name, wrapped)
        lora_layers.append(name)

    # Report parameter counts.
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # Guard against a parameter-free model to avoid dividing by zero.
    trainable_pct = 100 * trainable_params / total_params if total_params else 0.0

    print(f"LoRA aplicado a {len(lora_layers)} camadas")
    print(f"Parâmetros totais: {total_params:,}")
    print(f"Parâmetros treináveis: {trainable_params:,} ({trainable_pct:.2f}%)")

    return lora_layers

In [None]:
# ==== MODELO COM CLASSIFICADOR ====

class BertWithLoRA(nn.Module):
    """Pretrained encoder with LoRA on its query/key/value layers plus a linear head.

    Args:
        model_path: location passed to ``AutoModel.from_pretrained``.
        num_classes: number of output logits.
        r, alpha, dropout: LoRA settings forwarded to ``apply_lora_to_model``.
    """

    def __init__(self, model_path, num_classes, r=16, alpha=32, dropout=0.1):
        super().__init__()

        # Pretrained encoder.
        self.bert = AutoModel.from_pretrained(model_path)

        # Wrap the attention projections with LoRA adapters.
        lora_settings = {
            'r': r,
            'alpha': alpha,
            'dropout': dropout,
            'target_modules': ['query', 'key', 'value'],
        }
        apply_lora_to_model(self.bert, **lora_settings)

        # Classification head, created after LoRA is applied (trainable).
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the first-token (CLS) hidden state."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_state = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(cls_state))

In [None]:
# ==== LOAD DATA ====
print("[3/8] Carregando dados...")
train_df, test_df = (
    pd.read_csv(f'{DATA_DIR}/{split_name}.csv') for split_name in ('train', 'test')
)
print(f'Train: {train_df.shape}, Test: {test_df.shape}')

# Hold out 15% of the training rows for validation, stratified by target class.
all_reports = train_df['report'].values
all_targets = train_df['target'].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_reports,
    all_targets,
    stratify=train_df['target'],
    random_state=SEED,
    test_size=0.15,
)
print(f'Train: {len(train_texts)}, Val: {len(val_texts)}')

In [None]:
# ==== DATASET ====
# Progress marker for step 4 of 8 (the printed text is kept in Portuguese on purpose).
print("[4/8] Preparando datasets...")
# Module-level tokenizer loaded from MODEL_PATH; TextDataset.__getitem__ reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

class TextDataset(Dataset):
    """Dataset that tokenizes one text per item, optionally paired with a label.

    Tokenization happens lazily in ``__getitem__`` using the module-level
    ``tokenizer`` and ``MAX_LEN``. When ``labels`` is None (e.g. test data)
    the returned items carry no ``'labels'`` entry.
    """

    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Pad/truncate every sample to exactly MAX_LEN tokens.
        encoded = tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=MAX_LEN,
            return_tensors='pt',
        )
        # The tokenizer returns a leading batch axis of size 1; drop it.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Datasets: train/validation carry labels, test does not.
train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)
test_dataset = TextDataset(test_df['report'].values)

# Only the training loader shuffles; validation and test keep the original order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE)
    for eval_dataset in (val_dataset, test_dataset)
)

print(f'Batches - Train: {len(train_loader)}, Val: {len(val_loader)}, Test: {len(test_loader)}')

In [None]:
# ==== BUILD MODEL ====
print("[5/8] Criando modelo com LoRA...")
model = BertWithLoRA(
    MODEL_PATH, NUM_CLASSES, r=LORA_R, alpha=LORA_ALPHA, dropout=LORA_DROPOUT
).to(device)

# Loss and optimizer; only parameters that still require gradients are optimized.
criterion = nn.CrossEntropyLoss()
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_parameters, lr=LR, weight_decay=0.01)

# Linear schedule over all optimizer steps, with the first 10% used as warmup.
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
# ==== FUNÇÕES DE TREINO ====
def train_epoch(model, loader, criterion, optimizer, scheduler):
    """Run one training pass over ``loader``.

    Batches are moved to the module-level ``device``. Gradients are clipped
    to a norm of 1.0 and the scheduler is stepped after every batch.

    Returns:
        Tuple ``(mean loss per batch, macro F1 of the training predictions)``.
    """
    model.train()
    running_loss = 0
    predicted, expected = [], []

    for batch in tqdm(loader, desc='Training'):
        token_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        gold = batch['labels'].to(device)

        optimizer.zero_grad()
        scores = model(token_ids, mask)
        batch_loss = criterion(scores, gold)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()
        predicted += scores.argmax(dim=-1).tolist()
        expected += gold.tolist()

    mean_loss = running_loss / len(loader)
    macro_f1 = f1_score(expected, predicted, average='macro')
    return mean_loss, macro_f1


def evaluate(model, loader):
    """Return the macro F1 of ``model`` on the labelled batches of ``loader``.

    The model is put in eval mode and run without gradient tracking.
    Batches are moved to the module-level ``device``.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Evaluating'):
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            scores = model(token_ids, mask)
            predicted += scores.argmax(dim=-1).tolist()
            expected += gold.tolist()

    return f1_score(expected, predicted, average='macro')

In [None]:
# ==== TRAINING ====
print("[6/8] Treinando...")
best_f1 = 0

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch+1}/{EPOCHS}')
    train_loss, train_f1 = train_epoch(model, train_loader, criterion, optimizer, scheduler)
    val_f1 = evaluate(model, val_loader)

    print(f'Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f} | Val F1: {val_f1:.4f}')

    # Keep a checkpoint only when validation F1 strictly improves.
    if not val_f1 > best_f1:
        continue
    best_f1 = val_f1
    torch.save(model.state_dict(), 'best_lora_model.pt')
    print(f'Novo melhor modelo! F1: {best_f1:.4f}')

print(f'\nMelhor F1: {best_f1:.4f}')

In [None]:
# ==== LOAD BEST MODEL ====
print("[7/8] Carregando melhor modelo...")
model.load_state_dict(torch.load('best_lora_model.pt'))
model.eval()

# ==== PREDICTION ====
# Collect the arg-max class of every test sample, in loader order.
predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Predicting'):
        logits = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
        predictions.extend(logits.argmax(dim=-1).cpu().numpy())

In [None]:
# ==== SUBMISSION ====
print("[8/8] Criando submissão...")
# One row per test sample: its ID and the predicted class.
submission = pd.DataFrame({'ID': test_df['ID'], 'target': predictions})
submission.to_csv('submission.csv', index=False)

banner = "=" * 60
print(banner)
print("✅ CONCLUÍDO - submission.csv criado!")
print(banner)
print(f"Melhor F1 validação: {best_f1:.4f}")
print("\nDistribuição das predições:")
# Count of predictions per class, ordered by class id.
print(submission['target'].value_counts().sort_index())