# SPR 2026 - BERTimbau + LoRA (Offline)

**Fine-tuning eficiente com LoRA implementado manualmente (SEM peft)**

- LoRA (r=16, alpha=32) implementado do zero
- **100% OFFLINE** - nao precisa de pip install
- Menor uso de memoria: apenas uma pequena fracao dos parametros e treinada (o percentual exato e impresso ao carregar o modelo)
- Tempo esperado: ~15-20 min

---
**CONFIGURACAO KAGGLE:**

1. **Add Input** -> **Models** -> buscar `bertimbau-ptbr-complete` (fabianofilho)
   - Ou `bertimbau-large-ptbr` para versao Large
2. **Settings** -> Internet -> **OFF**
3. **Settings** -> Accelerator -> **GPU T4 x2**

---

In [None]:
# ===== SPR 2026 - BERTIMBAU + LORA (OFFLINE) =====

# ==== SETUP E IMPORTS ====
print("[1/8] Configurando ambiente...")
import os
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Config
SEED = 42            # seed used for torch, numpy and the train/validation split
MAX_LEN = 256        # max_length handed to the tokenizer (truncate / pad target)
BATCH_SIZE = 16      # batch size shared by the train, validation and test loaders
EPOCHS = 5           # number of passes over the training split
LR = 3e-4            # learning rate given to AdamW
NUM_CLASSES = 7      # output size of the classifier head

# LoRA Config
LORA_R = 16          # rank of the low-rank A/B matrices
LORA_ALPHA = 32      # numerator of the adapter scaling factor (alpha / r)
LORA_DROPOUT = 0.1   # dropout applied to the input of the LoRA branch

# Directory holding the competition's train.csv and test.csv
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# ==== AUTO-DETECTAR PATH DO MODELO ====
def find_model_path(base='/kaggle/input', max_depth=10):
    """Locate a model directory (one containing ``config.json``) under ``base``.

    The tree is walked depth-first and the first matching sub-directory is
    returned. Entries are visited in sorted order so the result is
    reproducible when more than one candidate exists (``os.listdir`` alone
    gives no ordering guarantee).

    Args:
        base: Root directory to search. ``base`` itself is never returned,
            only its descendants.
        max_depth: Deepest directory level (``base`` is level 0) whose
            entries are still examined.

    Returns:
        Path of the first matching directory, or ``None`` when nothing
        matches or ``base`` cannot be listed.
    """

    def has_config(path):
        return os.path.exists(os.path.join(path, 'config.json'))

    def search_dir(directory, depth=0):
        if depth > max_depth:
            return None
        try:
            entries = sorted(os.listdir(directory))
        except OSError:
            # Missing or unreadable directory: skip it and keep searching
            # elsewhere. Only filesystem errors are tolerated, so Ctrl-C and
            # programming errors still surface.
            return None
        for entry in entries:
            path = os.path.join(directory, entry)
            if not os.path.isdir(path):
                continue
            if has_config(path):
                return path
            found = search_dir(path, depth + 1)
            if found:
                return found
        return None

    return search_dir(base)

# Resolve the model directory once; everything below loads from MODEL_PATH.
MODEL_PATH = find_model_path()

if MODEL_PATH is None:
    print("\nModelo nao encontrado. Estrutura de /kaggle/input:")

    def show_tree(path, prefix="", depth=0):
        """Print at most 10 entries per directory, descending through levels 0-4."""
        if depth > 4:
            return
        try:
            items = os.listdir(path)[:10]
        except OSError:
            # Diagnostic output only: skip directories that cannot be listed
            # so the FileNotFoundError below is still raised with its hint.
            return
        for item in items:
            full = os.path.join(path, item)
            print(f"{prefix}{item}")
            if os.path.isdir(full):
                show_tree(full, prefix + "  ", depth + 1)

    show_tree('/kaggle/input')
    raise FileNotFoundError("Adicione o modelo BERTimbau ao notebook!")

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')
print(f'Model: {MODEL_PATH}')

# Seed the numpy and torch generators with the shared SEED.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# ==== LORA LAYER (MANUAL IMPLEMENTATION) ====
print("[2/8] Definindo LoRA layers...")

class LoRALayer(nn.Module):
    """Low-rank adapter branch computing ``B(A(dropout(x))) * (alpha / r)``.

    ``lora_A`` maps ``in_features`` down to ``r`` and ``lora_B`` maps ``r`` back
    up to ``out_features``; both are bias-free. ``lora_B`` is zero-initialised,
    so a freshly constructed layer outputs zeros.
    """

    def __init__(self, in_features, out_features, r=16, alpha=32, dropout=0.1):
        super().__init__()
        self.r, self.alpha = r, alpha
        self.scaling = alpha / r

        # Down-projection (A) followed by up-projection (B)
        self.lora_A = nn.Linear(in_features, r, bias=False)
        self.lora_B = nn.Linear(r, out_features, bias=False)
        self.dropout = nn.Dropout(p=dropout)

        # A gets a Kaiming-uniform init, B starts at zero
        nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B.weight)

    def forward(self, x):
        low_rank = self.lora_A(self.dropout(x))
        return self.lora_B(low_rank) * self.scaling

class LinearWithLoRA(nn.Module):
    """Wrap a linear layer so its output becomes ``linear(x) + lora(x)``.

    The wrapped layer's own parameters are frozen; the LoRA branch is sized
    from the wrapped layer's ``in_features`` / ``out_features``.
    """

    def __init__(self, linear, r=16, alpha=32, dropout=0.1):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            in_features=linear.in_features,
            out_features=linear.out_features,
            r=r,
            alpha=alpha,
            dropout=dropout,
        )
        # Freeze the original weights (and bias, if present)
        for frozen in self.linear.parameters():
            frozen.requires_grad = False

    def forward(self, x):
        base_out = self.linear(x)
        return base_out + self.lora(x)

# Echo the adapter hyperparameters taken from the module-level LoRA config.
print(f'LoRA config: r={LORA_R}, alpha={LORA_ALPHA}')

In [None]:
# ==== MODEL WITH LORA ====
print("[3/8] Carregando modelo com LoRA...")

class BertWithLoRA(nn.Module):
    """Pretrained encoder with LoRA adapters on query/value plus a linear head.

    Args:
        model_path: Local directory passed to ``AutoModel.from_pretrained``
            (loaded with ``local_files_only=True``).
        num_labels: Output size of the classification head.
        r: LoRA rank.
        alpha: LoRA scaling numerator (effective scale is ``alpha / r``).
        dropout: Dropout used inside each LoRA branch.

    After construction the only trainable parameters are the LoRA matrices
    and the classifier head.
    """
    def __init__(self, model_path, num_labels, r=16, alpha=32, dropout=0.1):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_path, local_files_only=True)
        hidden_size = self.bert.config.hidden_size

        # Freeze the whole pretrained backbone before adding adapters.
        # LinearWithLoRA only freezes the two projections it wraps, so without
        # this every other backbone parameter would stay trainable and the run
        # would be a near-full fine-tune at the (much higher) LoRA learning rate.
        for param in self.bert.parameters():
            param.requires_grad = False

        # Attach LoRA to the attention layers
        for layer in self.bert.encoder.layer:
            # Query and Value (the paper's default targets)
            layer.attention.self.query = LinearWithLoRA(
                layer.attention.self.query, r, alpha, dropout
            )
            layer.attention.self.value = LinearWithLoRA(
                layer.attention.self.value, r, alpha, dropout
            )

        # Classifier head (created after the freeze, so it stays trainable)
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        """Return the classifier output computed from the first token's hidden state."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]  # CLS token
        return self.classifier(pooled)

model = BertWithLoRA(
    model_path=MODEL_PATH,
    num_labels=NUM_CLASSES,
    r=LORA_R,
    alpha=LORA_ALPHA,
    dropout=LORA_DROPOUT,
)
model.to(device)

# Parameter census: all parameters vs. those that will receive gradients
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(
    p.numel() for p in filter(lambda t: t.requires_grad, model.parameters())
)
print(f'Total params: {total_params:,}')
print(f'Trainable params: {trainable_params:,} ({100*trainable_params/total_params:.2f}%)')

In [None]:
# ==== LOAD DATA ====
print("[4/8] Carregando dados...")
# Both CSV files are read from DATA_DIR; later cells use the 'text' and 'label'
# columns of train_df and the 'id' and 'text' columns of test_df.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
print(f'Train: {train_df.shape}, Test: {test_df.shape}')

In [None]:
# ==== DATASET CLASS ====
print("[5/8] Preparando dataset...")

class TextDataset(Dataset):
    """Tokenize one text per access and return fixed-length tensors.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Optional indexable collection of integer labels. When
            ``None`` the returned items carry no ``'labels'`` entry.
        tokenizer: Callable invoked with the text and the keyword arguments
            ``truncation``, ``max_length``, ``padding`` and ``return_tensors``.
        max_len: Length every sequence is truncated / padded to.
    """
    def __init__(self, texts, labels=None, tokenizer=None, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
        )
        # Drop only the leading single-text batch axis. A bare squeeze() would
        # also remove the sequence axis when max_len == 1, producing 0-d
        # tensors that the DataLoader would stack into the wrong shape.
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
print('Tokenizer carregado!')

# Stratified 90/10 hold-out split of the training data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['text'].values,
    train_df['label'].values,
    test_size=0.1,
    random_state=SEED,
    stratify=train_df['label'],
)

# The test set has no labels, so its items carry only the encoded inputs
train_ds = TextDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=MAX_LEN)
val_ds = TextDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=MAX_LEN)
test_ds = TextDataset(texts=test_df['text'].values, labels=None, tokenizer=tokenizer, max_len=MAX_LEN)

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(ds, batch_size=BATCH_SIZE) for ds in (val_ds, test_ds)
)

print(f'Train: {len(train_ds)}, Val: {len(val_ds)}, Test: {len(test_ds)}')

In [None]:
# ==== TRAINING ====
print("[6/8] Treinando modelo...")

criterion = nn.CrossEntropyLoss()

# Hand the optimizer only the parameters that require gradients
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=LR,
)

# Linear schedule over every optimizer step, warming up for the first 10%
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)

def evaluate(model, loader):
    """Run ``model`` over ``loader`` in eval mode with gradients disabled.

    Returns the macro-averaged F1 score when the batches contain a
    ``'labels'`` entry; otherwise returns the list of predicted class indices.
    Inputs are moved to the module-level ``device``.
    """
    model.eval()
    predicted, targets = [], []
    with torch.no_grad():
        for batch in loader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            if 'labels' in batch:
                targets.extend(batch['labels'].numpy())
    if not targets:
        return predicted
    return f1_score(targets, predicted, average='macro')

# Start below any attainable F1 so the first epoch always writes a checkpoint.
# With an initial value of 0 and a strict '>' comparison, a run whose
# validation F1 never rose above 0 would save nothing, and the prediction step
# would then fail or silently load a stale file from an earlier run.
best_f1 = float('-inf')
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}')

    for batch in pbar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The scheduler is stepped per batch, matching total_steps above
        scheduler.step()

        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    # evaluate() returns macro F1 here because the validation batches have labels
    val_f1 = evaluate(model, val_loader)
    print(f'Epoch {epoch+1}: Loss={total_loss/len(train_loader):.4f}, Val F1={val_f1:.4f}')

    # Keep only the checkpoint with the best validation F1 seen so far
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), '/kaggle/working/best_lora_model.pt')
        print(f'  -> Melhor modelo salvo! F1={best_f1:.4f}')

print(f'\nMelhor F1 validacao: {best_f1:.4f}')

In [None]:
# ==== PREDICTION ====
print("[7/8] Gerando predicoes...")
# Restore the best checkpoint. map_location places the tensors on the current
# device, so loading does not depend on the device the file was saved from.
model.load_state_dict(
    torch.load('/kaggle/working/best_lora_model.pt', map_location=device)
)
# The test loader has no labels, so evaluate() returns the predicted classes
predictions = evaluate(model, test_loader)
print(f'Predicoes geradas: {len(predictions)}')

In [None]:
# ==== SUBMISSION ====
print("[8/8] Criando submissao...")

# One row per test id, paired with its predicted class
submission = pd.DataFrame(dict(id=test_df['id'], label=predictions))
submission.to_csv('/kaggle/working/submission.csv', index=False)
print('Submissao salva!')

# Predicted class distribution, ordered by class index
print(submission['label'].value_counts().sort_index())