# SPR 2026 - BioBERTpt + Focal Loss v2

**Melhorias sobre v1 (0.72480):**

- ✅ **Focal Loss** (γ=2) - mesma estratégia do BERTimbau campeão
- ✅ Mais epochs (7 vs 5)
- ✅ Learning rate ajustado (3e-5)
- ✅ Warmup ratio aumentado (15%)
- ✅ Gradient accumulation (2 steps)

**Objetivo: superar 0.72480 (competir com top 5)**

---
## 📥 MODELO NECESSÁRIO

**Notebook de download:**
1. Rode `models/bert/download_biobertpt.ipynb` no Kaggle com **Internet ON**
2. Save Version → Save & Run All (Commit)
3. Aqui: Add Input → **Your Work** → selecione o notebook `download-biobertpt`

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Settings → Accelerator → **GPU T4 x2**
3. Add Input → adicione o modelo (Your Work)
4. **IMPORTANTE:** Execute "Run All" após commit
---

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Run configuration: every hyperparameter used by the cells below lives here.
SEED = 42  # seeds torch / numpy and the train/validation split
MAX_LEN = 256  # tokenizer max_length; reports are padded/truncated to this many tokens
BATCH_SIZE = 16  # per-step batch size of all three DataLoaders
GRADIENT_ACCUMULATION = 2  # Effective batch: BATCH_SIZE * 2 = 32
EPOCHS = 7                  # Raised from 5
LR = 3e-5                   # Raised from 2e-5
NUM_CLASSES = 7  # num_labels of the classification head
FOCAL_GAMMA = 2.0  # focusing exponent passed to FocalLoss
WARMUP_RATIO = 0.15  # fraction of the scheduler steps spent in linear warmup

# Directory that holds the competition's train.csv and test.csv.
DATA_DIR = '/kaggle/input/competitions/spr-2026-mammography-report-classification'

# Automatic lookup of the model directory
def find_model_path(base='/kaggle/input', max_depth=10):
    """Return the first directory below ``base`` that contains a ``config.json``.

    The tree is walked depth-first. Entries are visited in sorted order so the
    result is reproducible when several candidate directories exist
    (``os.listdir`` alone returns entries in arbitrary order). ``base`` itself
    is never returned, only its descendants.

    Args:
        base: Root directory to search.
        max_depth: Maximum recursion depth below ``base``; ``0`` inspects only
            the direct children of ``base``.

    Returns:
        The path of the matching directory, or ``None`` when nothing matches
        or ``base`` cannot be listed.
    """
    def search_dir(directory, depth=0):
        if depth > max_depth:
            return None
        try:
            entries = sorted(os.listdir(directory))
        except OSError:
            # Best-effort search: a missing or unreadable directory is skipped
            # instead of aborting the whole lookup.
            return None
        for item in entries:
            path = os.path.join(directory, item)
            if not os.path.isdir(path):
                continue
            if os.path.exists(os.path.join(path, 'config.json')):
                return path
            result = search_dir(path, depth + 1)
            if result:
                return result
        return None

    return search_dir(base)

# Resolve the local checkpoint directory; the result is printed before the
# check so that a failed lookup is visible in the cell output as well.
MODEL_PATH = find_model_path()
print(f'Modelo encontrado: {MODEL_PATH}')

# Fail fast: every later cell loads the tokenizer/model from MODEL_PATH.
if MODEL_PATH is None:
    raise FileNotFoundError("Adicione o modelo via Add Input ‚Üí Your Work!")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

# Seed the torch and numpy random number generators.
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# =============================================================================
# FOCAL LOSS (CHAVE DO SUCESSO DO BERTimbau!)
# =============================================================================
class FocalLoss(nn.Module):
    """Multi-class focal loss built on top of ``F.cross_entropy``.

    Per sample the loss is ``w[y] * (1 - p_t) ** gamma * (-log p_t)`` where
    ``p_t`` is the softmax probability of the true class and ``w[y]`` is the
    optional class weight of that class.

    Args:
        gamma: Focusing exponent; ``0`` reduces the loss to (weighted) cross
            entropy.
        weight: Optional 1-D tensor of per-class weights, forwarded to
            ``F.cross_entropy``. It must be on the same device as the logits.
        reduction: ``'mean'`` (plain average over samples), ``'sum'``, or any
            other value to get the unreduced per-sample losses.
    """

    def __init__(self, gamma=2.0, weight=None, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, input, target):
        """Return the focal loss of logits ``input`` against class ids ``target``."""
        # p_t has to come from the *unweighted* cross entropy: with class
        # weights, exp(-weighted_ce) equals p_t ** w[y] rather than p_t, which
        # would distort the focusing term (1 - p_t) ** gamma.
        ce_loss = F.cross_entropy(input, target, reduction='none')
        pt = torch.exp(-ce_loss)

        if self.weight is not None:
            # Class weights only rescale the loss; they do not enter p_t.
            ce_loss = F.cross_entropy(
                input, target, weight=self.weight, reduction='none'
            )

        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

print('‚úÖ Focal Loss (Œ≥=2) configurado!')

In [None]:
# Load the competition data
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Report dataset sizes and the label distribution of the training set.
print(f'Train: {train_df.shape}')
print(f'Test: {test_df.shape}')
print(f'\nDistribui√ß√£o:')
print(train_df['target'].value_counts().sort_index())

# Compute class weights: inverse class frequency, rescaled so the weights sum
# to the number of classes found. Counts come from the full training file,
# i.e. before the train/validation split.
# NOTE(review): position i is used as the weight of class i by the loss, which
# assumes labels are exactly 0..N-1 and every class occurs at least once;
# confirm against the data.
class_counts = train_df['target'].value_counts().sort_index().values
class_weights = 1.0 / class_counts
class_weights = class_weights / class_weights.sum() * len(class_weights)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
print(f'\nClass weights: {class_weights.cpu().numpy().round(2)}')

In [None]:
# Stratified hold-out split: 15% of the labelled reports become the validation
# set, keeping the class proportions of 'target' and seeded for reproducibility.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['report'].values,
    train_df['target'].values,
    test_size=0.15,
    random_state=SEED,
    stratify=train_df['target']
)

print(f'Train: {len(train_texts)}, Val: {len(val_texts)}')

In [None]:
# Tokenizer and Dataset
# local_files_only=True: load strictly from MODEL_PATH, never from the network.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)

class TextDataset(Dataset):
    """Dataset that tokenizes one text per lookup.

    Tokenization uses the module-level ``tokenizer`` and ``MAX_LEN``; every
    text is padded/truncated to ``MAX_LEN`` tokens.

    Args:
        texts: Indexable collection of raw texts.
        labels: Optional indexable collection of integer class ids aligned
            with ``texts``; omit it for unlabelled (test) data.
    """

    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and, when
        labels were provided, ``labels`` for the sample at ``idx``."""
        encoded = tokenizer(
            self.texts[idx],
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # squeeze(0) removes the leading size-1 dimension of the encoded
        # tensors (a single text is encoded per call).
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }

        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Labelled datasets for training/validation; the test dataset has no labels.
train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)
test_dataset = TextDataset(test_df['report'].values)

# Only the training loader shuffles; validation and test keep the input order
# (the test predictions are later paired with test_df['ID'] by position).
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# BioBERTpt model: load the checkpoint found at MODEL_PATH (local files only)
# with a sequence-classification head of NUM_CLASSES labels, then move it to
# the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=NUM_CLASSES,
    local_files_only=True
).to(device)

# Total parameter count, for reference.
print(f'Params: {sum(p.numel() for p in model.parameters()):,}')

In [None]:
# Optimizer and Scheduler
# Class-weighted focal loss and AdamW over all model parameters.
criterion = FocalLoss(gamma=FOCAL_GAMMA, weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)

# The scheduler is advanced once per full group of GRADIENT_ACCUMULATION
# batches, hence the floor division when counting steps per epoch.
total_steps = (len(train_loader) // GRADIENT_ACCUMULATION) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)

# Linear warmup for warmup_steps, followed by the linear schedule over the
# remaining steps up to total_steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print(f'Total steps: {total_steps}, Warmup: {warmup_steps}')

In [None]:
def train_epoch(model, loader, criterion, optimizer, scheduler, accumulation_steps):
    """Train ``model`` for one pass over ``loader`` with gradient accumulation.

    Gradients are accumulated over groups of ``accumulation_steps`` batches and
    clipped to a norm of 1.0 before each optimizer step. If the number of
    batches is not a multiple of ``accumulation_steps``, the trailing partial
    group is applied as well instead of being thrown away.

    Args:
        model: Model returning an object with a ``logits`` attribute.
        loader: Sized iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        criterion: Callable ``(logits, labels) -> scalar loss``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, advanced once per full group.
        accumulation_steps: Number of batches per optimizer step.

    Returns:
        Tuple ``(mean_loss, macro_f1)``: the unscaled loss averaged over the
        batches and the macro F1 of the predictions made during training.
    """
    model.train()
    total_loss = 0
    preds, targets = [], []

    # Batches that remain after the last full accumulation group.
    num_batches = len(loader)
    tail_size = num_batches % accumulation_steps
    tail_start = num_batches - tail_size

    optimizer.zero_grad()

    for step, batch in enumerate(tqdm(loader, desc='Training')):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        # Scale by the size of the group this batch belongs to, so that the
        # accumulated gradient is an average even for the shorter tail group.
        group_size = tail_size if step >= tail_start else accumulation_steps
        (loss / group_size).backward()

        full_group_done = (step + 1) % accumulation_steps == 0
        if full_group_done or step + 1 == num_batches:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # The scheduler advances only on full groups, so each epoch still
            # contributes len(loader) // accumulation_steps scheduler steps.
            if full_group_done:
                scheduler.step()
            optimizer.zero_grad()

        total_loss += loss.item()
        preds.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
        targets.extend(labels.cpu().numpy())

    return total_loss / len(loader), f1_score(targets, preds, average='macro')


def evaluate(model, loader):
    """Return the macro F1 of ``model``'s argmax predictions over ``loader``.

    The model is put in eval mode and run without gradient tracking.

    Args:
        model: Model returning an object with a ``logits`` attribute.
        loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` tensors.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Evaluating'):
            # Move the three tensors of the batch to the target device.
            on_device = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            }

            logits = model(
                on_device['input_ids'],
                attention_mask=on_device['attention_mask'],
            ).logits
            predicted.extend(logits.argmax(dim=-1).cpu().numpy())
            expected.extend(on_device['labels'].cpu().numpy())

    return f1_score(expected, predicted, average='macro')

In [None]:
# Best validation macro F1 seen so far. Starting at 0 means a checkpoint is
# written only once an epoch reaches a strictly positive validation F1.
best_f1 = 0

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch+1}/{EPOCHS}')
    
    # One pass over the training data, then score the held-out split.
    train_loss, train_f1 = train_epoch(
        model, train_loader, criterion, optimizer, scheduler, GRADIENT_ACCUMULATION
    )
    val_f1 = evaluate(model, val_loader)
    
    print(f'Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f} | Val F1: {val_f1:.4f}')
    
    # Keep only the weights of the best epoch (strict improvement required).
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), 'best_model.pt')
        print(f'‚úÖ Novo melhor modelo! F1: {best_f1:.4f}')

print(f'\nüèÜ Melhor F1: {best_f1:.4f}')

In [None]:
# Predictions
# Restore the weights of the best validation epoch saved by the training loop.
# NOTE(review): 'best_model.pt' exists only if some epoch improved on the
# initial best F1 of 0.
model.load_state_dict(torch.load('best_model.pt'))
model.eval()

predictions = []

# Predict the argmax class for every test report, in test_loader order.
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Predicting'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=-1).cpu().numpy()
        predictions.extend(preds)

# Submission: pair each test ID with its predicted class and write the CSV,
# then print the distribution of predicted classes.
submission = pd.DataFrame({'ID': test_df['ID'], 'target': predictions})
submission.to_csv('submission.csv', index=False)
print('‚úÖ submission.csv criado!')
print(submission['target'].value_counts().sort_index())