# SPR 2026 - BERTimbau + Focal Loss v3

**Versão melhorada do melhor modelo (0.79696)**

Estratégias adicionais:
- ✅ Focal Loss com alpha por classe (pesos baseados em frequência)
- ✅ Threshold tuning por classe (0.35 para classes 5 e 6)
- ✅ Label Smoothing (0.1)
- ✅ Gradient accumulation (2 steps)

---
## 📥 MODELO - `models/download_bertimbau.ipynb` ou `download_bertimbau_large.ipynb`

**Kaggle Models:** Add Input → Models → `bertimbau-ptbr-complete` (fabianofilho)

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Settings → Accelerator → **GPU T4 x2**

---

In [None]:
# ===== CONFIGURATION FLAGS =====
USE_CLASS_ALPHA = True      # Per-class focal-loss alpha; when False no alpha weighting is applied
USE_THRESHOLD_TUNING = True # Use a different decision threshold per class at prediction time
USE_LABEL_SMOOTHING = True  # Apply label smoothing of 0.1 inside the loss
GRADIENT_ACCUMULATION = 2   # Number of micro-batches accumulated per optimizer step

# ===== SETUP E IMPORTS =====
print("[1/8] Configurando ambiente...")
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

SEED = 42            # Seeds torch / numpy and the train/validation split
MAX_LEN = 256        # Maximum token length passed to the tokenizer
BATCH_SIZE = 8       # Batch size for all three DataLoaders
EPOCHS = 5           # Number of passes over the training set
LR = 2e-5            # AdamW learning rate
NUM_CLASSES = 7      # Number of target classes (size of the classification head)
FOCAL_GAMMA = 2.0    # Focusing parameter of the focal loss

# Directory holding train.csv and test.csv for the competition.
DATA_DIR = '/kaggle/input/competitions/spr-2026-mammography-report-classification'

# ==== AUTO-DETECTAR PATH DO MODELO ====
def find_model_path(base='/kaggle/input'):
    """Locate a model directory below *base*.

    Walks the directory tree depth-first and returns the first
    sub-directory that directly contains a ``config.json`` file. Entries
    are visited in sorted order so the result is deterministic across runs
    and file systems.

    NOTE(review): any directory holding a ``config.json`` matches, not only
    a BERTimbau checkpoint - confirm no other attached input ships one.

    Args:
        base: Root directory to search. Defaults to the Kaggle input mount.

    Returns:
        The path of the first matching directory, or ``None`` when nothing
        matches or a directory cannot be listed.
    """

    def has_config(path):
        return os.path.isfile(os.path.join(path, 'config.json'))

    def search_dir(directory, depth=0, max_depth=10):
        # Bound the recursion so a very deep tree cannot stall the search.
        if depth > max_depth:
            return None
        try:
            entries = sorted(os.listdir(directory))
        except OSError:
            # Missing or unreadable directory: best-effort search, skip it.
            return None
        for item in entries:
            path = os.path.join(directory, item)
            if not os.path.isdir(path):
                continue
            if has_config(path):
                return path
            result = search_dir(path, depth + 1, max_depth)
            if result:
                return result
        return None

    return search_dir(base)

# Resolve the model directory once; the tokenizer and model are loaded from it later.
MODEL_PATH = find_model_path()

# Fail early with an explicit message when no model directory was found.
if MODEL_PATH is None:
    raise FileNotFoundError("Adicione o modelo BERTimbau ao notebook!")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
print(f'Model: {MODEL_PATH}')
# Seed the PyTorch and NumPy random number generators.
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# ==== LOAD DATA ====
print("[2/8] Carregando dados...")
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
print(f'Train: {train_df.shape}, Test: {test_df.shape}')

# Per-class sample counts, used to derive the focal-loss alpha weights.
class_counts = train_df['target'].value_counts().sort_index()
print(f'\nDistribui√ß√£o das classes:\n{class_counts}')

# Alpha is inversely proportional to the class frequency.
if USE_CLASS_ALPHA:
    total = len(train_df)
    # Align the counts with label ids 0..NUM_CLASSES-1 so that
    # class_alpha[label] always addresses the right class, even when a label
    # never occurs in train.csv (value_counts would silently omit it and shift
    # every later position).
    # NOTE(review): assumes targets are integer ids in [0, NUM_CLASSES) - confirm.
    aligned_counts = class_counts.reindex(range(NUM_CLASSES), fill_value=0)
    # Absent classes get weight 0.0: avoids a division by zero, and the entry
    # is never looked up because no training sample carries that label.
    class_alpha = torch.tensor(
        [total / (NUM_CLASSES * c) if c > 0 else 0.0 for c in aligned_counts.values],
        dtype=torch.float32,
    )
    class_alpha = class_alpha / class_alpha.sum()  # Normalize so the weights sum to 1
    print(f'\nAlpha por classe: {class_alpha.numpy().round(3)}')
else:
    class_alpha = None

In [None]:
# ==== FOCAL LOSS COM ALPHA POR CLASSE ====
print("[3/8] Definindo Focal Loss...")

class FocalLossWeighted(nn.Module):
    """Multi-class focal loss with optional per-class alpha and label smoothing.

    The per-sample loss is ``alpha[target] * (1 - p_t) ** gamma * CE`` where
    ``p_t`` is the softmax probability the model assigns to the true class
    and ``CE`` is the (optionally label-smoothed) cross entropy.

    Args:
        alpha: 1-D tensor with one weight per class, or ``None`` for no
            class weighting.
        gamma: Focusing parameter; ``0`` disables the focal modulation.
        label_smoothing: Smoothing mass spread uniformly over all classes;
            ``0`` disables smoothing.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample loss.
    """

    def __init__(self, alpha=None, gamma=2.0, label_smoothing=0.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha  # tensor [num_classes] or None
        self.gamma = gamma
        self.label_smoothing = label_smoothing
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the loss.

        Args:
            inputs: Logits; the last dimension indexes classes (the code
                gathers/scatters along it with ``targets.unsqueeze(-1)``).
            targets: Integer class ids, one per row of ``inputs``.

        Returns:
            A scalar tensor for ``'mean'`` / ``'sum'`` reduction, otherwise
            the per-sample loss tensor.
        """
        log_probs = F.log_softmax(inputs, dim=-1)
        # Log-probability of the true class for every sample.
        log_pt = log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)

        if self.label_smoothing > 0:
            n_classes = inputs.size(-1)
            off_value = self.label_smoothing / n_classes
            # Smoothed targets: off_value everywhere, and
            # (1 - smoothing) + off_value on the true class.
            targets_smooth = torch.full_like(log_probs, off_value)
            targets_smooth.scatter_(
                -1, targets.unsqueeze(-1), 1.0 - self.label_smoothing + off_value
            )
            ce_loss = -(targets_smooth * log_probs).sum(dim=-1)
        else:
            ce_loss = -log_pt

        # The focal factor must use the true-class probability. Deriving it
        # from the smoothed cross entropy (exp(-ce_loss)) would keep the factor
        # large even for confident, correct predictions, because the smoothed
        # CE never approaches zero.
        pt = log_pt.exp()
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        # Per-class alpha weighting, looked up by the true label.
        if self.alpha is not None:
            alpha_t = self.alpha.to(inputs.device)[targets]
            focal_loss = alpha_t * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

# Smoothing strength is 0.1 when the flag is on, otherwise smoothing is disabled.
label_smooth = 0.1 if USE_LABEL_SMOOTHING else 0.0
# class_alpha is the per-class weight tensor, or None when USE_CLASS_ALPHA is off.
criterion = FocalLossWeighted(alpha=class_alpha, gamma=FOCAL_GAMMA, label_smoothing=label_smooth)
print(f'Focal Loss: gamma={FOCAL_GAMMA}, label_smoothing={label_smooth}')

In [None]:
# ==== DATASET CLASS ====
print("[4/8] Preparando dataset...")

class TextDataset(Dataset):
    """Dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of texts; each entry is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of integer labels aligned with
            ``texts``, or ``None`` for unlabeled data.
        tokenizer: Callable invoked with ``truncation``, ``max_length``,
            ``padding`` and ``return_tensors`` keyword arguments; its result
            must expose ``'input_ids'`` and ``'attention_mask'`` tensors.
            (presumably a Hugging Face tokenizer - the code only relies on
            this call shape)
        max_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``.

        The dict holds ``'input_ids'`` and ``'attention_mask'``, plus
        ``'labels'`` (a ``torch.long`` scalar) when labels were provided.
        """
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
        )
        # Drop only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also collapse the sequence axis when
        # max_len == 1, yielding a 0-d tensor that cannot be batched as a sequence.
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load the tokenizer from the local model directory only (local_files_only=True).
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
print('Tokenizer carregado!')

# Stratified 90/10 train/validation split on the target column, seeded with SEED.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['report'].values, train_df['target'].values,
    test_size=0.1, random_state=SEED, stratify=train_df['target']
)

# The test set has no labels, so its dataset yields only input_ids / attention_mask.
train_ds = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_ds = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)
test_ds = TextDataset(test_df['report'].values, None, tokenizer, MAX_LEN)

# Only the training loader shuffles; validation and test keep the original row order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

print(f'Train: {len(train_ds)}, Val: {len(val_ds)}, Test: {len(test_ds)}')

In [None]:
# ==== MODEL ====
print("[5/8] Carregando modelo...")
# Sequence-classification model with NUM_CLASSES outputs, loaded from local files only.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=NUM_CLASSES,
    local_files_only=True
)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)
# The scheduler counts optimizer updates, hence the division by the accumulation factor.
# NOTE(review): this is an integer estimate of the number of optimizer updates;
# verify it matches how the training loop actually steps when len(train_loader)
# is not a multiple of GRADIENT_ACCUMULATION.
total_steps = len(train_loader) * EPOCHS // GRADIENT_ACCUMULATION
# The first 10% of the scheduled steps are warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
)

print(f'Modelo carregado! Parametros: {sum(p.numel() for p in model.parameters()):,}')

In [None]:
# ==== THRESHOLD TUNING ====
# CLASS_THRESHOLDS is only defined when USE_THRESHOLD_TUNING is True.
if USE_THRESHOLD_TUNING:
    # Classes 5 and 6 are minority classes - a lower threshold makes predictions more sensitive to them
    CLASS_THRESHOLDS = {0: 0.5, 1: 0.5, 2: 0.5, 3: 0.5, 4: 0.5, 5: 0.35, 6: 0.35}
    print(f'Thresholds por classe: {CLASS_THRESHOLDS}')

def predict_with_threshold(logits, thresholds=None, min_ratio=0.7):
    """Predict classes, letting low-threshold classes override the argmax.

    For every sample the argmax class is the starting prediction. A class
    whose threshold is below 0.5 may replace it when its probability
    (a) reaches its own threshold and (b) exceeds ``min_ratio`` times the
    argmax probability. When several classes qualify, the one with the
    highest probability wins, so the result does not depend on the
    iteration order of ``thresholds`` and a qualifying argmax class is never
    displaced by a less likely one.

    Args:
        logits: Tensor of logits; softmax / argmax are taken over dim 1.
        thresholds: Mapping ``class id -> threshold``, or ``None`` to use
            a plain argmax.
        min_ratio: Minimum fraction of the argmax probability a candidate
            class must exceed to override it.

    Returns:
        A tensor of class ids when ``thresholds`` is ``None``; otherwise a
        list of Python ints, one per sample.
    """
    if thresholds is None:
        return logits.argmax(dim=1)

    probs = F.softmax(logits, dim=1)
    # Only classes with a lowered threshold are allowed to override the argmax.
    boosted = {cls: thresh for cls, thresh in thresholds.items() if thresh < 0.5}

    preds = []
    for prob in probs:
        pred = prob.argmax().item()
        top_prob = prob[pred].item()
        best_prob = -1.0
        for cls, thresh in boosted.items():
            cls_prob = prob[cls].item()
            qualifies = cls_prob >= thresh and cls_prob > top_prob * min_ratio
            if qualifies and cls_prob > best_prob:
                pred, best_prob = cls, cls_prob
        preds.append(pred)
    return preds

In [None]:
# ==== TREINAMENTO ====
print("[6/8] Treinando modelo...")

def evaluate(model, loader, use_threshold=False):
    """Run ``model`` over ``loader`` and return its predictions.

    Args:
        model: Model called with ``input_ids`` / ``attention_mask`` keyword
            arguments; its output must expose ``.logits``.
        loader: Iterable of batches (dicts of tensors). Batches may or may
            not carry a ``'labels'`` entry.
        use_threshold: When true, and the global ``USE_THRESHOLD_TUNING``
            flag is on, predictions come from ``predict_with_threshold``
            with ``CLASS_THRESHOLDS``; otherwise a plain argmax is used.

    Returns:
        ``(macro_f1, predictions)`` when labels were seen in the batches,
        otherwise just ``predictions``.
    """
    model.eval()
    logit_chunks = []
    gold = []
    with torch.no_grad():
        for batch in loader:
            out = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # Keep logits on the CPU so the full set can be concatenated afterwards.
            logit_chunks.append(out.logits.cpu())
            if 'labels' in batch:
                gold.extend(batch['labels'].numpy())

    all_logits = torch.cat(logit_chunks, dim=0)

    apply_thresholds = use_threshold and USE_THRESHOLD_TUNING
    if apply_thresholds:
        all_preds = predict_with_threshold(all_logits, CLASS_THRESHOLDS)
    else:
        all_preds = all_logits.argmax(dim=1).numpy()

    # Unlabeled data (e.g. the test set): nothing to score, return predictions only.
    if not gold:
        return all_preds
    return f1_score(gold, all_preds, average='macro'), all_preds

# Start below any reachable F1 so the first epoch always writes a checkpoint;
# the prediction step loads that file unconditionally.
best_f1 = -1.0
num_batches = len(train_loader)
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}')

    for step, batch in enumerate(pbar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Scale the loss so the accumulated gradient averages over the micro-batches.
        loss = criterion(outputs.logits, labels) / GRADIENT_ACCUMULATION

        loss.backward()
        # Undo the scaling for reporting so the logged value is the per-batch loss.
        batch_loss = loss.item() * GRADIENT_ACCUMULATION
        total_loss += batch_loss

        # Step on every full accumulation group AND on the last batch of the
        # epoch; otherwise the gradients of a trailing, incomplete group would
        # be thrown away by the zero_grad() at the start of the next epoch.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % GRADIENT_ACCUMULATION == 0 or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        pbar.set_postfix({'loss': f'{batch_loss:.4f}'})

    # Evaluate with and without the per-class thresholds
    val_f1, _ = evaluate(model, val_loader, use_threshold=False)
    val_f1_thresh, _ = evaluate(model, val_loader, use_threshold=True)
    print(f'Epoch {epoch+1}: Loss={total_loss/len(train_loader):.4f}, Val F1={val_f1:.4f}, F1+Thresh={val_f1_thresh:.4f}')

    # Checkpoint selection uses the plain-argmax F1, not the thresholded one.
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), '/kaggle/working/best_model.pt')
        print(f'  -> Melhor modelo salvo! F1={best_f1:.4f}')

print(f'\nMelhor F1 valida√ß√£o: {best_f1:.4f}')

In [None]:
# ==== PREDICTION ====
print("[7/8] Gerando predi√ß√µes...")
# Restore the checkpoint written during training.
model.load_state_dict(torch.load('/kaggle/working/best_model.pt'))
# The test loader carries no labels, so evaluate() returns only the predictions here.
predictions = evaluate(model, test_loader, use_threshold=USE_THRESHOLD_TUNING)
print(f'Predi√ß√µes geradas: {len(predictions)}')

In [None]:
# ==== SUBMISSION ====
print("[8/8] Criando submiss√£o...")
# Pair every test ID with its predicted class; predictions follow test_df row order
# because the test loader does not shuffle.
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'target': predictions
})
submission.to_csv('/kaggle/working/submission.csv', index=False)
print('‚úÖ Submiss√£o salva!')
# Show how many test samples were assigned to each class.
print(f'\nDistribui√ß√£o das predi√ß√µes:')
print(submission['target'].value_counts().sort_index())