# SPR 2026 - BERTimbau Ultimate v6

**Estratégia Final** combinando: BERTimbau + Focal Loss + 3-Seed Ensemble + 5-Fold CV + Grid Search Thresholds

---
**CONFIGURAÇÃO KAGGLE:**
1. Add Input > Models > `bertimbau-ptbr-complete`
2. Add Input > Competition > `spr-2026-mammography-report-classification`
3. Settings > Internet OFF, GPU T4 x2
---

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

# Run banner.
print('=' * 60, 'SPR 2026 - BERTimbau Ultimate v6', '=' * 60, sep='\n')

# --- Cross-validation / ensembling ---
SEEDS = [42, 123, 456]   # one full K-fold run is trained per seed
N_FOLDS = 5              # number of stratified folds per seed
NUM_CLASSES = 7          # size of the classification head / probability vectors

# --- Tokenisation and optimisation ---
MAX_LEN = 256            # token budget per report (truncate / pad to this length)
BATCH_SIZE = 8
EPOCHS = 5
LR = 2e-5                # AdamW learning rate

# --- Focal loss ---
FOCAL_ALPHA = 0.25       # constant scale applied to the focal term
FOCAL_GAMMA = 2.0        # focusing exponent on (1 - p_true)

# Competition files are read from this directory.
# NOTE(review): adjust if the competition data is mounted under a different path.
DATA_DIR = '/kaggle/input/competitions/spr-2026-mammography-report-classification'

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def find_model_path():
    base = '/kaggle/input'
    def search(d, depth=0):
        if depth > 10: return None
        try:
            for item in os.listdir(d):
                path = os.path.join(d, item)
                if os.path.isdir(path) and os.path.exists(os.path.join(path, 'config.json')): return path
                r = search(path, depth+1) if os.path.isdir(path) else None
                if r: return r
        except: pass
        return None
    return search(base)

# Resolve the local checkpoint directory once and report the run environment.
MODEL_PATH = find_model_path()
print(f'Device: {device}')
print(f'Model: {MODEL_PATH}')
if MODEL_PATH is None:
    # Fail fast with an actionable message instead of letting a later
    # from_pretrained(None, ...) call raise an obscure error.
    raise FileNotFoundError(
        'No directory containing config.json was found under /kaggle/input; '
        'attach the model dataset to the notebook before running.'
    )

In [None]:
class FocalLoss(nn.Module):
    """Focal loss computed from raw logits and integer class targets.

    Each sample's cross-entropy is scaled by ``alpha * (1 - p_true) ** gamma``,
    where ``p_true`` is the probability given to the true class, and the
    scaled values are averaged over the batch.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss (a scalar tensor) for a batch."""
        per_sample_ce = F.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) recovers the probability assigned to the true class.
        prob_true = torch.exp(-per_sample_ce)
        focusing = (1 - prob_true) ** self.gamma
        focal = self.alpha * focusing * per_sample_ce
        return focal.mean()

class TextDataset(Dataset):
    """Dataset that tokenises one text per item, on access.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenisation.
        labels: Indexable collection of integer labels aligned with *texts*,
            or ``None`` for unlabeled data.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and, when
        labels were provided, ``labels`` (a ``torch.long`` scalar)."""
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
        )
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever it has length 1.
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Load the competition CSVs from DATA_DIR and the tokenizer from the local
# checkpoint directory (local_files_only=True: no download is attempted).
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
print(f'Train: {len(train_df)}, Test: {len(test_df)}')
# Count of training rows per value of the 'target' column, ordered by label.
print(train_df['target'].value_counts().sort_index())

In [None]:
def find_thresholds(probs, labels):
    """Grid-search one override threshold per class, maximising macro-F1.

    For class ``c`` and candidate threshold ``t``, a sample's argmax
    prediction is replaced by ``c`` when ``probs[:, c] >= t`` and
    ``probs[:, c]`` exceeds 85% of the row's top probability. This is the same
    per-class test that ``apply_thresholds`` applies, so the search scores the
    rule that is actually used at prediction time.

    The previous mask required class ``c`` to already be the row maximum,
    which left the predictions unchanged for every ``t`` and made the search
    always return the first grid value.

    A threshold is only accepted when it strictly beats the plain-argmax
    macro-F1. Otherwise the class keeps the default 0.5, at which (exact ties
    aside) the class can only pass the test when it is already the argmax.

    Args:
        probs: Array of shape ``(n_samples, NUM_CLASSES)`` with class
            probabilities.
        labels: True class labels aligned with the rows of *probs*.

    Returns:
        Dict mapping each class index to its threshold, rounded to 2 decimals.
    """
    probs = np.asarray(probs)
    # Loop invariants: baseline decision and the per-row top probability.
    argmax_preds = np.argmax(probs, axis=1)
    top_prob = probs.max(axis=1)
    baseline_f1 = f1_score(labels, argmax_preds, average='macro')

    thresholds = {}
    for c in range(NUM_CLASSES):
        class_prob = probs[:, c]
        # Relative-margin test; 0.85 matches the factor in apply_thresholds.
        near_top = class_prob > top_prob * 0.85
        best_t, best_f1 = 0.5, baseline_f1
        for t in np.arange(0.1, 0.9, 0.02):
            preds = argmax_preds.copy()
            preds[near_top & (class_prob >= t)] = c
            f1 = f1_score(labels, preds, average='macro')
            if f1 > best_f1:
                best_f1, best_t = f1, t
        thresholds[c] = round(float(best_t), 2)
    return thresholds

def apply_thresholds(probs, thresholds):
    """Turn class probabilities into labels using per-class thresholds.

    Each row starts from its argmax class. Classes listed in *thresholds* are
    then checked from the highest index down. The first class whose
    probability is at least its threshold and also greater than 85% of the
    argmax probability becomes the prediction. If none qualifies, the argmax
    is kept.

    Args:
        probs: 2-D array of probabilities, one row per sample.
        thresholds: Dict mapping class index to its minimum probability.

    Returns:
        1-D numpy array with one predicted class index per row.
    """
    descending = sorted(thresholds, reverse=True)
    chosen = []
    for row in probs:
        top = np.argmax(row)
        # The margin is always measured against the original argmax.
        floor = row[top] * 0.85
        winner = next(
            (c for c in descending if row[c] >= thresholds[c] and row[c] > floor),
            top,
        )
        chosen.append(winner)
    return np.array(chosen)

In [None]:
# Training driver: for every seed, run an N_FOLDS stratified cross-validation,
# fine-tuning a fresh classifier per fold. Collects, per seed, the out-of-fold
# probabilities, the fold-averaged test probabilities and the fold-averaged
# per-class thresholds.
criterion = FocalLoss(alpha=FOCAL_ALPHA, gamma=FOCAL_GAMMA)
# One entry per seed is appended to each of these lists.
all_oof, all_test, all_thresh = [], [], []

for seed in SEEDS:
    print(f'\n{"="*60}\nSEED {seed}\n{"="*60}')
    # Seed the torch and numpy global RNGs for this run.
    torch.manual_seed(seed)
    np.random.seed(seed)

    # The fold assignment is shuffled with the same seed.
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    # oof: each training row is filled by the fold in which it was validation data.
    oof = np.zeros((len(train_df), NUM_CLASSES))
    # test_probs: running mean of the test probabilities over the folds.
    test_probs = np.zeros((len(test_df), NUM_CLASSES))
    fold_thresh = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df['report'], train_df['target'])):
        print(f'Fold {fold+1}/{N_FOLDS}')
        train_ds = TextDataset(train_df['report'].iloc[train_idx].values, train_df['target'].iloc[train_idx].values, tokenizer, MAX_LEN)
        val_ds = TextDataset(train_df['report'].iloc[val_idx].values, train_df['target'].iloc[val_idx].values, tokenizer, MAX_LEN)
        # The test set carries no labels.
        test_ds = TextDataset(test_df['report'].values, None, tokenizer, MAX_LEN)

        # Only the training loader shuffles; val/test keep row order so the
        # stacked predictions line up with val_idx and test_df.
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
        test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

        # A fresh model is loaded from the local checkpoint for every fold.
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=NUM_CLASSES, local_files_only=True).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
        # Linear schedule with 0 warmup steps over all optimisation steps.
        scheduler = get_linear_schedule_with_warmup(optimizer, 0, len(train_loader)*EPOCHS)

        for epoch in range(EPOCHS):
            model.train()
            for batch in train_loader:
                optimizer.zero_grad()
                # Labels are not passed to the model; the loss is computed
                # from the logits by the focal criterion.
                out = model(batch['input_ids'].to(device), attention_mask=batch['attention_mask'].to(device))
                loss = criterion(out.logits, batch['labels'].to(device))
                loss.backward()
                optimizer.step()
                # The scheduler advances once per batch.
                scheduler.step()

        # Validation pass: softmax probabilities for the held-out fold.
        model.eval()
        val_p = []
        with torch.no_grad():
            for batch in val_loader:
                out = model(batch['input_ids'].to(device), attention_mask=batch['attention_mask'].to(device))
                val_p.append(F.softmax(out.logits, dim=-1).cpu().numpy())
        val_p = np.vstack(val_p)
        oof[val_idx] = val_p

        # Per-class thresholds searched on this fold's validation predictions.
        ft = find_thresholds(val_p, train_df['target'].iloc[val_idx].values)
        fold_thresh.append(ft)

        # Test pass: each fold contributes 1/N_FOLDS of the seed's test probabilities.
        test_p = []
        with torch.no_grad():
            for batch in test_loader:
                out = model(batch['input_ids'].to(device), attention_mask=batch['attention_mask'].to(device))
                test_p.append(F.softmax(out.logits, dim=-1).cpu().numpy())
        test_probs += np.vstack(test_p) / N_FOLDS

        # Fold score uses plain argmax (no thresholds).
        print(f'  F1: {f1_score(train_df["target"].iloc[val_idx].values, np.argmax(val_p, axis=1), average="macro"):.5f}')

    # Average each class threshold over the folds, rounded to 2 decimals.
    avg_t = {c: round(np.mean([ft[c] for ft in fold_thresh]), 2) for c in range(NUM_CLASSES)}
    all_oof.append(oof)
    all_test.append(test_probs)
    all_thresh.append(avg_t)
    # NOTE(review): avg_t was derived from these same out-of-fold predictions,
    # so this thresholded score is likely optimistic.
    print(f'Seed {seed} OOF F1: {f1_score(train_df["target"].values, apply_thresholds(oof, avg_t), average="macro"):.5f}')

In [None]:
# Blend the per-seed results: average the probabilities and the per-class
# thresholds, then compare plain argmax with thresholded decoding on the
# out-of-fold predictions.
separator = '=' * 60
print('\n' + separator + '\nENSEMBLE FINAL\n' + separator)

final_oof = np.mean(all_oof, axis=0)
final_test = np.mean(all_test, axis=0)

final_t = {}
for c in range(NUM_CLASSES):
    per_seed = [t[c] for t in all_thresh]
    final_t[c] = round(np.mean(per_seed), 2)
print(f'Thresholds: {final_t}')

y_true = train_df['target'].values
base_f1 = f1_score(y_true, np.argmax(final_oof, axis=1), average='macro')
tuned_f1 = f1_score(y_true, apply_thresholds(final_oof, final_t), average='macro')
print(f'Baseline: {base_f1:.5f}')
print(f'Tuned:    {tuned_f1:.5f}')

In [None]:
# Decode the blended test probabilities and write the submission file with
# the columns 'ID' and 'target'.
preds = apply_thresholds(final_test, final_t)
submission = test_df[['ID']].assign(target=preds)
submission.to_csv('submission.csv', index=False)
print('Submission salva!')
# Predicted class distribution, ordered by label.
print(submission['target'].value_counts().sort_index())