# SPR 2026 - BERTimbau v5 (Tratamento + Aumentação + Focal Loss)

**Score baseline (v4):** 0.79696 (CAMPEÃO!)

## Melhorias v5:
1. **Tratamento de dados:**
   - Normalização de termos médicos
   - Limpeza de caracteres especiais
   - Padronização de formatos

2. **Aumentação de dados:**
   - Oversampling das classes 5 e 6 (minoritárias) até a mediana do tamanho das classes
   - Aumentação textual por embaralhamento de sentenças nas amostras replicadas

3. **Melhores práticas:**
   - Focal Loss (gamma=2.0, alpha=0.25) - comprovado!
   - Threshold tuning para classes 5/6
   - Stratified sampling

**Meta:** Superar 0.80+ F1-Macro

---
## CONFIGURAÇÃO KAGGLE:
1. **Add Input** → **Competition** → `spr-2026-mammography-report-classification`
2. **Add Input** → **Models** → `bertimbau-ptbr-complete` (fabianofilho)
3. **Settings** → Internet → **OFF**
4. **Settings** → Accelerator → **GPU T4 x2**
---

In [None]:
# ===== SPR 2026 - BERTIMBAU v5 (TRATAMENTO + AUMENTAÇÃO) =====

print("[1/8] Configurando ambiente...")
import os
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# ========== CONFIGURATION ==========
# Hyper-parameters shared by every cell below.
SEED = 42            # seeds torch / numpy and the train/validation split
MAX_LEN = 256        # tokenizer max_length (inputs are padded/truncated to it)
BATCH_SIZE = 8       # batch size for the train, validation and test loaders
EPOCHS = 5           # number of passes over the training loader
LR = 2e-5            # AdamW learning rate
NUM_CLASSES = 7      # size of the classification head (labels 0..6)
FOCAL_GAMMA = 2.0    # focal-loss focusing exponent
FOCAL_ALPHA = 0.25   # focal-loss scalar weight

# Per-class probability thresholds used at inference time: a class whose
# softmax probability reaches its threshold gets its probability multiplied
# by 1.5 before the argmax. Classes 5 and 6 use lower thresholds.
USE_THRESHOLD_TUNING = True
THRESHOLDS = {0: 0.50, 1: 0.50, 2: 0.50, 3: 0.50, 4: 0.50, 5: 0.30, 6: 0.25}

# Directory expected to hold train.csv and test.csv.
DATA_DIR = '/kaggle/input/competitions/spr-2026-mammography-report-classification'

# ========== CHECK THE DATASET FIRST ==========
# Abort right away, with instructions, when the competition data is not
# attached, instead of failing later inside pd.read_csv.
if not os.path.exists(DATA_DIR):
    print("\n" + "="*60)
    print("ERRO: Dataset não encontrado!")
    print("="*60)
    print("\nAdicione o dataset:")
    print("Add Input -> Competition -> spr-2026-mammography-report-classification")
    raise FileNotFoundError(f"Dataset não encontrado: {DATA_DIR}")
print(f"Dataset: {DATA_DIR}")

# Auto-detectar modelo
def find_model_path(base='/kaggle/input', max_depth=10):
    """Locate the first directory under *base* that contains a ``config.json``.

    The tree is walked depth-first in sorted name order, so the result is
    deterministic across runs and file systems.

    Args:
        base: Root directory to search. The root itself is never returned,
            only its descendants.
        max_depth: Maximum recursion depth below *base*.

    Returns:
        The path of the first matching directory, or ``None`` when nothing
        matches (including when *base* does not exist or is unreadable).
    """
    def has_config(path):
        return os.path.isdir(path) and os.path.exists(os.path.join(path, 'config.json'))

    def search_dir(directory, depth=0):
        if depth > max_depth:
            return None
        try:
            # Sorted so the chosen model does not depend on listdir order.
            entries = sorted(os.listdir(directory))
        except OSError:
            # Missing or unreadable directories are skipped on purpose
            # (best-effort search); any other error is a real bug and
            # must propagate.
            return None
        for item in entries:
            path = os.path.join(directory, item)
            if not os.path.isdir(path):
                continue
            if has_config(path):
                return path
            result = search_dir(path, depth + 1)
            if result:
                return result
        return None

    return search_dir(base)

# Resolve the checkpoint directory; stop early when no model is attached.
MODEL_PATH = find_model_path()
if MODEL_PATH is None:
    raise FileNotFoundError("Adicione o modelo BERTimbau ao notebook!")

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Seed the NumPy and PyTorch generators with the shared SEED.
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f'Device: {device}')
print(f'Model: {MODEL_PATH}')

In [None]:
# ========== FUNÇÕES DE TRATAMENTO DE DADOS ==========
print("\n[2/8] Definindo funções de tratamento...")

def clean_text(text):
    """Return *text* as a single-spaced string without control characters.

    Missing values (``None`` / NaN) become the empty string; any other input
    is converted with ``str`` first.
    """
    if pd.isna(text):
        return ""
    text = str(text)
    # Collapse every whitespace run (tabs and newlines included) to one space.
    text = re.sub(r'\s+', ' ', text)
    # Drop the remaining (non-whitespace) control characters.
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)
    # Removing a control character that sat between two spaces leaves a
    # double space behind, so collapse once more.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

def normalize_medical_terms(text):
    """Rewrite common mammography spelling variants to one canonical form.

    Every rule is a case-insensitive regex substitution. The rules run in
    the listed order; the second BI-RADS rule relies on the first one having
    already produced the ``BIRADS`` token.
    """
    rules = (
        # BI-RADS spelling variants, then a single space before the category digit.
        (r'bi-?rads?', 'BIRADS'),
        (r'birads\s*(\d)', r'BIRADS \1'),
        # Anatomical terms and quadrant abbreviations.
        (r'mama\s*(esq|esquerda|dir|direita)', r'mama \1'),
        (r'qse|quadrante\s*superior\s*externo', 'quadrante superior externo'),
        (r'qsi|quadrante\s*superior\s*interno', 'quadrante superior interno'),
        (r'qie|quadrante\s*inferior\s*externo', 'quadrante inferior externo'),
        (r'qii|quadrante\s*inferior\s*interno', 'quadrante inferior interno'),
        # Findings.
        (r'micro-?calcifica[çc][õo]es', 'microcalcificações'),
        (r'n[oó]dulo', 'nódulo'),
        (r'assimetria\s*focal', 'assimetria focal'),
        (r'distorção\s*arquitetural', 'distorção arquitetural'),
    )

    normalized = text
    for pattern, replacement in rules:
        normalized = re.sub(pattern, replacement, normalized, flags=re.IGNORECASE)
    return normalized

def preprocess_text(text):
    """Run the full pipeline: basic cleaning, then medical-term normalization."""
    return normalize_medical_terms(clean_text(text))

print("Funções de tratamento definidas!")

In [None]:
# ========== LOAD AND PREPROCESS DATA ==========
print("\n[3/8] Carregando e tratando dados...")

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Auto-detect the column names from a short list of known candidates.
TEXT_COL = next((c for c in ['report', 'text', 'laudo'] if c in train_df.columns), None)
LABEL_COL = next((c for c in ['target', 'label', 'birads'] if c in train_df.columns), None)
ID_COL = next((c for c in ['ID', 'id', 'Id'] if c in test_df.columns), None)

print(f"Colunas: texto={TEXT_COL}, label={LABEL_COL}, id={ID_COL}")
print(f"Train: {train_df.shape} | Test: {test_df.shape}")

# Fail fast with an actionable message: indexing a DataFrame with None would
# otherwise surface much later as an opaque "KeyError: None".
missing_roles = [
    role
    for role, col in (('texto', TEXT_COL), ('label', LABEL_COL), ('id', ID_COL))
    if col is None
]
if missing_roles:
    raise ValueError(
        f"Colunas não encontradas ({', '.join(missing_roles)}). "
        f"Train: {list(train_df.columns)} | Test: {list(test_df.columns)}"
    )
if TEXT_COL not in test_df.columns:
    raise ValueError(f"Coluna de texto '{TEXT_COL}' ausente em test.csv")

# Apply the text preprocessing pipeline to both splits.
print("\nAplicando tratamento de texto...")
train_df['text_processed'] = train_df[TEXT_COL].apply(preprocess_text)
test_df['text_processed'] = test_df[TEXT_COL].apply(preprocess_text)

# Show one before/after example. str() keeps the preview working when the
# first raw report is NaN, and the guard covers an empty training file.
if len(train_df) > 0:
    print("\nExemplo de tratamento:")
    print(f"Original: {str(train_df[TEXT_COL].iloc[0])[:200]}...")
    print(f"Tratado:  {train_df['text_processed'].iloc[0][:200]}...")

# Class distribution.
print("\nDistribuição de classes:")
print(train_df[LABEL_COL].value_counts().sort_index())

In [None]:
# ========== AUMENTAÇÃO DE DADOS ==========
print("\n[4/8] Aumentação de dados para classes minoritárias...")

def augment_text_simple(text):
    """Return *text* with its sentences in a random order.

    Sentences are split on periods that are not followed by a digit, so
    decimal numbers such as ``1.5`` stay intact. Empty fragments (for example
    the one after the final period) are discarded, and the result is
    re-joined with ``'. '`` plus a closing period. Texts with fewer than two
    sentences are returned unchanged.

    Shuffling uses the global NumPy RNG, so it follows ``np.random.seed``.
    """
    fragments = (piece.strip() for piece in re.split(r'\.(?!\d)', text))
    sentences = [piece for piece in fragments if piece]
    if len(sentences) < 2:
        return text
    np.random.shuffle(sentences)
    return '. '.join(sentences) + '.'

# Minority classes (5 and 6) are oversampled up to the median class size.
class_counts = train_df[LABEL_COL].value_counts()
minority_classes = [5, 6]
target_count = int(class_counts.median())  # target: median class size

print(f"Classes minoritárias: {minority_classes}")
print(f"Target count: {target_count}")

# Build the extra rows by cycling through each minority class and shuffling
# the sentences of every copy.
# NOTE: every extra row is a copy of an existing report, so any later
# train/validation split must keep a copy and its source on the same side.
augmented_rows = []
for cls in minority_classes:
    cls_data = train_df[train_df[LABEL_COL] == cls]
    current_count = len(cls_data)
    if current_count == 0:
        # Nothing to replicate; without this guard `i % current_count`
        # below raises ZeroDivisionError.
        print(f"Classe {cls}: sem amostras, aumentação ignorada")
        continue
    needed = target_count - current_count

    if needed > 0:
        for i in range(needed):
            row = cls_data.iloc[i % current_count].copy()
            row['text_processed'] = augment_text_simple(row['text_processed'])
            augmented_rows.append(row)
        print(f"Classe {cls}: +{needed} amostras aumentadas")

# Append the augmented rows after the original ones.
if augmented_rows:
    augmented_df = pd.DataFrame(augmented_rows)
    train_augmented = pd.concat([train_df, augmented_df], ignore_index=True)
    print(f"\nTrain original: {len(train_df)} | Train aumentado: {len(train_augmented)}")
else:
    train_augmented = train_df
    print("Nenhuma aumentação necessária")

# Class distribution after augmentation.
print("\nNova distribuição:")
print(train_augmented[LABEL_COL].value_counts().sort_index())

In [None]:
# ========== DATASET E DATALOADER ==========
print("\n[5/8] Preparando DataLoaders...")

class MammographyDataset(Dataset):
    """Tokenize report texts on the fly for sequence classification.

    Args:
        texts: Iterable of report strings (list, NumPy array, pandas Series).
        labels: Iterable of integer class ids aligned with *texts*, or
            ``None`` for unlabeled (inference) data.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_len: Sequence length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialize as lists so indexing is always positional; a pandas
        # Series with a non-default index would otherwise be looked up by
        # label in __getitem__.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # squeeze(0) removes only the leading batch axis added by
        # return_tensors='pt'; a bare squeeze() would also collapse the
        # sequence axis when max_len == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load the tokenizer from the local checkpoint (the notebook runs offline).
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)

# Full augmented arrays, still exposed under their original names.
texts = train_augmented['text_processed'].values
labels = train_augmented[LABEL_COL].values

# Leakage-free stratified split.
# `train_augmented` holds the original rows first, followed by sentence-
# shuffled copies of minority-class reports. Splitting that frame directly
# puts copies of one report on both sides, which inflates validation F1 and
# biases checkpoint selection. Instead: split the ORIGINAL rows only, then
# add to the training side just the copies whose source report was not held
# out. Validation therefore contains only untouched, original reports.
# NOTE: stratification now needs at least two original rows per class.
n_original = len(train_df)
original_rows = train_augmented.iloc[:n_original]
synthetic_rows = train_augmented.iloc[n_original:]

train_rows, val_rows = train_test_split(
    original_rows,
    test_size=0.1,
    random_state=SEED,
    stratify=original_rows[LABEL_COL],
)

# Copies keep the raw report text of their source row, so the raw text
# identifies which copies belong to held-out reports.
held_out_reports = set(val_rows[TEXT_COL])
leaks_into_val = synthetic_rows[TEXT_COL].isin(held_out_reports)
train_rows = pd.concat([train_rows, synthetic_rows[~leaks_into_val]], ignore_index=True)

X_train = train_rows['text_processed'].values
y_train = train_rows[LABEL_COL].values
X_val = val_rows['text_processed'].values
y_val = val_rows[LABEL_COL].values

train_dataset = MammographyDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = MammographyDataset(X_val, y_val, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

print(f"Train: {len(train_dataset)} | Val: {len(val_dataset)}")

In [None]:
# ========== FOCAL LOSS ==========
print("\n[6/8] Configurando modelo e Focal Loss...")

class FocalLoss(nn.Module):
    """Multi-class focal loss: ``alpha * (1 - p_t) ** gamma * CE``.

    Args:
        alpha: Either a scalar applied uniformly to every sample, or a
            sequence/tensor with one weight per class (length
            ``num_classes``), in which case each sample is weighted by the
            entry of its target class.
        gamma: Focusing exponent; larger values down-weight examples the
            model already classifies confidently.
        num_classes: Number of classes; used to validate a per-class alpha.

    Raises:
        ValueError: If a per-class ``alpha`` does not have ``num_classes``
            entries.
    """

    def __init__(self, alpha=0.25, gamma=2.0, num_classes=7):
        super().__init__()
        self.gamma = gamma
        self.num_classes = num_classes

        alpha_tensor = torch.as_tensor(alpha, dtype=torch.float32)
        if alpha_tensor.ndim == 0:
            # Scalar weight: stored as given.
            self.alpha = alpha
        else:
            alpha_tensor = alpha_tensor.flatten()
            if alpha_tensor.numel() != num_classes:
                raise ValueError(
                    f"alpha must have {num_classes} entries, got {alpha_tensor.numel()}"
                )
            # Buffer, so the weights follow the module across devices.
            self.register_buffer('alpha', alpha_tensor)

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits *inputs* and class ids *targets*."""
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        # CE is -log(p_t), so exp(-CE) recovers the true-class probability.
        pt = torch.exp(-ce_loss)
        if torch.is_tensor(self.alpha) and self.alpha.ndim > 0:
            alpha_factor = self.alpha.to(inputs.device)[targets]
        else:
            alpha_factor = self.alpha
        focal_loss = alpha_factor * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()

# Load the local checkpoint with a NUM_CLASSES-way classification head and
# move it to the selected device in one expression.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=NUM_CLASSES,
    local_files_only=True,
    ignore_mismatched_sizes=True
).to(device)

# AdamW with a linear schedule whose first 10% of steps are warm-up.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Training criterion.
criterion = FocalLoss(alpha=FOCAL_ALPHA, gamma=FOCAL_GAMMA, num_classes=NUM_CLASSES)
print(f"Focal Loss: alpha={FOCAL_ALPHA}, gamma={FOCAL_GAMMA}")
print("Modelo carregado!")

In [None]:
# ========== TRAINING ==========
print("\n[7/8] Treinando modelo...")

best_f1 = 0
best_model_state = None

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0

    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}')
    for batch in pbar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `targets` so the module-level `labels` array is not clobbered.
        targets = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, targets)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    # Validation: plain argmax predictions scored with macro F1.
    model.eval()
    val_preds, val_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=-1).cpu().numpy()

            val_preds.extend(preds)
            val_labels.extend(batch['labels'].numpy())

    val_f1 = f1_score(val_labels, val_preds, average='macro')
    avg_loss = train_loss / len(train_loader)

    print(f"Epoch {epoch+1}: Loss={avg_loss:.4f} | Val F1-Macro={val_f1:.5f}")

    # Also snapshot on the first epoch, so there is always a state to
    # restore even if macro F1 never rises above 0.
    if val_f1 > best_f1 or best_model_state is None:
        best_f1 = val_f1
        # state_dict() returns references to the live parameter tensors and
        # dict.copy() is shallow, so the snapshot would silently follow later
        # epochs. Clone every tensor (on CPU, to spare GPU memory) to freeze
        # the weights of this epoch.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"  -> Melhor modelo! F1={best_f1:.5f}")

# Restore the best checkpoint (none exists when EPOCHS == 0).
if best_model_state is not None:
    model.load_state_dict(best_model_state)
print(f"\nMelhor F1-Macro: {best_f1:.5f}")

In [None]:
# ========== INFERENCE WITH THRESHOLD TUNING ==========
print("\n[8/8] Gerando predições com threshold tuning...")

test_texts = test_df['text_processed'].values
test_dataset = MammographyDataset(test_texts, None, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Collect one softmax probability row per test report.
model.eval()
all_probs = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Inferência"):
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        all_probs.extend(F.softmax(logits, dim=-1).cpu().numpy())

all_probs = np.array(all_probs)

# Decision rule: plain argmax, or argmax after boosting by 1.5x every class
# whose probability reaches its configured threshold.
if not USE_THRESHOLD_TUNING:
    predictions = np.argmax(all_probs, axis=1)
else:
    print("\nAplicando threshold tuning...")
    predictions = []
    for row in all_probs:
        boosted = row.copy()  # keep the raw probabilities untouched
        for cls, threshold in THRESHOLDS.items():
            if boosted[cls] >= threshold:
                boosted[cls] *= 1.5  # boost
        predictions.append(np.argmax(boosted))

# Build and save the submission file (id column + predicted label).
submission = pd.DataFrame({ID_COL: test_df[ID_COL], LABEL_COL: predictions})
submission.to_csv('submission.csv', index=False)

print(f"\nSubmission salva: submission.csv")
print(submission.head())
print(f"\nDistribuição das predições:")
print(pd.Series(predictions).value_counts().sort_index())