# SPR 2026 - BERTimbau + Focal Loss - Treated v1

**Texto tratado: Stop words + Lowercase**

Base: BERTimbau + Focal Loss (0.79696)
Tratamento: Remoção de stop words + lowercase

---
## 📥 MODELO - `models/download_bertimbau.ipynb`

**Kaggle Models:** Add Input → Models → `bertimbau-ptbr-complete` (fabianofilho)

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Settings → Accelerator → **GPU T4 x2**

---

In [None]:
# ===== SETUP E IMPORTS =====
print("[1/9] Configurando ambiente...")
import os
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed shared by the NumPy/PyTorch RNGs and the train/validation split.
SEED = 42
# Every report is truncated or padded to this many tokens.
MAX_LEN = 256
# Mini-batch size used by the train, validation and test loaders.
BATCH_SIZE = 8
# Number of full passes over the training split.
EPOCHS = 5
# Learning rate handed to AdamW (the peak of the warmup/linear-decay schedule).
LR = 2e-5
# Number of labels requested for the classification head.
NUM_CLASSES = 7
# Focal-loss focusing exponent (gamma) and scalar weight (alpha).
FOCAL_GAMMA = 2.0
FOCAL_ALPHA = 0.25

# Competition dataset root; train.csv and test.csv are read from here.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# ==== AUTO-DETECT THE MODEL PATH ====
def find_model_path(base='/kaggle/input', max_depth=10):
    """Return the first directory under `base` that contains a config.json.

    The tree is walked depth-first. Entries are visited in sorted order so
    the result is deterministic even when several candidates exist
    (`os.listdir` makes no ordering guarantee).

    Args:
        base: Root directory to search. `base` itself is never returned,
            only directories below it.
        max_depth: Maximum recursion depth; listing stops once the depth
            counter exceeds this value.

    Returns:
        The path of the first matching directory, or None when nothing
        matches or `base` cannot be listed.
    """
    # NOTE(review): any directory holding a config.json wins, not necessarily
    # the BERTimbau one - confirm no other attached input ships a config.json.
    def search_dir(directory, depth):
        if depth > max_depth:
            return None
        try:
            entries = sorted(os.listdir(directory))
        except OSError:
            # Unreadable or missing directory: treat as "nothing here" but
            # let unrelated errors (e.g. KeyboardInterrupt) propagate.
            return None
        for name in entries:
            path = os.path.join(directory, name)
            if not os.path.isdir(path):
                continue
            if os.path.exists(os.path.join(path, 'config.json')):
                return path
            found = search_dir(path, depth + 1)
            if found:
                return found
        return None

    return search_dir(base, 0)

# Resolve the checkpoint directory up front and abort when none is attached.
MODEL_PATH = find_model_path()
if MODEL_PATH is None:
    raise FileNotFoundError("Adicione o modelo BERTimbau ao notebook!")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}\nModel: {MODEL_PATH}')

# Seed both RNGs with the shared seed.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# ==== PREPROCESSAMENTO DE TEXTO ====
print("[2/9] Definindo preprocessamento...")

# Portuguese stop words (customised for the medical domain).
# NOTE(review): 'não' and 'sem' are negations; dropping them turns
# "não há nódulos" into "há nódulos". Confirm this is intended.
STOP_WORDS = {
    'a', 'o', 'e', 'de', 'da', 'do', 'em', 'um', 'uma', 'para', 'com', 'não',
    'na', 'no', 'os', 'as', 'que', 'se', 'por', 'mais', 'como', 'mas', 'foi',
    'ao', 'ele', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'ter', 'seus',
    'quem', 'nas', 'me', 'esse', 'eles', 'estão', 'você', 'tinha', 'foram',
    'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'têm', 'numa', 'pelos',
    'elas', 'havia', 'seja', 'qual', 'será', 'nós', 'tenho', 'lhe', 'deles',
    'essas', 'esses', 'pelas', 'este', 'fosse', 'dele', 'tu', 'te', 'vocês',
    'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso',
    'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas',
    'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou',
    'está', 'estamos', 'estive', 'esteve', 'estivemos', 'estiveram',
    'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja',
    'haja', 'hajamos', 'hajam', 'houve', 'houvemos', 'houveram', 'houvera',
    'houverá', 'haveria', 'houvesse', 'houvéssemos', 'houvessem', 'houver',
    'houvermos', 'houverem', 'houverei', 'houveremos', 'houveriam', 'sou',
    'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'fomos', 'seremos',
    'seria', 'seriam', 'fôssemos', 'fossem', 'for', 'formos', 'forem',
    'serei', 'seríamos', 'tenha', 'tenhamos', 'tenham', 'teve', 'tivemos',
    'tiveram', 'tivera', 'tivéramos', 'tiver',
    'tivermos', 'tiverem', 'tivesse', 'tivéssemos', 'tivessem', 'terá',
    # Words that are common in reports but not relevant for classification.
    'laudo', 'exame', 'paciente', 'data', 'realizado', 'realizada', 'médico',
    'dr', 'dra', 'solicitante', 'hospital', 'clínica', 'mamografia', 'bilateral',
}

# DO NOT REMOVE: important medical terms. These always survive filtering,
# even if they are later added to STOP_WORDS by mistake.
KEEP_WORDS = {
    'birads', 'bi-rads', 'calcificação', 'calcificações', 'nódulo', 'nódulos',
    'massa', 'massas', 'assimetria', 'assimetrias', 'benigno', 'benigna',
    'maligno', 'maligna', 'suspeito', 'suspeita', 'atípico', 'atípica',
    'microcalcificação', 'microcalcificações', 'densidade', 'espiculado',
    'irregular', 'circunscrito', 'obscurecido', 'linear', 'segmentar',
    'regional', 'difuso', 'heterogêneo', 'homogêneo', 'categoria', 'achado',
    'negativo', 'positivo', 'provavelmente', 'altamente', 'sugestivo',
}

# Compiled once; the text is lowercased before matching, so no IGNORECASE.
_BIRADS_RE = re.compile(r'bi[\-\s]?rads?')
_NON_WORD_RE = re.compile(r'[^\w\s]')


def preprocess_text(text):
    """Lowercase a report, normalise BI-RADS spellings and drop stop words.

    Steps: lowercase; rewrite "bi-rads" / "bi rads" / "birad" as "birads";
    replace every character that is neither a word character nor whitespace
    with a space; split on whitespace; drop stop words and single-character
    tokens.

    Single-character tokens made of digits are kept, so numbers such as the
    "2" in "categoria 2" survive. Tokens in KEEP_WORDS are always kept.

    Args:
        text: The raw report. Missing values (None / NaN) yield "". Any
            other non-string value is converted with `str()` first.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()
    text = _BIRADS_RE.sub('birads', text)
    # Punctuation becomes a space so that "2, achado" still splits cleanly.
    text = _NON_WORD_RE.sub(' ', text)

    kept = [
        word
        for word in text.split()
        if word in KEEP_WORDS
        or (word not in STOP_WORDS and (len(word) > 1 or word.isdigit()))
    ]
    return ' '.join(kept)

# Smoke test: show one sentence before and after preprocessing.
sample = (
    "A paciente realizou exame de mamografia bilateral. "
    "BI-RADS categoria 2, achado benigno."
)
print("Original: " + sample)
print("Tratado:  " + preprocess_text(sample))

In [None]:
# ==== FOCAL LOSS ====
print("[3/9] Definindo Focal Loss...")

class FocalLoss(nn.Module):
    """Multi-class focal loss computed from per-sample cross-entropy.

    For each sample: ``alpha * (1 - pt) ** gamma * ce``, where ``ce`` is the
    cross-entropy of the sample and ``pt = exp(-ce)`` is the probability the
    model assigns to the true class.

    Args:
        alpha: Either a scalar applied to every sample (the original
            behaviour), or a sequence/tensor holding one weight per class,
            in which case each sample is scaled by the weight of its target
            class.
        gamma: Focusing exponent; 0 reduces the modulating factor to 1.
        reduction: 'mean' or 'sum'; any other value returns the unreduced
            per-sample losses.
    """

    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        super().__init__()
        if not isinstance(alpha, (int, float)):
            # Per-class weights: keep them as a float tensor for indexing.
            alpha = torch.as_tensor(alpha, dtype=torch.float)
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for logits `inputs` and class-index `targets`."""
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)

        if torch.is_tensor(self.alpha) and self.alpha.dim() > 0:
            # Pick each sample's class weight; move the table to the logits'
            # device because the criterion itself is never `.to(device)`-ed.
            alpha = self.alpha.to(device=inputs.device, dtype=ce_loss.dtype)[targets]
        else:
            alpha = self.alpha
        focal_loss = alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

# Loss used by the training loop, configured from the notebook constants.
criterion = FocalLoss(gamma=FOCAL_GAMMA, alpha=FOCAL_ALPHA)
print('Focal Loss: gamma={}, alpha={}'.format(FOCAL_GAMMA, FOCAL_ALPHA))

In [None]:
# ==== LOAD AND PREPROCESS DATA ====
print("[4/9] Carregando e preprocessando dados...")
train_df, test_df = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)

# Add the cleaned-text column that the datasets below are built from.
train_df['report_treated'] = train_df['report'].map(preprocess_text)
test_df['report_treated'] = test_df['report'].map(preprocess_text)

print('Train: {}, Test: {}'.format(train_df.shape, test_df.shape))
# Show the first report before and after treatment (first 200 characters).
print('\nExemplo original:\n{}...'.format(train_df['report'].iloc[0][:200]))
print('\nExemplo tratado:\n{}...'.format(train_df['report_treated'].iloc[0][:200]))

In [None]:
# ==== DATASET CLASS ====
print("[5/9] Preparando dataset...")

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of texts; each item is passed through
            `str()` before tokenization.
        labels: Optional indexable collection of integer class ids aligned
            with `texts`. When None, items carry no 'labels' entry.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding='max_length',
            return_tensors='pt')`` that returns a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_len: Length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the 'input_ids'/'attention_mask' (and 'labels') tensors for item `idx`."""
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
        )
        # Drop only the leading batch dimension added by return_tensors='pt'.
        # A bare squeeze() would also collapse the sequence axis when
        # max_len == 1 and hand the collate function 0-d tensors.
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load the tokenizer from the local checkpoint directory only.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
print('Tokenizer carregado!')

# Stratified 90/10 hold-out split over the TREATED reports.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['report_treated'].values,
    train_df['target'].values,
    stratify=train_df['target'],
    test_size=0.1,
    random_state=SEED,
)

# The test set has no labels, so its items carry no 'labels' entry.
train_ds = TextDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=MAX_LEN)
val_ds = TextDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=MAX_LEN)
test_ds = TextDataset(texts=test_df['report_treated'].values, labels=None, tokenizer=tokenizer, max_len=MAX_LEN)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=BATCH_SIZE)

print('Train: {}, Val: {}, Test: {}'.format(len(train_ds), len(val_ds), len(test_ds)))

In [None]:
# ==== MODEL ====
print("[6/9] Carregando modelo...")
# Load the local checkpoint with a NUM_CLASSES-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH, local_files_only=True, num_labels=NUM_CLASSES
)
model.to(device)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR)
# One scheduler step per batch: warm up over the first 10% of all steps,
# then decay linearly.
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=int(0.1 * total_steps),
)

print('Modelo carregado! Parametros: {:,}'.format(sum(p.numel() for p in model.parameters())))

In [None]:
# ==== TREINAMENTO ====
print("[7/9] Treinando modelo...")

def evaluate(model, loader):
    """Run `model` over `loader` in eval mode with gradients disabled.

    Returns the macro-averaged F1 score when the batches carry a 'labels'
    entry; otherwise returns the list of predicted class indices.
    """
    model.eval()
    predicted, gold = [], []
    with torch.no_grad():
        for batch in loader:
            inputs = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask')
            }
            logits = model(**inputs).logits
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            if 'labels' in batch:
                gold.extend(batch['labels'].numpy())
    if not gold:
        # Unlabelled loader (test set): hand back the raw predictions.
        return predicted
    return f1_score(gold, predicted, average='macro')

# Start below any reachable macro-F1 so the first epoch always writes a
# checkpoint. With 0 and a strict '>', a run whose validation F1 stays at 0
# never saved a model and the prediction cell then failed on load.
best_f1 = -1.0
for epoch in range(EPOCHS):
    model.train()  # evaluate() leaves the model in eval mode
    total_loss = 0
    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}')

    for batch in pbar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Labels are not passed to the model; the loss comes from `criterion`
        # applied to the raw logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the schedule is stepped once per batch

        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    val_f1 = evaluate(model, val_loader)
    print(f'Epoch {epoch+1}: Loss={total_loss/len(train_loader):.4f}, Val F1={val_f1:.4f}')

    # Keep only the weights of the best-scoring epoch on the validation split.
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), '/kaggle/working/best_model.pt')
        print(f'  -> Melhor modelo salvo! F1={best_f1:.4f}')

print(f'\nMelhor F1 validação: {best_f1:.4f}')

In [None]:
# ==== PREDICTION ====
print("[8/9] Gerando predições...")
# Restore the best checkpoint. map_location places the tensors on the current
# device, so the load also works when the checkpoint was saved on another one
# (e.g. saved on GPU, reloaded in a CPU-only session).
model.load_state_dict(
    torch.load('/kaggle/working/best_model.pt', map_location=device)
)
# The test loader has no labels, so evaluate() returns the predicted classes.
predictions = evaluate(model, test_loader)
print(f'Predições geradas: {len(predictions)}')

In [None]:
# ==== SUBMISSION ====
print("[9/9] Criando submissão...")
# One row per test report: its ID and the predicted class.
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'target': predictions,
})
submission.to_csv('/kaggle/working/submission.csv', index=False)
print('✅ Submissão salva!')
# Class histogram of the predictions as a quick sanity check.
print('\nDistribuição das predições:')
print(submission['target'].value_counts().sort_index())