# SPR 2026 - ClinicalBERT (Classificação Direta)

**Modelo:** medicalai/ClinicalBERT

**Características:**
- Modelo BERT encoder adaptado para texto clínico
- Classificação direta (não generativa)
- Mais eficiente para classificação de categorias fixas
- Fine-tuning similar ao BERTimbau

**Abordagem:** Fine-tuning para classificação (7 classes)

---
## CONFIGURAÇÃO KAGGLE:
1. **Add Input** → **Models** → `clinicalbert` ou similar (o notebook usa o primeiro diretório em `/kaggle/input` que contenha um `config.json`)
2. **Add Input** → **Competition** → `spr-2026-mammography-report-classification`
3. **Settings** → Internet → **OFF**, GPU → **T4 x2**
---

In [None]:
# ===== CLINICALBERT - CLASSIFICAÇÃO DIRETA =====

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Banner so the cell output identifies which experiment produced it.
print("="*60)
print("SPR 2026 - ClinicalBERT (Fine-tuning)")
print("="*60)

# ===== CONFIG =====
SEED = 42  # seeds torch and NumPy below; also the random_state of the train/val split
MAX_LEN = 256  # max_length handed to the tokenizer (truncation + 'max_length' padding)
BATCH_SIZE = 8  # shared by the train, validation and test DataLoaders
EPOCHS = 5  # number of passes over the training loader
LR = 2e-5  # AdamW learning rate
NUM_CLASSES = 7  # num_labels of the sequence-classification head
FOCAL_GAMMA = 2.0  # exponent applied to (1 - pt) inside FocalLoss
FOCAL_ALPHA = 0.25  # constant factor multiplying every per-sample focal term
# Per-class probability thresholds consumed by apply_thresholds(). With its
# current class list only the entries for classes 6 and 5 are read; the 0.50
# entries for classes 0-4 are not consulted.
THRESHOLDS = {0: 0.50, 1: 0.50, 2: 0.50, 3: 0.50, 4: 0.50, 5: 0.30, 6: 0.25}

# Directory that train.csv and test.csv are read from.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Seed both RNGs used in this notebook (torch and NumPy).
torch.manual_seed(SEED)
np.random.seed(SEED)

def find_model_path(base='/kaggle/input', max_depth=10):
    """Return the first directory under *base* that contains a ``config.json``.

    The tree is searched depth-first. Entries at each level are visited in
    sorted name order so the same inputs always yield the same directory
    (``os.listdir`` alone returns entries in arbitrary order).

    Args:
        base: Root directory to search. Defaults to the Kaggle input mount.
        max_depth: Maximum recursion depth below *base*; deeper directories
            are not listed.

    Returns:
        The path of the first matching directory, or ``None`` when nothing
        matches or *base* cannot be listed.
    """
    def search_dir(directory, depth=0):
        if depth > max_depth:
            return None
        try:
            entries = sorted(os.listdir(directory))
        except OSError:
            # Missing or unreadable directory: skip this subtree only, but do
            # not hide unrelated errors the way a bare ``except`` would.
            return None
        for name in entries:
            path = os.path.join(directory, name)
            if not os.path.isdir(path):
                continue
            if os.path.exists(os.path.join(path, 'config.json')):
                return path
            found = search_dir(path, depth + 1)
            if found:
                return found
        return None

    return search_dir(base)

# Resolve the offline model directory once; the tokenizer and the model are
# both loaded from it further down.
MODEL_PATH = find_model_path()
if MODEL_PATH is None:
    # Fail here with an actionable message instead of passing None to
    # from_pretrained() later and getting an unrelated-looking error.
    raise FileNotFoundError(
        "No directory containing config.json was found under /kaggle/input; "
        "attach the model via Add Input -> Models."
    )
print(f"Device: {device} | Model: {MODEL_PATH}")

In [None]:
# ===== FOCAL LOSS =====
class FocalLoss(nn.Module):
    """Focal loss built on top of per-sample cross-entropy.

    Each sample's cross-entropy is scaled by ``alpha * (1 - pt) ** gamma``,
    where ``pt = exp(-cross_entropy)``, and the scaled values are averaged.

    Args:
        alpha: Constant factor applied to every sample's term.
        gamma: Exponent applied to ``(1 - pt)``.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits ``inputs`` and class ids ``targets``."""
        per_sample_ce = F.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) recovers the probability assigned to the target class.
        target_prob = torch.exp(-per_sample_ce)
        modulation = (1 - target_prob) ** self.gamma
        weighted = self.alpha * modulation * per_sample_ce
        return weighted.mean()

# Loss object used by the training loop, built from the CONFIG constants.
criterion = FocalLoss(gamma=FOCAL_GAMMA, alpha=FOCAL_ALPHA)

In [None]:
# ===== DATASET =====
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Optional indexable collection of integer class ids. When
            ``None`` (e.g. the test set) items carry no ``'labels'`` key.
        tokenizer: Callable invoked with ``truncation``, ``max_length``,
            ``padding`` and ``return_tensors='pt'``; it must return a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors that carry
            a leading batch axis of size 1.
        max_len: Length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
        )
        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever it has length 1.
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Read the competition CSVs and load the tokenizer from the local model dir.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)

# Hold out 10% of the training rows for validation, stratified by target.
all_texts = train_df['report'].values
all_labels = train_df['target'].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.1,
    random_state=SEED,
    stratify=train_df['target'],
)

# The test split has no labels, so its items carry no 'labels' key.
train_ds = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_ds = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)
test_ds = TextDataset(test_df['report'].values, None, tokenizer, MAX_LEN)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

print(f"Train: {len(train_ds)}, Val: {len(val_ds)}, Test: {len(test_ds)}")

In [None]:
# ===== TREINAR =====
# Fine-tune the sequence-classification model with the focal-loss criterion.
print("\nTreinando ClinicalBERT...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=NUM_CLASSES,
    local_files_only=True,
)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
# Linear decay over every optimizer step of the run, with no warmup.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in progress:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
        # The scheduler advances once per batch, matching total_steps above.
        scheduler.step()
        total_loss += loss.item()
    # Mean training loss over the epoch's batches.
    print(f"  Loss: {total_loss/len(train_loader):.4f}")

In [None]:
# ===== AVALIAR =====
def apply_thresholds(probs, thresholds, priority_classes=(6, 5)):
    """Turn class probabilities into labels, with threshold overrides.

    Each row starts from its argmax class. The row is then reassigned to the
    first class in *priority_classes* whose probability reaches that class's
    threshold, so earlier classes win when several qualify.

    Args:
        probs: Array-like of shape ``(n_samples, n_classes)``.
        thresholds: Mapping from class index to minimum probability. Only the
            entries for *priority_classes* are read.
        priority_classes: Class indices that may override the argmax, in
            decreasing priority. Defaults to ``(6, 5)``.

    Returns:
        1-D integer NumPy array of predicted class indices, one per row.
    """
    probs = np.asarray(probs)
    if probs.size == 0:
        # Keep an integer dtype for empty input as well.
        return np.empty(0, dtype=np.intp)
    preds = probs.argmax(axis=1)
    # Write the lowest-priority override first so that higher-priority classes
    # overwrite it, reproducing "first match wins" row by row.
    for cls in reversed(tuple(priority_classes)):
        preds[probs[:, cls] >= thresholds[cls]] = cls
    return preds

# Score the held-out split: plain argmax versus threshold-adjusted labels.
model.eval()
val_prob_chunks = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        logits = model(input_ids, attention_mask=attention_mask).logits
        # Softmax over the class axis, then move to NumPy on the CPU.
        val_prob_chunks.append(F.softmax(logits, dim=-1).cpu().numpy())

# Stack the per-batch arrays into one (n_val, n_classes) matrix.
val_probs = np.vstack(val_prob_chunks)

baseline_preds = val_probs.argmax(axis=1)
tuned_preds = apply_thresholds(val_probs, THRESHOLDS)

print(f"Baseline F1: {f1_score(val_labels, baseline_preds, average='macro'):.5f}")
print(f"Threshold F1: {f1_score(val_labels, tuned_preds, average='macro'):.5f}")

In [None]:
# ===== SUBMISSION =====
# Predict the test split and write the submission file.
# Switch to evaluation mode here too, so this cell gives dropout-free
# predictions even when it is run straight after training without the
# evaluation cell.
model.eval()
test_probs = []
with torch.no_grad():
    for batch in test_loader:
        outputs = model(batch['input_ids'].to(device), attention_mask=batch['attention_mask'].to(device))
        test_probs.append(F.softmax(outputs.logits, dim=-1).cpu().numpy())

# Stack the per-batch arrays into one matrix, then apply the same threshold
# rule that was used on the validation split.
test_probs = np.vstack(test_probs)
predictions = apply_thresholds(test_probs, THRESHOLDS)

submission = pd.DataFrame({'ID': test_df['ID'], 'target': predictions})
submission.to_csv('submission.csv', index=False)
print(f"\nSubmission salva!")
# Predicted class distribution, as a quick sanity check of the output.
print(submission['target'].value_counts().sort_index())

## Comparação: ClinicalBERT vs BERTimbau

| Aspecto | ClinicalBERT | BERTimbau |
|---------|--------------|-----------|
| Idioma | Inglês (clínico) | Português |
| Domínio | Textos clínicos | Geral PT-BR |
| Arquitetura | BERT encoder | BERT encoder |
| Vantagem | Terminologia médica em inglês | Língua nativa dos laudos |

**Hipótese:** o BERTimbau pode ter vantagem por entender português, a língua dos laudos, enquanto o ClinicalBERT conhece a terminologia médica, porém em inglês.