# SPR 2026 - BERTimbau (BERT Português)

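Fine-tuning do BERTimbau para classificação BI-RADS (7 classes) de laudos de mamografia, com validação cruzada estratificada e ensemble dos modelos de cada fold.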

**Modelo:** neuralmind/bert-base-portuguese-cased

**Formato:** Code Competition (Kaggle) / Google Colab

In [11]:
# ============================================================
# SETUP - Ambiente e Dados
# ============================================================
import os
import sys

# Runtime detection. Colab is probed first (considered the more reliable
# signal); Kaggle is only accepted once Colab has been ruled out.
IS_COLAB = 'google.colab' in sys.modules
IS_KAGGLE = (not IS_COLAB) and os.path.exists('/kaggle/input')

if IS_KAGGLE:
    _env_label = 'Kaggle'
elif IS_COLAB:
    _env_label = 'Colab'
else:
    _env_label = 'Local'
print(f"Ambiente: {_env_label}")

# Resolve the input/output directories for the detected runtime.
if IS_KAGGLE:
    DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
    OUTPUT_DIR = '/kaggle/working'
elif IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

    DRIVE_BASE = '/content/drive/MyDrive/SPR_2026_outputs'
    DATA_DIR = f'{DRIVE_BASE}/data'
    OUTPUT_DIR = DRIVE_BASE

    # Fail early when the training data has not been placed on Drive yet.
    _train_csv_present = os.path.exists(f'{DATA_DIR}/train.csv')
    if not _train_csv_present:
        print("⚠️ Dados não encontrados no Drive!")
        print("Execute primeiro o notebook 00_download_data.ipynb")
        raise FileNotFoundError(f"Arquivo não encontrado: {DATA_DIR}/train.csv")
else:
    DATA_DIR = '../data'
    OUTPUT_DIR = '../submissions'
    # Local runs create the output folder on demand.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"DATA_DIR: {DATA_DIR}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")

Ambiente: Colab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DATA_DIR: /content/drive/MyDrive/SPR_2026_outputs/data
OUTPUT_DIR: /content/drive/MyDrive/SPR_2026_outputs


In [12]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import warnings
# Install a global "ignore" filter so Python warnings are not shown.
warnings.filterwarnings('ignore')

# Run configuration
SEED = 42  # shared by set_seed, StratifiedKFold and TrainingArguments
MODEL_NAME = 'neuralmind/bert-base-portuguese-cased'  # checkpoint for tokenizer and model
MAX_LENGTH = 512  # truncation/padding length handed to the tokenizer
BATCH_SIZE = 8  # per-device training batch size (evaluation uses twice this)
EPOCHS = 3  # training epochs per fold
LR = 2e-5  # learning rate
N_FOLDS = 5  # number of stratified cross-validation folds

# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

def set_seed(seed):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the notebook's import cell does not bring in ``random``.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Apply the global seed once, before any data splitting or model creation.
set_seed(SEED)

Device: cpu


## 1. Carregar Dados

In [13]:
# Load the training split. The K-fold loop reads its 'report' and 'target'
# columns, so their presence is verified here to fail early and clearly.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
print(f"Train: {train.shape}")
_missing_cols = {'report', 'target'} - set(train.columns)
if _missing_cols:
    raise ValueError(f"train.csv sem colunas obrigatórias: {sorted(_missing_cols)}")

# The test split is optional at this point: when absent, `test` stays None.
test_path = os.path.join(DATA_DIR, 'test.csv')
if os.path.exists(test_path):
    test = pd.read_csv(test_path)
    print(f"Test: {test.shape}")
    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and carries no diagnostic message.
    _missing_cols = {'ID', 'report'} - set(test.columns)
    if _missing_cols:
        raise ValueError(f"test.csv sem colunas obrigatórias: {sorted(_missing_cols)}")
else:
    test = None
    print("test.csv não disponível - será carregado no runtime Kaggle")

Train: (18272, 3)
Test: (4, 2)


## 2. Dataset

In [14]:
class MammographyDataset(Dataset):
    """Map-style dataset that tokenizes one report per item, on access.

    Args:
        texts: Indexable collection of report texts (indexed by position).
        labels: Optional indexable collection of integer class labels,
            parallel to ``texts``. When ``None`` no ``'labels'`` entry is
            produced.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length passed to the tokenizer for truncation/padding.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` and, when
            labels were supplied, ``'labels'`` as a ``torch.long`` tensor.
        """
        encoding = self.tokenizer(
            str(self.texts[idx]),  # str() guards against non-string cells
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )
        # A single text is encoded, so the tensors presumably carry a leading
        # batch axis of size 1. Drop only that axis: a bare squeeze() would
        # also collapse the sequence axis whenever it has length 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Single tokenizer instance, loaded from MODEL_NAME and reused by every dataset.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## 3. Treinar com K-Fold

In [None]:
def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``; the
            predicted class is the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with the single key ``'f1_macro'``.
    """
    scores, y_true = eval_pred
    y_pred = np.argmax(scores, axis=1)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return {'f1_macro': macro_f1}

# Number of target classes: sizes both the classifier head and the OOF matrix.
NUM_LABELS = 7

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(train), NUM_LABELS))  # out-of-fold model outputs (logits)
fold_scores = []
models = []  # path of the saved best model for each fold

for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['target'])):
    print(f"\n{'='*50}\nFold {fold+1}/{N_FOLDS}\n{'='*50}")

    fold_train = train.iloc[train_idx]
    fold_val = train.iloc[val_idx]

    train_ds = MammographyDataset(
        fold_train['report'].tolist(),
        fold_train['target'].tolist(),
        tokenizer, MAX_LENGTH
    )
    val_ds = MammographyDataset(
        fold_val['report'].tolist(),
        fold_val['target'].tolist(),
        tokenizer, MAX_LENGTH
    )

    # A fresh model per fold, so no weights leak across folds.
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
    model.to(device)

    fold_dir = f'{OUTPUT_DIR}/bertimbau_fold_{fold}'
    args = TrainingArguments(
        output_dir=fold_dir,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE*2,
        learning_rate=LR,
        weight_decay=0.01,
        warmup_ratio=0.1,
        fp16=torch.cuda.is_available(),
        eval_strategy='epoch',
        save_strategy='epoch',
        # Without a limit, every epoch checkpoint of every fold piles up in
        # the output directory (Drive / Kaggle working dir). The best
        # checkpoint is still retained for load_best_model_at_end.
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model='f1_macro',
        greater_is_better=True,
        report_to='none',
        seed=SEED,
        # The deprecated `no_cuda=False` was dropped: it was only the default
        # value and does not force GPU use.
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Persist the fold's model (the best checkpoint, reloaded at end of training).
    model_path = f'{fold_dir}/best_model'
    trainer.save_model(model_path)
    models.append(model_path)
    print(f"✅ Modelo salvo: {model_path}")

    # Out-of-fold predictions for the validation rows of this fold.
    preds = trainer.predict(val_ds)
    oof_preds[val_idx] = preds.predictions
    fold_f1 = f1_score(fold_val['target'], np.argmax(preds.predictions, axis=1), average='macro')
    fold_scores.append(fold_f1)
    print(f"Fold {fold+1} F1: {fold_f1:.4f}")

    # Release GPU memory before the next fold builds a new model.
    del model, trainer
    torch.cuda.empty_cache()

# Final OOF score: per-fold mean/std and the score over all OOF predictions.
oof_final = np.argmax(oof_preds, axis=1)
oof_f1 = f1_score(train['target'], oof_final, average='macro')
print(f"\n{'='*50}")
print(f"OOF F1-Macro (média folds): {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})")
print(f"OOF F1-Macro (predições): {oof_f1:.4f}")
print(f"{'='*50}")


Fold 1/5


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: neuralmind/bert-base-portuguese-cased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. C

Epoch,Training Loss,Validation Loss


## 4. Gerar Submissão

In [None]:
# ============================================================
# Submission generation (ensemble of the per-fold models)
# ============================================================
# The ensemble needs the model paths collected by the training cell.
if not models:
    raise RuntimeError(
        "Nenhum modelo treinado encontrado: execute a célula de treino antes de gerar a submissão."
    )

# Load the test split
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
test_ds = MammographyDataset(test['report'].tolist(), None, tokenizer, MAX_LENGTH)

# Explicit inference settings instead of Trainer's implicit defaults:
# known output directory, no experiment reporting, evaluation batch size
# matching the one used during training.
infer_args = TrainingArguments(
    output_dir=f'{OUTPUT_DIR}/bertimbau_inference',
    per_device_eval_batch_size=BATCH_SIZE*2,
    report_to='none',
)

# Ensemble: collect the raw model outputs (logits) of every fold
all_preds = []

print("Gerando predições com ensemble...")
for fold, model_path in enumerate(models):
    print(f"  Fold {fold+1}/{N_FOLDS}...")
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.to(device)
    model.eval()

    trainer = Trainer(model=model, args=infer_args)
    preds = trainer.predict(test_ds)
    all_preds.append(preds.predictions)

    # Release the fold's model before loading the next one, as the training
    # loop does.
    del model, trainer
    torch.cuda.empty_cache()

# Average the logits across folds, then pick the highest-scoring class
avg_preds = np.mean(all_preds, axis=0)
predictions = np.argmax(avg_preds, axis=1)

# Build the submission table
submission = pd.DataFrame({'ID': test['ID'], 'target': predictions})

# ALWAYS write submission.csv to the current directory (required by Kaggle)
submission.to_csv('submission.csv', index=False)
print("✅ submission.csv salvo no diretório atual")

# Also keep a copy in OUTPUT_DIR for persistence (Colab/Local)
if not IS_KAGGLE:
    submission_path = os.path.join(OUTPUT_DIR, 'submission_bertimbau.csv')
    submission.to_csv(submission_path, index=False)
    print(f"✅ Cópia salva em: {submission_path}")

    # Save the OOF predictions for later ensembling; the column count
    # follows the OOF matrix rather than a hard-coded class count.
    oof_df = pd.DataFrame(oof_preds, columns=[f'pred_{i}' for i in range(oof_preds.shape[1])])
    oof_df['target'] = train['target']
    oof_df.to_csv(os.path.join(OUTPUT_DIR, 'oof_bertimbau.csv'), index=False)
    print(f"✅ OOF predictions salvas")

print(f"\nDistribuição das predições:")
print(submission['target'].value_counts().sort_index())

In [None]:
# Optional: offer submission.csv as a browser download when running in Colab.
if IS_COLAB:
    if os.path.exists('submission.csv'):
        from google.colab import files
        files.download('submission.csv')