# SPR 2026 - BERT Multilingual (Baseline)

**BERT Base Multilingual Cased - Suporta Português**

- ✅ Modelo multilíngue (104 idiomas)
- ✅ Disponível no Kaggle Models
- ✅ Tempo esperado: ~15-20 min

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Settings → Accelerator → **GPU T4 x2**
3. Add Input → **Models** → Buscar: `bert-base-multilingual`
   - Autor: **google**
   - Selecionar: **bert-base-multilingual-cased**
4. **IMPORTANTE:** Execute "Run All" após commit

**Referência:** https://huggingface.co/google-bert/bert-base-multilingual-cased

---

In [None]:
# ===== SPR 2026 - BERT MULTILINGUAL (BASELINE) =====

# ==== SETUP E IMPORTS ====
print("[1/6] Configurando ambiente...")
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import warnings
warnings.filterwarnings('ignore')

# Seed reused for NumPy, PyTorch, the train/validation split and the Trainer.
SEED = 42
# Directory from which train.csv and test.csv are read.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# ==== AUTO-DETECTAR PATH DO MODELO (BUSCA RECURSIVA) ====
def find_model_path(base='/kaggle/input', max_depth=6):
    """Locate the first directory under *base* that holds a ``config.json``.

    The tree is walked depth-first, and each sub-directory is checked before
    the search descends into it. Entries are visited in sorted order so the
    result is reproducible when several candidate directories exist.

    Args:
        base: Root directory to search. Defaults to the Kaggle input mount.
        max_depth: Deepest level whose entries are listed (0 means only the
            direct children of *base* are examined).

    Returns:
        The path of the first matching directory, or ``None`` when nothing
        matches or *base* cannot be listed.
    """

    def has_config(path):
        # A model directory is recognised by a config.json file inside it.
        return os.path.isfile(os.path.join(path, 'config.json'))

    def search_dir(directory, depth):
        if depth > max_depth:
            return None
        try:
            entries = sorted(os.listdir(directory))
        except OSError:
            # Missing or unreadable directory: the search is best-effort,
            # so skip it instead of aborting the whole lookup.
            return None
        for entry in entries:
            path = os.path.join(directory, entry)
            if not os.path.isdir(path):
                continue
            if has_config(path):
                return path
            found = search_dir(path, depth + 1)
            if found:
                return found
        return None

    return search_dir(base, 0)

# Resolve the model directory once; later cells read MODEL_PATH.
MODEL_PATH = find_model_path()

if MODEL_PATH is None:
    print("\n⚠️ Modelo não encontrado. Estrutura de /kaggle/input:")

    def show_tree(path, prefix="", depth=0):
        """Print at most 10 entries per directory, indenting by nesting level.

        Recursion stops once *depth* exceeds 5. Directories that cannot be
        listed are skipped so the diagnostic output never masks the
        FileNotFoundError raised below.
        """
        if depth > 5:
            return
        try:
            items = os.listdir(path)[:10]
        except OSError:
            # Only listing failures are tolerated here; anything else is a bug.
            return
        for item in items:
            full = os.path.join(path, item)
            print(f"{prefix}{item}")
            if os.path.isdir(full):
                show_tree(full, prefix + "  ", depth + 1)

    show_tree('/kaggle/input')
    raise FileNotFoundError("Adicione o modelo bert-base-multilingual-cased ao notebook!")

# Tokenization and optimisation hyperparameters used by the cells below.
MAX_LENGTH = 512
BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f'Device: {device}')
print(f'Model: {MODEL_PATH}')

# Seed both NumPy and PyTorch with the shared SEED.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# ==== CARREGAR DADOS ====
print("[2/6] Carregando dados...")
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
print(f'Train: {train_df.shape}, Test: {test_df.shape}')

In [None]:
# ==== DATASET CLASS ====
print("[3/6] Preparando dataset...")

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of raw texts; each entry is passed through ``str()``
            before tokenization.
        labels: Optional sequence of integer class ids aligned with *texts*.
            When ``None`` the returned items carry no ``'labels'`` key.
        tokenizer: Callable invoked with the text plus ``truncation``,
            ``max_length``, ``padding`` and ``return_tensors`` keyword
            arguments, returning ``'input_ids'`` and ``'attention_mask'``
            tensors with a leading batch axis of size 1.
        max_length: Length every sequence is truncated or padded to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=512):
        # Materialise as plain lists so indexing is always positional, even
        # when a pandas Series with a non-default index is passed in.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )
        # Drop only the batch axis; an unqualified squeeze() would also
        # collapse the sequence axis when it happens to have length 1.
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load the tokenizer from the local model directory only (local_files_only=True).
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
print('Tokenizer carregado!')

# Hold out 10% of the training rows for validation, stratified by target.
all_reports = train_df['report'].tolist()
all_targets = train_df['target'].tolist()
split_parts = train_test_split(
    all_reports,
    all_targets,
    test_size=0.1,
    stratify=train_df['target'],
    random_state=SEED,
)
train_texts, val_texts, train_labels, val_labels = split_parts

train_ds = TextDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
val_ds = TextDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)
print(f'Train: {len(train_ds)}, Val: {len(val_ds)}')

In [None]:
# ==== TREINAMENTO ====
print("[4/6] Treinando modelo...")

def compute_metrics(eval_pred):
    """Return the macro-averaged F1 score for an evaluation batch.

    The predicted class of each row is the argmax of ``eval_pred.predictions``
    along axis 1; it is compared against ``eval_pred.label_ids``.

    Returns:
        A dict with the single key ``'f1_macro'``.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    macro_f1 = f1_score(eval_pred.label_ids, predicted_classes, average='macro')
    return {'f1_macro': macro_f1}

# Load the 7-class classifier from the local model directory only
# (local_files_only=True).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH, num_labels=7, local_files_only=True
)

# Mixed precision is enabled only when CUDA is available.
use_fp16 = torch.cuda.is_available()

# Evaluate and checkpoint once per epoch; the checkpoint with the highest
# f1_macro is reloaded when training ends.
training_kwargs = {
    'output_dir': '/kaggle/working/model',
    'num_train_epochs': EPOCHS,
    'per_device_train_batch_size': BATCH_SIZE,
    'per_device_eval_batch_size': BATCH_SIZE * 2,
    'learning_rate': LR,
    'weight_decay': 0.01,
    'warmup_ratio': 0.1,
    'fp16': use_fp16,
    'eval_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'f1_macro',
    'greater_is_better': True,
    'report_to': 'none',
    'seed': SEED,
}
args = TrainingArguments(**training_kwargs)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# ==== PREDIÇÃO ====
print("[5/6] Gerando predições...")
# Build an unlabeled dataset from the test reports and run inference on it.
test_reports = test_df['report'].tolist()
test_ds = TextDataset(test_reports, None, tokenizer, MAX_LENGTH)
test_preds = trainer.predict(test_ds)

# The predicted class of each row is the argmax over axis 1 of the outputs.
test_outputs = test_preds.predictions
predictions = np.argmax(test_outputs, axis=1)

In [None]:
# ==== SUBMISSÃO ====
print("[6/6] Criando submissão...")
# Pair each test ID with its predicted class and write the submission file.
submission = test_df[['ID']].assign(target=predictions)
submission.to_csv('submission.csv', index=False)

banner = "=" * 60
print(banner)
print("✅ CONCLUÍDO - submission.csv criado!")
print(banner)

# Show how many rows were assigned to each predicted class.
class_counts = submission['target'].value_counts().sort_index()
print(class_counts)