# SPR 2026 - mDeBERTa V3

**mDeBERTa-v3-base multilingual**

- ✅ Disentangled attention
- ✅ Early stopping
- ✅ Tempo esperado: ~15-20 min

---
## 📥 MODELO - Baixar com: `models/download_mdeberta.ipynb`

**Opção 1 - Kaggle Datasets (mais fácil):**
- Add Input → Datasets → buscar `mdeberta_v3_base` (Jonathan Chan)

**Opção 2 - Notebook de download:**
1. Rode `models/download_mdeberta.ipynb` no Kaggle com Internet ON
2. Save Version -> Save & Run All
3. Neste notebook: Add Input -> Your Work -> `download-mdeberta`

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Settings → Accelerator → **GPU T4 x2**

---

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
import warnings
warnings.filterwarnings('ignore')

# Global configuration: reproducibility seed and competition data location.
SEED = 42
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Candidate mount points for mDeBERTa (Kaggle datasets), probed in order.
MODEL_PATHS = [
    '/kaggle/input/mdeberta-v3-base',                    # Jonathan Chan
    '/kaggle/input/mdeberta_v3_base',
    '/kaggle/input/huggingface-deberta',                 # Wilmer E. Henao
    '/kaggle/input/mdebertav3base',                      # Wojciech "Victor" Fulmyk
    '/kaggle/input/microsoft-mdeberta-v3-base',
]

# Keep the first candidate that exists on disk; stays None if none is attached.
MODEL_PATH = None
for path in MODEL_PATHS:
    if os.path.exists(path):
        MODEL_PATH = path
        break

if MODEL_PATH is None:
    print("\n⚠️ mDeBERTa não encontrado. Datasets disponíveis:")
    # The input root itself may be missing (e.g. when run outside Kaggle).
    # Guard the listing so the explicit error below is what the user sees
    # instead of a bare failure coming from os.listdir.
    if os.path.isdir('/kaggle/input'):
        for item in sorted(os.listdir('/kaggle/input')):
            print(f"  - {item}")
    raise FileNotFoundError("Adicione um dataset com mDeBERTa-v3-base ao notebook!")

# Training hyperparameters consumed by the dataset / Trainer cells.
MAX_LENGTH = 512   # token length every sample is truncated/padded to
BATCH_SIZE = 8     # per-device training batch size
EPOCHS = 5         # upper bound on training epochs
LR = 1e-5          # learning rate
PATIENCE = 2       # early-stopping patience

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
print(f'Model: {MODEL_PATH}')

# Seed NumPy and PyTorch for reproducibility.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Load the competition train/test tables from DATA_DIR.
train_df, test_df = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)

# Report the shape of both tables.
print(f'Train: {train_df.shape}\nTest: {test_df.shape}')

In [None]:
# Dataset
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item, lazily on access.

    Args:
        texts: Sequence of raw texts, indexable by integer; each entry is
            converted with ``str()`` before tokenization.
        labels: Optional sequence of integer class ids aligned with
            ``texts``. When ``None`` (inference), items carry no ``labels``.
        tokenizer: Callable tokenizer; it is invoked with ``truncation``,
            ``max_length``, ``padding`` and ``return_tensors`` keyword
            arguments and must return ``input_ids`` and ``attention_mask``.
            Required in practice: indexing fails if it is left as ``None``.
        max_length: Length every sample is truncated/padded to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length: int = 512) -> None:
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return a dict of tensors.

        Returns:
            A dict with ``input_ids`` and ``attention_mask``, plus a
            ``labels`` tensor of dtype ``torch.long`` when labels were given.
        """
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )
        # Drop only the leading batch axis of the single-sample encoding.
        # A bare squeeze() would also collapse the sequence axis whenever it
        # has length 1, yielding 0-dim tensors that cannot be batched.
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load the tokenizer from the resolved model directory (MODEL_PATH).
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
print('Tokenizer carregado!')

In [None]:
# Train/validation split: hold out a stratified 10% of the labelled reports.
reports = train_df['report'].tolist()
targets = train_df['target'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    reports,
    targets,
    test_size=0.1,
    stratify=train_df['target'],
    random_state=SEED,
)

# Wrap both partitions in tokenizing datasets with the shared max length.
train_ds, val_ds = (
    TextDataset(texts, labels, tokenizer, MAX_LENGTH)
    for texts, labels in ((train_texts, train_labels), (val_texts, val_labels))
)

print(f'Train: {len(train_ds)}, Val: {len(val_ds)}')

In [None]:
# Train the model
def compute_metrics(eval_pred):
    """Return the macro-averaged F1 of the arg-max class predictions.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per sample) and ``label_ids`` (the true class ids).

    Returns:
        A dict with the single key ``'f1_macro'``.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    macro_f1 = f1_score(eval_pred.label_ids, predicted_classes, average='macro')
    return {'f1_macro': macro_f1}

# Sequence-classification head with 7 output classes on top of the local checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=7)

args = TrainingArguments(
    output_dir='/kaggle/working/model',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    learning_rate=LR,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # NOTE(review): mDeBERTa-v3 has been reported as numerically unstable
    # under fp16 — confirm the training loss stays finite, else disable.
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy='epoch',
    save_strategy='epoch',
    # A checkpoint is written every epoch; cap how many are kept so the
    # working directory does not accumulate one per epoch. The best
    # checkpoint is always retained when load_best_model_at_end is set.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    report_to='none',
    seed=SEED,
)

# Early stopping halts training after PATIENCE evaluations without an
# improvement in the tracked metric (f1_macro).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
)

trainer.train()

In [None]:
# Submission: predict the test reports and write them in competition format.
test_ds = TextDataset(
    texts=test_df['report'].tolist(),
    labels=None,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
test_preds = trainer.predict(test_ds)

# The predicted class is the highest-scoring column of each row.
predictions = np.argmax(test_preds.predictions, axis=1)

submission = pd.DataFrame({'ID': test_df['ID'], 'target': predictions})
submission.to_csv('submission.csv', index=False)

print('submission.csv criado!')
# Predicted class distribution, ordered by class id.
print(submission['target'].value_counts().sort_index())