# Validação: mDeBERTa-v3 (FP32 Fix)

**mDeBERTa-v3-base multilingual - NUNCA TESTADO CORRETAMENTE**

## ⚠️ Histórico
- Score 0.01008 em submissões anteriores (BUG)
- Problema: fp16/mixed precision incompatível
- **Fix:** `model.float()` força TODOS os parâmetros para fp32

## 🎯 Objetivo
Testar múltiplas configurações para encontrar a melhor performance.

## 📊 Configurações Testadas
1. **Loss:** CrossEntropy, Focal Loss (γ=1,2,3)
2. **LR:** 1e-5, 2e-5, 3e-5
3. **Batch Size:** 8, 16
4. **Max Length:** 256, 512

---

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

SEED = 42
# Seed NumPy and PyTorch so the split and the training runs are repeatable.
np.random.seed(SEED)
torch.manual_seed(SEED)


def find_model_path(base='/kaggle/input', max_depth=10):
    """Return the first directory under *base* that contains a ``config.json``.

    The tree is walked depth-first. Entries are visited in sorted order so the
    same directory is chosen on every run when several candidates exist
    (``os.listdir`` itself makes no ordering guarantee).

    Args:
        base: directory whose sub-directories are searched.
        max_depth: deepest recursion level visited; *base* is level 0.

    Returns:
        The path of the first matching directory, or ``None`` if none is found.
    """
    def search_dir(directory, depth=0):
        if depth > max_depth:
            return None
        try:
            entries = sorted(os.listdir(directory))
        except OSError:
            # Unreadable or vanished directory: skip it, but let unrelated
            # errors (e.g. KeyboardInterrupt) propagate.
            return None
        for item in entries:
            path = os.path.join(directory, item)
            if not os.path.isdir(path):
                continue
            if os.path.exists(os.path.join(path, 'config.json')):
                return path
            result = search_dir(path, depth + 1)
            if result:
                return result
        return None

    return search_dir(base)


# Detect the runtime environment
if os.path.exists('/kaggle/input'):
    DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
    MODEL_PATH = find_model_path()
    if MODEL_PATH is None:
        # Surface the problem here instead of as an obscure loader error later.
        print('WARNING: no directory containing config.json found under '
              '/kaggle/input - attach the model dataset before training.')
else:
    DATA_DIR = '../data'
    MODEL_PATH = 'microsoft/mdeberta-v3-base'  # Local fallback

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
print(f'Model: {MODEL_PATH}')

In [None]:
# ===== LOAD DATA =====
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')

print(f'Total: {train_df.shape[0]}')
print('\nDistribui√ß√£o targets:')
print(train_df['target'].value_counts().sort_index())

# Stratified 80/20 hold-out split; the fixed seed keeps the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    *(train_df[col].tolist() for col in ('report', 'target')),
    test_size=0.2,
    stratify=train_df['target'],
    random_state=SEED,
)

print(f'\nTrain: {len(train_texts)}, Val: {len(val_texts)}')

In [None]:
# ===== DATASET =====
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, at access time.

    Args:
        texts: indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: optional indexable collection of integer class ids aligned
            with ``texts``. When ``None``, items carry no ``'labels'`` entry.
        tokenizer: callable invoked with the text plus ``truncation``,
            ``max_length``, ``padding`` and ``return_tensors`` keyword
            arguments. It must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors that have a leading batch axis of
            size 1 (assumed to be a Hugging Face tokenizer).
        max_length: length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict of 1-D tensors (plus a scalar label) for item ``idx``."""
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )
        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # collapse a length-1 sequence axis and yield a 0-d tensor that the
        # default collate function cannot stack into (batch, seq_len).
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# ===== FOCAL LOSS =====
class FocalLoss(nn.Module):
    """Focal loss built on top of per-sample cross-entropy.

    Each sample's cross-entropy ``ce`` is scaled by
    ``alpha * (1 - p_t) ** gamma`` with ``p_t = exp(-ce)``, then the batch
    mean is returned.

    Args:
        gamma: focusing exponent; larger values down-weight samples whose
            true-class probability is already high.
        alpha: single scalar applied uniformly to every sample.
        num_classes: stored on the instance; ``forward`` does not read it.
    """

    def __init__(self, gamma=2.0, alpha=0.25, num_classes=7):
        super().__init__()
        self.gamma, self.alpha, self.num_classes = gamma, alpha, num_classes

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits ``inputs`` and class ids ``targets``."""
        per_sample_ce = nn.functional.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) recovers the probability assigned to the true class.
        true_class_prob = torch.exp(-per_sample_ce)
        weight = self.alpha * (1 - true_class_prob) ** self.gamma
        return (weight * per_sample_ce).mean()

# ===== TRAINER WITH FOCAL LOSS =====
class FocalLossTrainer(Trainer):
    """``Trainer`` subclass that computes its loss with ``FocalLoss``.

    Args:
        focal_gamma: ``gamma`` forwarded to ``FocalLoss``.
        focal_alpha: ``alpha`` forwarded to ``FocalLoss``.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, focal_gamma=2.0, focal_alpha=0.25, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal_loss = FocalLoss(gamma=focal_gamma, alpha=focal_alpha)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on ``inputs`` and score its logits with the focal loss.

        ``'labels'`` is removed from ``inputs`` (the dict is mutated) before
        the forward pass. Extra keyword arguments are accepted and ignored.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.pop('labels')
        model_out = model(**inputs)
        loss_value = self.focal_loss(model_out.logits, targets)
        if return_outputs:
            return loss_value, model_out
        return loss_value

In [None]:
# ===== TRAINING FUNCTION =====
def train_and_evaluate(config, tokenizer=None):
    """Fine-tune the model for one configuration and score it on the hold-out split.

    Reads the module-level ``train_texts``, ``train_labels``, ``val_texts``,
    ``val_labels``, ``MODEL_PATH`` and ``SEED``.

    Args:
        config: dict with required keys ``'lr'``, ``'batch_size'`` and
            ``'max_length'``. Optional keys: ``'name'`` (checkpoint
            sub-directory, default ``'run'``), ``'epochs'`` (default 5),
            ``'focal_gamma'`` (a truthy value switches to ``FocalLossTrainer``)
            and ``'focal_alpha'`` (default 0.25).
        tokenizer: tokenizer handed to ``TextDataset``; required despite the
            ``None`` default.

    Returns:
        ``(results, y_pred)``. ``results`` is the dict from
        ``trainer.evaluate()`` plus ``'train_time'`` (seconds) and
        ``'config'`` (``str(config)``). ``y_pred`` holds the arg-max class id
        for every validation sample.

    Raises:
        ValueError: if ``tokenizer`` is ``None``.
    """
    import gc
    import time

    if tokenizer is None:
        # Fail before the expensive model load rather than on the first batch.
        raise ValueError('train_and_evaluate requires a tokenizer')

    num_labels = 7
    class_ids = list(range(num_labels))

    print(f"\n{'='*60}")
    print(f"Config: {config}")
    print(f"{'='*60}")

    # Datasets
    train_ds = TextDataset(train_texts, train_labels, tokenizer, config['max_length'])
    val_ds = TextDataset(val_texts, val_labels, tokenizer, config['max_length'])

    # Model
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_PATH,
        num_labels=num_labels,
        local_files_only=True
    )

    # Critical fix: cast every floating-point parameter to fp32.
    model = model.float()
    print(f'‚úÖ model.float() aplicado - dtype: {next(model.parameters()).dtype}')

    # Metrics
    def compute_metrics(eval_pred):
        preds = np.argmax(eval_pred.predictions, axis=1)
        f1 = f1_score(eval_pred.label_ids, preds, average='macro')
        # Passing labels= pins the result to one entry per class. Without it
        # the array is shorter whenever a class is missing from both labels
        # and predictions, and indexing it by class id raises IndexError.
        f1_per_class = f1_score(
            eval_pred.label_ids, preds, labels=class_ids, average=None
        )
        return {
            'f1_macro': f1,
            **{f'f1_class_{i}': f1_per_class[i] for i in class_ids}
        }

    # Training args
    args = TrainingArguments(
        # One sub-directory per config so checkpoints of different runs
        # never mix in the same folder.
        output_dir=os.path.join('/tmp/mdeberta_val', str(config.get('name', 'run'))),
        num_train_epochs=config.get('epochs', 5),
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=config['batch_size']*2,
        learning_rate=config['lr'],
        weight_decay=0.01,
        warmup_ratio=0.1,
        fp16=False,  # disabled on purpose for mDeBERTa
        eval_strategy='epoch',
        save_strategy='epoch',
        # Bound disk usage; the best checkpoint is still retained because
        # load_best_model_at_end is enabled.
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model='f1_macro',
        greater_is_better=True,
        report_to='none',
        seed=SEED,
    )

    # Trainer (focal loss or the model's default loss)
    trainer_kwargs = dict(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )
    if config.get('focal_gamma'):
        trainer = FocalLossTrainer(
            focal_gamma=config['focal_gamma'],
            focal_alpha=config.get('focal_alpha', 0.25),
            **trainer_kwargs,
        )
    else:
        trainer = Trainer(**trainer_kwargs)

    # Train
    start = time.perf_counter()
    trainer.train()
    train_time = time.perf_counter() - start

    # Evaluate
    results = trainer.evaluate()
    results['train_time'] = train_time
    results['config'] = str(config)

    # Predictions for the confusion matrix
    preds = trainer.predict(val_ds)
    y_pred = np.argmax(preds.predictions, axis=1)

    print(f"\nüìä Resultados:")
    print(f"F1-Macro: {results['eval_f1_macro']:.5f}")
    print(f"Tempo: {train_time/60:.1f} min")
    print(f"\nClassification Report:")
    print(classification_report(val_labels, y_pred))

    # Cleanup: drop references, collect cycles, then release cached GPU memory
    del model, trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return results, y_pred

In [None]:
# ===== LOAD TOKENIZER =====
# Loaded once and shared by every experiment below. local_files_only=True
# means nothing is downloaded: MODEL_PATH must resolve to files already on disk.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
print(f'Tokenizer carregado: {tokenizer.name_or_path}')

In [None]:
# ===== EXPERIMENT 1: CrossEntropy baseline =====
# No 'focal_gamma' key, so the plain Trainer (model's own loss) is used.
config_baseline = dict(
    name='CE_baseline',
    lr=2e-5,
    batch_size=8,
    max_length=256,
    epochs=5,
)

results_baseline, preds_baseline = train_and_evaluate(config_baseline, tokenizer)

In [None]:
# ===== EXPERIMENT 2: Focal Loss, gamma=2 =====
# Same hyper-parameters as the baseline; only the loss changes.
config_focal2 = dict(
    name='Focal_g2',
    lr=2e-5,
    batch_size=8,
    max_length=256,
    epochs=5,
    focal_gamma=2.0,
    focal_alpha=0.25,
)

results_focal2, preds_focal2 = train_and_evaluate(config_focal2, tokenizer)

In [None]:
# ===== EXPERIMENT 3: Focal Loss, gamma=3 =====
# Stronger focusing exponent than experiment 2; everything else identical.
config_focal3 = dict(
    name='Focal_g3',
    lr=2e-5,
    batch_size=8,
    max_length=256,
    epochs=5,
    focal_gamma=3.0,
    focal_alpha=0.25,
)

results_focal3, preds_focal3 = train_and_evaluate(config_focal3, tokenizer)

In [None]:
# ===== EXPERIMENT 4: lower learning rate =====
# Focal gamma=2 setup with the learning rate halved to 1e-5.
config_lr_low = dict(
    name='Focal_lr1e5',
    lr=1e-5,
    batch_size=8,
    max_length=256,
    epochs=5,
    focal_gamma=2.0,
    focal_alpha=0.25,
)

results_lr_low, preds_lr_low = train_and_evaluate(config_lr_low, tokenizer)

In [None]:
# ===== EXPERIMENT 5: max length 512 =====
# NOTE: batch size also drops from 8 to 4 here, so this run differs from
# experiment 2 in two settings, not one.
config_maxlen = dict(
    name='Focal_maxlen512',
    lr=2e-5,
    batch_size=4,  # smaller batch so the longer sequences fit in memory
    max_length=512,
    epochs=5,
    focal_gamma=2.0,
    focal_alpha=0.25,
)

results_maxlen, preds_maxlen = train_and_evaluate(config_maxlen, tokenizer)

In [None]:
# ===== RESULTS SUMMARY =====
# Prints one row per experiment and marks the single best macro-F1.
all_results = [
    ('CE_baseline', results_baseline),
    ('Focal_g2', results_focal2),
    ('Focal_g3', results_focal3),
    ('Focal_lr1e5', results_lr_low),
    ('Focal_maxlen512', results_maxlen),
]

# Find the winner before printing, so the trophy lands only on the overall
# best row. Marking while scanning would flag every row that beats the
# running maximum, which always includes the first one.
best_f1 = 0
best_config = None
for name, res in all_results:
    if res['eval_f1_macro'] > best_f1:
        best_f1 = res['eval_f1_macro']
        best_config = name

print("\n" + "="*70)
print("üìä RESUMO - mDeBERTa-v3 Validation")
print("="*70)
print(f"{'Config':<20} {'F1-Macro':>10} {'Tempo (min)':>12}")
print("-"*70)

for name, res in all_results:
    f1 = res['eval_f1_macro']
    time_min = res['train_time'] / 60
    marker = " üèÜ" if name == best_config else ""
    print(f"{name:<20} {f1:>10.5f} {time_min:>12.1f}{marker}")

print("-"*70)
print(f"üèÜ Melhor: {best_config} com F1-Macro = {best_f1:.5f}")
print(f"\nüìù Refer√™ncia: BERTimbau v4 = 0.82073 (public score)")

In [None]:
# ===== CONFUSION MATRIX OF THE BEST CONFIG =====
# Pick the predictions of the configuration the summary cell selected
# (best_config), so the plot matches its "best config" title. Falls back to
# the Focal gamma=2 predictions when best_config is None.
preds_by_config = {
    'CE_baseline': preds_baseline,
    'Focal_g2': preds_focal2,
    'Focal_g3': preds_focal3,
    'Focal_lr1e5': preds_lr_low,
    'Focal_maxlen512': preds_maxlen,
}
best_preds = preds_by_config.get(best_config, preds_focal2)

# labels= forces a 7x7 matrix even if a class is absent from both the
# validation labels and the predictions, so it lines up with the tick labels.
cm = confusion_matrix(val_labels, best_preds, labels=list(range(7)))

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=range(7), yticklabels=range(7))
plt.xlabel('Predito')
plt.ylabel('Real')
plt.title('Confusion Matrix - mDeBERTa-v3 (Melhor Config)')
plt.tight_layout()
plt.savefig('mdeberta_confusion_matrix.png', dpi=150)
plt.show()

In [None]:
# ===== INSIGHTS TO DOCUMENT =====
# NOTE(review): the text below is a fixed string literal, not derived from the
# experiment results above. Item 5 is still a placeholder. Confirm every
# statement against the actual summary table before relying on it.
print("""
üìù INSIGHTS - mDeBERTa-v3
========================

1. **FP32 Fix Funciona:**
   - model.float() resolve o problema de gradientes inst√°veis
   - fp16=False obrigat√≥rio

2. **Focal Loss:**
   - Œ≥=2 √© o valor √≥timo (mesmo que BERTimbau)
   - Melhora F1 em classes minorit√°rias

3. **Learning Rate:**
   - 2e-5 √© adequado (mesmo que BERTimbau)
   - 1e-5 converge mais lento mas pode ser mais est√°vel

4. **Max Length:**
   - 256 suficiente para maioria dos textos
   - 512 n√£o traz ganho significativo

5. **Compara√ß√£o com BERTimbau:**
   - [PREENCHER AP√ìS EXPERIMENTOS]

6. **Pr√≥ximos Passos:**
   - Submeter melhor configura√ß√£o
   - Testar threshold tuning
   - Considerar para ensemble
""")