# Validação: DistilBERT Multilingual

**DistilBERT - Rápido para ensemble**

## 📊 Histórico
- Score 0.55229 (pior que BERT multilingual)
- Apenas testado com CrossEntropy

## 🎯 Objetivo
Testar se Focal Loss melhora DistilBERT para uso em ensembles.

## 📊 Vantagens
- 40% mais rápido que BERT full
- 60% menor
- Pode ser útil em ensembles rápidos

---

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
import warnings
warnings.filterwarnings('ignore')

# Seed the NumPy and PyTorch random number generators with one shared value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

if os.path.exists('/kaggle/input'):
    # Kaggle environment: data lives under the competition input directory and
    # the model directory has to be discovered among the attached inputs.
    DATA_DIR = '/kaggle/input/competitions/spr-2026-mammography-report-classification'

    def find_model_path():
        """Return the first directory under /kaggle/input that holds a config.json.

        The tree is walked depth-first in ``os.listdir`` order. Returns
        ``None`` when no such directory is found.
        """
        base = '/kaggle/input'

        def search_dir(directory, depth=0, max_depth=10):
            # Stop descending once the depth budget is exhausted.
            if depth > max_depth:
                return None
            try:
                entries = os.listdir(directory)
            except OSError:
                # Unreadable or vanished directory: skip this subtree. Only
                # filesystem errors are ignored, so Ctrl-C and genuine bugs
                # still propagate.
                return None
            for item in entries:
                path = os.path.join(directory, item)
                if not os.path.isdir(path):
                    continue
                if os.path.exists(os.path.join(path, 'config.json')):
                    return path
                result = search_dir(path, depth + 1, max_depth)
                if result:
                    return result
            return None

        return search_dir(base)

    # NOTE: this is None when no config.json is found under /kaggle/input.
    MODEL_PATH = find_model_path()
else:
    # Local run: relative data directory and a model identifier string.
    DATA_DIR = '../data'
    MODEL_PATH = 'distilbert-base-multilingual-cased'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
print(f'Model: {MODEL_PATH}')

In [None]:
# ===== DATA =====
# Read the training table from DATA_DIR.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')

# Hold out 20% of the rows for validation, stratified on the 'target' column
# and seeded with SEED.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['report'].tolist(),
    train_df['target'].tolist(),
    stratify=train_df['target'],
    random_state=SEED,
    test_size=0.2,
)

print(f'Train: {len(train_texts)}, Val: {len(val_texts)}')

In [None]:
# ===== DATASET & FOCAL LOSS =====
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item at access time.

    Args:
        texts: Indexable sequence of raw texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable sequence of class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``
            (presumably a Hugging Face tokenizer - confirm against caller).
        max_length: Length forwarded to the tokenizer for truncation/padding.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        enc = self.tokenizer(str(self.texts[idx]), truncation=True,
                             max_length=self.max_length, padding='max_length', return_tensors='pt')
        return {
            # Drop only the leading batch axis added by return_tensors='pt'
            # (assumes the tokenizer returns shape (1, seq_len)). A bare
            # squeeze() would also collapse the sequence axis when it has
            # length 1.
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

class FocalLoss(nn.Module):
    """Multi-class focal loss built on top of cross-entropy.

    Per sample the loss is ``alpha_t * (1 - pt) ** gamma * ce`` where ``ce`` is
    the unreduced cross-entropy and ``pt = exp(-ce)``.

    Args:
        gamma: Focusing exponent applied to ``(1 - pt)``.
        alpha: Either a scalar applied uniformly to every sample (the original
            behaviour, default 0.25), a sequence / 1-D tensor of per-class
            weights indexed by the target class, or ``None`` for no weighting.
        reduction: ``'mean'`` (default), ``'sum'`` or ``'none'``.

    Raises:
        ValueError: If ``reduction`` is not one of the supported values.
    """

    def __init__(self, gamma=2.0, alpha=0.25, reduction='mean'):
        super().__init__()
        if reduction not in ('mean', 'sum', 'none'):
            raise ValueError(f"Unsupported reduction: {reduction!r}")
        self.gamma = gamma
        self.reduction = reduction
        if alpha is None or isinstance(alpha, (int, float)):
            self.alpha = alpha
        else:
            # Per-class weights: stored as a float tensor, moved to the right
            # device lazily in forward().
            self.alpha = torch.as_tensor(alpha, dtype=torch.float)

    def forward(self, inputs, targets):
        """Compute the focal loss for logits ``inputs`` and class ids ``targets``."""
        ce_loss = nn.functional.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_weight = (1 - pt) ** self.gamma
        if isinstance(self.alpha, torch.Tensor):
            # Look up each sample's weight by its target class.
            class_alpha = self.alpha.to(device=ce_loss.device, dtype=ce_loss.dtype)
            focal_weight = class_alpha[targets] * focal_weight
        elif self.alpha is not None:
            focal_weight = self.alpha * focal_weight
        loss = focal_weight * ce_loss

        if self.reduction == 'mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        return loss

class FocalLossTrainer(Trainer):
    """Trainer subclass whose training/eval loss is computed with FocalLoss.

    ``focal_gamma`` is forwarded to ``FocalLoss``; every other positional or
    keyword argument goes to the base ``Trainer`` constructor.
    """

    def __init__(self, focal_gamma=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal_loss = FocalLoss(gamma=focal_gamma)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and score its logits with the focal loss.

        Returns the loss alone, or ``(loss, outputs)`` when ``return_outputs``
        is true. Extra keyword arguments are accepted and ignored.
        """
        # Labels are removed from the batch so the model does not receive them.
        targets = inputs.pop('labels')
        model_out = model(**inputs)
        focal = self.focal_loss(model_out.logits, targets)
        if return_outputs:
            return (focal, model_out)
        return focal

In [None]:
# ===== TRAINING FUNCTION =====
def train_and_evaluate(config, tokenizer):
    """Fine-tune the model for one configuration and evaluate it on the validation split.

    Args:
        config: Dict with required keys ``'name'``, ``'lr'``, ``'batch_size'``
            and ``'max_length'``. Optional keys: ``'epochs'`` (default 5),
            ``'num_labels'`` (default 7) and ``'focal_gamma'`` (when truthy,
            training uses ``FocalLossTrainer``; otherwise the stock
            ``Trainer`` loss).
        tokenizer: Tokenizer handed to ``TextDataset``.

    Returns:
        The dict produced by ``trainer.evaluate()`` (contains
        ``'eval_f1_macro'``) with an added ``'train_time'`` entry in seconds.
    """
    import time  # local import: not provided by the file's import block

    print(f"\nConfig: {config['name']}")

    train_ds = TextDataset(train_texts, train_labels, tokenizer, config['max_length'])
    val_ds = TextDataset(val_texts, val_labels, tokenizer, config['max_length'])

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_PATH, num_labels=config.get('num_labels', 7), local_files_only=True
    )

    def compute_metrics(eval_pred):
        # Macro-F1 over the argmax of the logits.
        preds = np.argmax(eval_pred.predictions, axis=1)
        return {'f1_macro': f1_score(eval_pred.label_ids, preds, average='macro')}

    # fp16 mixed precision needs a GPU; requesting it on a CPU-only machine
    # makes TrainingArguments raise, so enable it only when CUDA is present.
    use_cuda = torch.cuda.is_available()

    args = TrainingArguments(
        output_dir='/tmp/distilbert_val',
        num_train_epochs=config.get('epochs', 5),
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=config['batch_size']*2,
        learning_rate=config['lr'],
        weight_decay=0.01,
        warmup_ratio=0.1,
        fp16=use_cuda,
        eval_strategy='epoch',
        save_strategy='epoch',
        # Bound checkpoint disk usage; the best checkpoint is still retained
        # for load_best_model_at_end.
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model='f1_macro',
        greater_is_better=True,
        report_to='none',
        seed=SEED,
    )

    # Arguments shared by both trainer flavours.
    trainer_kwargs = dict(
        model=model, args=args,
        train_dataset=train_ds, eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )
    if config.get('focal_gamma'):
        trainer = FocalLossTrainer(focal_gamma=config['focal_gamma'], **trainer_kwargs)
    else:
        trainer = Trainer(**trainer_kwargs)

    start = time.time()
    trainer.train()
    train_time = time.time() - start

    results = trainer.evaluate()
    results['train_time'] = train_time

    print(f"F1-Macro: {results['eval_f1_macro']:.5f}, Tempo: {train_time/60:.1f} min")

    # Drop every local reference to the model before releasing cached GPU memory.
    del model, trainer, trainer_kwargs
    if use_cuda:
        torch.cuda.empty_cache()

    return results

In [None]:
# ===== TOKENIZER =====
# Load the tokenizer for MODEL_PATH from local files only.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    local_files_only=True,
)
print('Tokenizer carregado!')

In [None]:
# ===== EXPERIMENTS =====
# Every run shares batch_size=16 and max_length=256; only the name, the
# learning rate and the optional focal-loss gamma vary.
configs = [
    {'name': name, 'lr': lr, 'batch_size': 16, 'max_length': 256, **extra}
    for name, lr, extra in (
        ('CE_baseline', 2e-5, {}),
        ('Focal_g2', 2e-5, {'focal_gamma': 2.0}),
        ('Focal_g3', 2e-5, {'focal_gamma': 3.0}),
        ('Focal_lr3e5', 3e-5, {'focal_gamma': 2.0}),
    )
]

# One (name, macro-F1, training minutes) tuple per configuration.
all_results = []
for config in configs:
    results = train_and_evaluate(config, tokenizer)
    all_results.append(
        (config['name'], results['eval_f1_macro'], results['train_time']/60)
    )

In [None]:
# ===== SUMMARY =====
# Print a table of all runs, best macro-F1 first, followed by reference scores.
print("\n" + "="*60)
print("📊 RESUMO - DistilBERT Validation")
print("="*60)

print(f"{'Config':<20} {'F1-Macro':>10} {'Tempo (min)':>12}")
print("-"*45)
for name, f1, t in sorted(all_results, key=lambda row: row[1], reverse=True):
    print(f"{name:<20} {f1:>10.5f} {t:>12.1f}")

# Plain strings: these lines have no placeholders, so no f-prefix is needed.
print("\n📝 Score anterior: 0.55229")
print("📝 Referência (BERTimbau v4): 0.82073")
print("\n⏱️ DistilBERT é ~2x mais rápido que BERT full")