In [9]:
# Instalar dependencias faltantes (ejecuta solo si es necesario)
%pip install -q transformers accelerate tiktoken protobuf sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [18]:
print("Hello1");

Hello1


## 📚 Imports y Configuración - DeBERTa-v3-base

Implementación completa con todas las técnicas avanzadas recomendadas.

In [16]:
# ============================================================================
# IMPORTS Y CONFIGURACIÓN INICIAL - DEBERTA-V3-BASE
# ============================================================================

import os
import re
import random
import warnings
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DebertaV2Tokenizer,
    TrainingArguments, Trainer, EarlyStoppingCallback,
    set_seed
)

# Desactivar warnings
warnings.filterwarnings("ignore")
set_seed(42)

# ============================================================================
# CONFIGURACIÓN GLOBAL
# ============================================================================

class Config:
    # Modelo
    MODEL_NAME = "microsoft/deberta-v3-base"
    MAX_LEN = 128  # Versos cortos (96-128)
    NUM_LABELS = 3  # Excluyendo clase 3 (mixed)

    # Entrenamiento
    BATCH_SIZE = 16
    LEARNING_RATE = 2e-5  # 1e-5 a 2e-5
    NUM_EPOCHS = 6  # 3-6 épocas
    WARMUP_RATIO = 0.05  # 5% warmup
    WEIGHT_DECAY = 0.01

    # Regularización
    LABEL_SMOOTHING = 0.1  # 0.05-0.1
    FOCAL_GAMMA = 1.5  # γ=1-2 para Focal Loss
    R_DROP_ALPHA = 4.0  # α≈4 para R-Drop

    # Validación
    N_FOLDS = 5  # Stratified k-fold
    EARLY_STOPPING_PATIENCE = 3

    # Device
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = Config()

print(f"✅ Configuración cargada")
print(f"   Device: {config.DEVICE}")
print(f"   Modelo: {config.MODEL_NAME}")
print(f"   Max length: {config.MAX_LEN}")
print(f"   Batch size: {config.BATCH_SIZE}")

# ============================================================================
# PREPROCESAMIENTO (sin stemming/stopwords - preservar matices)
# ============================================================================

def advanced_text_preprocessing(text: str) -> str:
    """
    Preprocesamiento para texto poético:
    - Normalizar comillas y guiones
    - NO stemming/stopwords (preservar matices)
    """
    if not isinstance(text, str):
        return ""

    # Normalizar comillas dobles
    text = text.replace('"', '"').replace('"', '"')
    
    # Normalizar comillas simples
    text = text.replace(''', "'").replace(''', "'")

    # Normalizar guiones
    text = text.replace('–', '-').replace('—', '-')

    # Eliminar espacios múltiples
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def remove_duplicate_verses(df: pd.DataFrame) -> pd.DataFrame:
    """Eliminar versos duplicados."""
    initial = len(df)
    df_clean = df.drop_duplicates(subset=['verse_text'], keep='first')
    print(f"   Eliminados {initial - len(df_clean)} versos duplicados")
    return df_clean

# ============================================================================
# FOCAL LOSS PARA DESBALANCE
# ============================================================================

class FocalLoss(nn.Module):
    """Focal Loss con class weights."""
    def __init__(self, alpha=None, gamma=config.FOCAL_GAMMA, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.alpha)
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        return focal_loss

# ============================================================================
# R-DROP REGULARIZATION
# ============================================================================

def r_drop_loss(logits1, logits2, targets, alpha=config.R_DROP_ALPHA):
    """R-Drop: α≈4 para mejor generalización."""
    loss1 = F.cross_entropy(logits1, targets)
    loss2 = F.cross_entropy(logits2, targets)
    
    kl_loss = F.kl_div(
        F.log_softmax(logits1, dim=-1),
        F.softmax(logits2, dim=-1),
        reduction='batchmean'
    )
    
    return (loss1 + loss2) / 2 + alpha * kl_loss

# ============================================================================
# DATASET CUSTOM
# ============================================================================

class PoemDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=config.MAX_LEN):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# ============================================================================
# CUSTOM TRAINER CON FOCAL LOSS Y R-DROP
# ============================================================================

class CustomTrainer(Trainer):
    def __init__(self, class_weights=None, use_r_drop=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.use_r_drop = use_r_drop

        if class_weights is not None:
            self.criterion = FocalLoss(alpha=class_weights.to(self.model.device))
        else:
            self.criterion = FocalLoss()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")

        if self.use_r_drop:
            outputs1 = model(**inputs)
            outputs2 = model(**inputs)
            loss = r_drop_loss(outputs1.logits, outputs2.logits, labels)
        else:
            outputs = model(**inputs)
            loss = self.criterion(outputs.logits, labels)

        return (loss, outputs if not self.use_r_drop else outputs1) if return_outputs else loss

# ============================================================================
# MÉTRICAS
# ============================================================================

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1_macro': f1_score(labels, predictions, average='macro'),
        'f1_weighted': f1_score(labels, predictions, average='weighted')
    }

# ============================================================================
# STRATIFIED K-FOLD TRAINING
# ============================================================================

def stratified_kfold_training(train_texts, train_labels, config):
    """Entrenamiento con Stratified K-Fold (5 folds)."""
    
    skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=42)
    fold_results = []
    best_models = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_texts, train_labels)):
        print(f"\n{'='*50}")
        print(f"FOLD {fold + 1}/{config.N_FOLDS}")
        print(f"{'='*50}")

        # Split data
        fold_train_texts = [train_texts[i] for i in train_idx]
        fold_train_labels = [train_labels[i] for i in train_idx]
        fold_val_texts = [train_texts[i] for i in val_idx]
        fold_val_labels = [train_labels[i] for i in val_idx]

        # Class weights
        class_weights = compute_class_weight(
            'balanced',
            classes=np.unique(fold_train_labels),
            y=fold_train_labels
        )
        class_weights = torch.tensor(class_weights, dtype=torch.float)
        
        # Tokenizer y modelo
        tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(
            config.MODEL_NAME,
            num_labels=config.NUM_LABELS
        )

        # Datasets
        train_dataset = PoemDataset(fold_train_texts, fold_train_labels, tokenizer)
        val_dataset = PoemDataset(fold_val_texts, fold_val_labels, tokenizer)

        # Training arguments (AdamW, cosine decay, warmup 5%)
        training_args = TrainingArguments(
            output_dir=f"./results/fold_{fold}",
            num_train_epochs=config.NUM_EPOCHS,
            per_device_train_batch_size=config.BATCH_SIZE,
            per_device_eval_batch_size=config.BATCH_SIZE * 2,
            learning_rate=config.LEARNING_RATE,
            weight_decay=config.WEIGHT_DECAY,
            warmup_ratio=config.WARMUP_RATIO,
            lr_scheduler_type="cosine",
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1_macro",
            greater_is_better=True,
            logging_steps=50,
            report_to="none",
            label_smoothing_factor=config.LABEL_SMOOTHING
        )

        # Trainer con R-Drop
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=config.EARLY_STOPPING_PATIENCE)],
            class_weights=class_weights,
            use_r_drop=True
        )

        # Entrenar
        trainer.train()

        # Evaluar
        eval_results = trainer.evaluate()
        fold_results.append(eval_results)
        best_models.append(trainer.model)

        print(f"Fold {fold + 1} - F1 Macro: {eval_results['eval_f1_macro']:.4f}")

    return fold_results, best_models

# ============================================================================
# ENSEMBLE DE LOGITS
# ============================================================================

def ensemble_predictions(models, tokenizer, test_texts, config):
    """Ensamblar logits de múltiples checkpoints."""
    all_predictions = []

    test_dataset = PoemDataset(test_texts, [0]*len(test_texts), tokenizer)

    for model in models:
        model.eval()
        model.to(config.DEVICE)
        predictions = []

        dataloader = DataLoader(test_dataset, batch_size=config.BATCH_SIZE * 4, shuffle=False)

        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(config.DEVICE)
                attention_mask = batch['attention_mask'].to(config.DEVICE)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                predictions.extend(outputs.logits.cpu().numpy())

        all_predictions.append(np.array(predictions))

    # Promediar logits
    ensemble_logits = np.mean(all_predictions, axis=0)
    ensemble_preds = np.argmax(ensemble_logits, axis=1)

    return ensemble_preds, ensemble_logits

print("\n✅ Todas las funciones cargadas correctamente")
print("🚀 Listo para entrenar DeBERTa-v3-base con técnicas avanzadas")

✅ Configuración cargada
   Device: cpu
   Modelo: microsoft/deberta-v3-base
   Max length: 128
   Batch size: 16

✅ Todas las funciones cargadas correctamente
🚀 Listo para entrenar DeBERTa-v3-base con técnicas avanzadas


In [13]:
# ============================================================================
# EVALUACIÓN FINAL EN TEST SET CON ENSEMBLE
# ============================================================================

print("🎯 Evaluando en conjunto de test con ensemble de modelos...")

# Cargar tokenizer (usaremos el del último fold, pero todos son iguales)
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)

# Ensemble predictions
ensemble_preds, ensemble_logits = ensemble_predictions(
    best_models, 
    tokenizer, 
    test_texts, 
    config
)

# Métricas finales
final_accuracy = accuracy_score(test_labels, ensemble_preds)
final_f1_macro = f1_score(test_labels, ensemble_preds, average='macro')
final_f1_weighted = f1_score(test_labels, ensemble_preds, average='weighted')

print("\n" + "="*70)
print("🏆 RESULTADOS FINALES EN TEST SET")
print("="*70)
print(f"✅ F1 Macro (Objetivo: ≥0.85):    {final_f1_macro:.4f}")
print(f"   F1 Weighted:                   {final_f1_weighted:.4f}")
print(f"   Accuracy:                      {final_accuracy:.4f}")
print("="*70)

if final_f1_macro >= 0.85:
    print("🎉 ¡OBJETIVO ALCANZADO! F1-macro ≥ 0.85")
else:
    gap = 0.85 - final_f1_macro
    print(f"⚠️  Falta {gap:.4f} para alcanzar el objetivo de 0.85")

# Classification Report
print("\n📋 Classification Report:")
print("-" * 70)
print(classification_report(test_labels, ensemble_preds, digits=4))

# Confusion Matrix
print("\n🔍 Confusion Matrix:")
cm = confusion_matrix(test_labels, ensemble_preds)
print(cm)

# Visualización
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Confusion Matrix Heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0])
axes[0].set_title('Confusion Matrix - Test Set')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('True')

# F1 Scores por clase
class_f1_scores = f1_score(test_labels, ensemble_preds, average=None)
axes[1].bar(range(len(class_f1_scores)), class_f1_scores, color=['#2ecc71', '#e74c3c', '#3498db'])
axes[1].set_xlabel('Clase')
axes[1].set_ylabel('F1 Score')
axes[1].set_title('F1 Score por Clase - Test Set')
axes[1].set_xticks(range(len(class_f1_scores)))
axes[1].axhline(y=0.85, color='red', linestyle='--', label='Objetivo (0.85)')
axes[1].legend()
axes[1].set_ylim([0, 1])

plt.tight_layout()
plt.show()

print("\n✅ Evaluación completada")

🎯 Evaluando en conjunto de test con ensemble de modelos...


NameError: name 'best_models' is not defined

## 🎯 Evaluación Final con Ensemble

Evaluar en el conjunto de test usando ensemble de los 5 mejores modelos.

In [17]:
# ============================================================================
# ENTRENAR CON STRATIFIED K-FOLD (5 FOLDS)
# ============================================================================

print("🚀 Iniciando entrenamiento con Stratified K-Fold...")
print(f"⚙️  Configuración:")
print(f"   Modelo: {config.MODEL_NAME}")
print(f"   Folds: {config.N_FOLDS}")
print(f"   Epochs: {config.NUM_EPOCHS}")
print(f"   Batch size: {config.BATCH_SIZE}")
print(f"   Learning rate: {config.LEARNING_RATE}")
print(f"   Focal Loss γ: {config.FOCAL_GAMMA}")
print(f"   R-Drop α: {config.R_DROP_ALPHA}")
print(f"   Label smoothing: {config.LABEL_SMOOTHING}")
print(f"   Early stopping patience: {config.EARLY_STOPPING_PATIENCE}")
print("\n" + "="*70)

# NOTA: Este entrenamiento puede tomar VARIAS HORAS en CPU
# Cada fold con 6 épocas puede tomar ~1-2 horas
# Total estimado: 5-10 horas para 5 folds

fold_results, best_models = stratified_kfold_training(
    train_texts, 
    train_labels, 
    config
)

print("\n" + "="*70)
print("✅ Entrenamiento K-Fold completado!")
print("\n📊 Resumen de resultados por fold:")
print("-" * 70)

for i, results in enumerate(fold_results):
    print(f"Fold {i+1}:")
    print(f"  F1 Macro:    {results['eval_f1_macro']:.4f}")
    print(f"  F1 Weighted: {results['eval_f1_weighted']:.4f}")
    print(f"  Accuracy:    {results['eval_accuracy']:.4f}")

# Promedios
avg_f1_macro = np.mean([r['eval_f1_macro'] for r in fold_results])
avg_f1_weighted = np.mean([r['eval_f1_weighted'] for r in fold_results])
avg_accuracy = np.mean([r['eval_accuracy'] for r in fold_results])
std_f1_macro = np.std([r['eval_f1_macro'] for r in fold_results])

print("-" * 70)
print(f"📈 Promedios ({config.N_FOLDS} folds):")
print(f"  F1 Macro:    {avg_f1_macro:.4f} ± {std_f1_macro:.4f}")
print(f"  F1 Weighted: {avg_f1_weighted:.4f}")
print(f"  Accuracy:    {avg_accuracy:.4f}")
print("="*70)

🚀 Iniciando entrenamiento con Stratified K-Fold...
⚙️  Configuración:
   Modelo: microsoft/deberta-v3-base
   Folds: 5
   Epochs: 6
   Batch size: 16
   Learning rate: 2e-05
   Focal Loss γ: 1.5
   R-Drop α: 4.0
   Label smoothing: 0.1
   Early stopping patience: 3


FOLD 1/5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For be

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.853617,0.657895,0.26455,0.522139
2,0.947100,0.622136,0.768421,0.542468,0.700595
3,0.742800,0.471198,0.857895,0.818079,0.855608
4,0.474700,0.444519,0.826316,0.789629,0.828775
5,0.322300,0.393939,0.873684,0.839238,0.872265
6,0.218200,0.398452,0.868421,0.835113,0.86754


Fold 1 - F1 Macro: 0.8392

FOLD 2/5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For be

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## 🚀 Entrenamiento con Stratified K-Fold

Entrenar DeBERTa-v3-base con Focal Loss, R-Drop y 5-fold cross-validation.

In [8]:
# ============================================================================
# CARGAR Y PREPARAR DATOS
# ============================================================================

print("📥 Cargando dataset poem_sentiment...")

splits = {
    "train": "data/train-00000-of-00001.parquet",
    "validation": "data/validation-00000-of-00001.parquet",
    "test": "data/test-00000-of-00001.parquet",
}
base_uri = "hf://datasets/google-research-datasets/poem_sentiment/"
parquet_engine = "fastparquet"

df_train = pd.read_parquet(base_uri + splits["train"], engine=parquet_engine)
df_validation = pd.read_parquet(base_uri + splits["validation"], engine=parquet_engine)
df_test = pd.read_parquet(base_uri + splits["test"], engine=parquet_engine)

print(f"✅ Datos cargados:")
print(f"   Train: {df_train.shape}")
print(f"   Validation: {df_validation.shape}")
print(f"   Test: {df_test.shape}")

# ============================================================================
# PREPROCESAMIENTO: EXCLUIR CLASE 3 (MIXED)
# ============================================================================

print("\n🔧 Preprocesando datos...")

# Excluir clase 3 (mixed) de todos los splits
df_train_clean = df_train[df_train['label'] != 3].copy()
df_validation_clean = df_validation[df_validation['label'] != 3].copy()
df_test_clean = df_test[df_test['label'] != 3].copy()

print(f"   Train: {len(df_train)} → {len(df_train_clean)} (excluidos {len(df_train) - len(df_train_clean)} mixed)")
print(f"   Validation: {len(df_validation)} → {len(df_validation_clean)} (excluidos {len(df_validation) - len(df_validation_clean)} mixed)")
print(f"   Test: {len(df_test)} → {len(df_test_clean)} (excluidos {len(df_test) - len(df_test_clean)} mixed)")

# Eliminar duplicados
print("\n🗑️  Eliminando duplicados...")
df_train_clean = remove_duplicate_verses(df_train_clean)
df_validation_clean = remove_duplicate_verses(df_validation_clean)
df_test_clean = remove_duplicate_verses(df_test_clean)

# Normalizar texto (sin stemming/stopwords)
print("\n✨ Normalizando texto (preservando matices poéticos)...")
df_train_clean['verse_text'] = df_train_clean['verse_text'].apply(advanced_text_preprocessing)
df_validation_clean['verse_text'] = df_validation_clean['verse_text'].apply(advanced_text_preprocessing)
df_test_clean['verse_text'] = df_test_clean['verse_text'].apply(advanced_text_preprocessing)

# Combinar train + validation para K-Fold
df_train_full = pd.concat([df_train_clean, df_validation_clean], ignore_index=True)

print(f"\n✅ Datos finales:")
print(f"   Train+Validation (para K-Fold): {len(df_train_full)}")
print(f"   Test: {len(df_test_clean)}")

# Distribución de clases
print(f"\n📊 Distribución de clases en Train+Validation:")
for label, count in df_train_full['label'].value_counts().sort_index().items():
    percentage = count / len(df_train_full) * 100
    print(f"   Clase {label}: {count:5d} ({percentage:5.2f}%)")

# Preparar datos para entrenamiento
train_texts = df_train_full['verse_text'].tolist()
train_labels = df_train_full['label'].tolist()

test_texts = df_test_clean['verse_text'].tolist()
test_labels = df_test_clean['label'].tolist()

print(f"\n🎯 Datos listos para entrenamiento con K-Fold")

📥 Cargando dataset poem_sentiment...
✅ Datos cargados:
   Train: (892, 3)
   Validation: (105, 3)
   Test: (104, 3)

🔧 Preprocesando datos...
   Train: 892 → 843 (excluidos 49 mixed)
   Validation: 105 → 105 (excluidos 0 mixed)
   Test: 104 → 104 (excluidos 0 mixed)

🗑️  Eliminando duplicados...
   Eliminados 0 versos duplicados
   Eliminados 0 versos duplicados
   Eliminados 0 versos duplicados

✨ Normalizando texto (preservando matices poéticos)...

✅ Datos finales:
   Train+Validation (para K-Fold): 948
   Test: 104

📊 Distribución de clases en Train+Validation:
   Clase 0:   174 (18.35%)
   Clase 1:   150 (15.82%)
   Clase 2:   624 (65.82%)

🎯 Datos listos para entrenamiento con K-Fold


## 📊 Carga y Preprocesamiento de Datos

Cargar dataset, eliminar clase 3 (mixed), duplicados y normalizar texto.