# Análisis de Clasificación de Emociones en Tweets en Español

## 1. Importación de Librerías y Configuración Inicial

Importo todas las librerías necesarias para el análisis exploratorio, preprocesamiento de datos y construcción del modelo.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import re
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Conv1D, MaxPooling1D, Dense, Dropout,
    GlobalMaxPooling1D, LayerNormalization, Attention
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Plot appearance: start from matplotlib's defaults, then apply the notebook-wide
# palette, figure size and font size.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Seed NumPy and TensorFlow with the same value (42).
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(42)

In [None]:
# Install the Hugging Face packages imported by the transformer sections (-q: quiet output).
!pip install transformers datasets accelerate -q

## 2. Descarga y Carga del Dataset EmoEvent

Descargo los embeddings de FastText en español y el dataset EmoEvent desde su repositorio de GitHub, y compruebo que los archivos de la versión en español estén disponibles para cargarlos.

In [None]:
import os

import subprocess

repo_url = "https://github.com/fmplaza/EmoEvent.git"
repo_dir = "EmoEvent"


def _run_command(args):
    """Run an external command without a shell and return True when it exits with 0.

    Failures are printed instead of raised so the remaining steps of the cell
    still run and report what is missing.
    """
    try:
        completed = subprocess.run(args, check=False)
    except FileNotFoundError:
        print(f"Comando no encontrado: {args[0]}")
        return False
    if completed.returncode != 0:
        print(f"El comando '{' '.join(args)}' terminó con código {completed.returncode}")
        return False
    return True


# Download and decompress the Spanish FastText embeddings if they are not present.
fasttext_file = 'cc.es.300.vec'
if not os.path.exists(fasttext_file):
    print("Descargando embeddings de FastText...")
    # -c resumes a partial download instead of saving a second copy next to it.
    downloaded = _run_command(
        ['wget', '-c', 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz']
    )
    if downloaded:
        print("Descomprimiendo embeddings...")
        _run_command(['gunzip', 'cc.es.300.vec.gz'])
else:
    print(f"El archivo de embeddings '{fasttext_file}' ya existe.")

# Clone the dataset repository only once.
if not os.path.exists(repo_dir):
    print(f"Clonando repositorio desde {repo_url}...")
    _run_command(['git', 'clone', repo_url])
else:
    print(f"El repositorio '{repo_dir}' ya existe.")

data_dir = os.path.join(repo_dir, 'splits', 'es')
files = {
    'train': os.path.join(data_dir, 'train.tsv'),
    'dev': os.path.join(data_dir, 'dev.tsv'),
    'test': os.path.join(data_dir, 'test.tsv')
}

# Report missing split files explicitly instead of staying silent about them.
missing_files = [filepath for filepath in files.values() if not os.path.exists(filepath)]
all_files_exist = not missing_files

if all_files_exist:
    print("Archivos del dataset listos para cargar.")
else:
    print(f"Faltan archivos del dataset: {missing_files}")

In [None]:
def load_dataset(filepath):
    """Read one tab-separated split and return its four working columns.

    Args:
        filepath: Path or file-like object accepted by ``pd.read_csv``; the
            first row is used as the header.

    Returns:
        DataFrame with the columns ``event``, ``text``, ``emotion`` and
        ``is_offensive``, in that order.

    Raises:
        KeyError: If, after renaming, any of the expected columns is missing.
    """
    column_fixes = {'tweet': 'text', 'offensive': 'is_offensive', 'emotion': 'emotion_label'}
    raw = pd.read_csv(filepath, sep='\t', header=0)
    # Normalise the header names first, then keep only the columns of interest.
    selected = raw.rename(columns=column_fixes)[['event', 'text', 'emotion_label', 'is_offensive']]
    selected.columns = ['event', 'text', 'emotion', 'is_offensive']
    return selected

# Load the three Spanish splits from the cloned repository.
train_df, dev_df, test_df = (
    load_dataset(f'EmoEvent/splits/es/{split_name}.tsv')
    for split_name in ('train', 'dev', 'test')
)

# Stack all splits into one frame with a fresh 0..n-1 index for the exploratory analysis.
full_df = pd.concat([train_df, dev_df, test_df], ignore_index=True)

print(f"Dataset completo cargado:")
for split_label, frame in (
    ('Entrenamiento', train_df),
    ('Desarrollo', dev_df),
    ('Prueba', test_df),
    ('Total', full_df),
):
    print(f"- {split_label}: {len(frame)} tweets")

display(full_df.head())

## 3. Análisis Exploratorio de Datos (EDA)

Realizo un análisis del dataset para comprender la distribución de emociones, eventos y características del texto.

### 3.1 Información General y Limpieza del Dataset

Examino la estructura del dataset, valores nulos y tipos de datos. También realizo una limpieza inicial.

In [None]:
print("Forma del dataset original:", full_df.shape)
print("\nValores nulos antes de la limpieza:")
print(full_df.isnull().sum())

# Drop, in place, every row that has a missing value in any column.
full_df.dropna(inplace=True)

print("\nForma del dataset después de la limpieza:", full_df.shape)
print("\nValores nulos después de la limpieza:")
print(full_df.isnull().sum())

# Per-tweet length in characters and in whitespace-separated words.
tweet_text = full_df['text']
full_df['text_length'] = tweet_text.str.len()
full_df['word_count'] = tweet_text.str.split().str.len()
print(full_df[['text_length', 'word_count']].describe())

# Word-count percentiles, used to choose a sequence length.
word_counts = full_df['word_count']
percentile_95 = int(word_counts.quantile(0.95))
percentile_99 = int(word_counts.quantile(0.99))

print(f"\nAnálisis de Longitud de Secuencia:")
for coverage, word_limit in ((95, percentile_95), (99, percentile_99)):
    print(f"El {coverage}% de los tweets tienen {word_limit} palabras o menos.")

### 3.2 Distribución de Emociones

Analizo la distribución de las categorías emocionales en el dataset.

In [None]:
# Absolute and relative frequency of every emotion label.
emotion_column = full_df['emotion']
emotion_counts = emotion_column.value_counts()
emotion_percentages = emotion_column.value_counts(normalize=True) * 100

emotion_stats = pd.DataFrame({
    'Cantidad': emotion_counts,
    'Porcentaje': emotion_percentages.round(2)
})
print(emotion_stats)

# Left panel: bar chart of counts. Right panel: pie chart of proportions.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

emotion_counts.plot(kind='bar', ax=bar_ax, color='skyblue', edgecolor='black')
bar_ax.set_title('Distribución de Emociones')
bar_ax.set_xlabel('Emociones')
bar_ax.set_ylabel('Cantidad de Tweets')
bar_ax.tick_params(axis='x', rotation=45)

pie_colors = sns.color_palette('husl', len(emotion_counts))
pie_ax.pie(
    emotion_counts.values,
    labels=emotion_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=pie_colors,
)
pie_ax.set_title('Proporción de Emociones')

plt.tight_layout()
plt.show()

### 3.3 Análisis por Eventos

Exploro la distribución de emociones por evento.

In [None]:
event_counts = full_df['event'].value_counts()
print(f"Número total de eventos únicos: {len(event_counts)}")
print("\nTop 10 eventos más frecuentes:")
top_event_counts = event_counts.head(10)
print(top_event_counts)

# Cross-tabulate event vs. emotion using only the rows of the ten most frequent events.
top_events = top_event_counts.index
top_event_rows = full_df[full_df['event'].isin(top_events)]
emotion_event_matrix = pd.crosstab(top_event_rows['event'], top_event_rows['emotion'])

plt.figure(figsize=(12, 8))
sns.heatmap(emotion_event_matrix, annot=True, fmt='d', cmap='YlOrRd')
plt.title('Distribución de Emociones por Evento (Top 10)')
plt.xlabel('Emociones')
plt.ylabel('Eventos')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

### 3.4 Análisis de Características del Texto

In [None]:
# Text-length distributions per emotion. The figure is laid out as a 2x2 grid,
# but only the two panels of the top row are drawn.
plt.figure(figsize=(14, 10))
# Panel 1: stacked histogram of character length, coloured by emotion.
plt.subplot(2, 2, 1)
sns.histplot(data=full_df, x='text_length', hue='emotion', multiple='stack', bins=30)
plt.title('Distribución de Longitud por Emoción')

# Panel 2: box plot of character length for each emotion.
plt.subplot(2, 2, 2)
sns.boxplot(data=full_df, x='emotion', y='text_length')
plt.xticks(rotation=45)
plt.title('Longitud de Texto por Emoción')

plt.tight_layout()
plt.show()

# Mean, standard deviation, minimum and maximum of both length measures per emotion.
text_stats = full_df.groupby('emotion')[['text_length', 'word_count']].agg(['mean', 'std', 'min', 'max'])
print(text_stats.round(2))

In [None]:
# Fetch the NLTK resources used below (stopword lists and the punkt tokenizer data).
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# ASCII punctuation plus the Spanish opening marks and common typographic
# quotes/dashes/ellipsis, none of which are part of `string.punctuation`.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '¿¡«»“”‘’…–—')


def basic_preprocess(text):
    """Normalise a tweet for word-frequency analysis.

    Steps, in order: lowercase, remove URLs, remove @mentions and the ``#``
    sign (the hashtag word is kept), remove punctuation (including Spanish
    ``¿``/``¡``), remove digits, collapse whitespace.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string (for
        example a missing value), matching ``advanced_preprocess``.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # `http\S+` already covers https URLs.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+|#', '', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\d+', '', text)
    return ' '.join(text.split())

# Cleaned text for the word-level EDA; rows without an emotion label are excluded.
full_df['text_processed'] = full_df['text'].apply(basic_preprocess)
full_df_cleaned = full_df.dropna(subset=['emotion']).copy()

# NLTK's Spanish stopword list extended with Twitter shorthand and very common verbs.
additional_stopwords = {'rt', 'via', 'q', 'si', 'ya', 'ser', 'estar', 'tener', 'hacer'}
spanish_stopwords = set(stopwords.words('spanish')) | additional_stopwords

def get_top_words(texts, n=20):
    """Return the ``n`` most frequent informative words across ``texts``.

    Each text is tokenised with ``word_tokenize``; tokens found in the
    module-level ``spanish_stopwords`` set or shorter than three characters
    are ignored.

    Args:
        texts: Iterable of already-preprocessed strings.
        n: Number of (word, count) pairs to return.

    Returns:
        List of ``(word, count)`` tuples ordered by descending count.
    """
    frequencies = Counter()
    for document in texts:
        frequencies.update(
            token
            for token in word_tokenize(document)
            if len(token) > 2 and token not in spanish_stopwords
        )
    return frequencies.most_common(n)

# Print the ten most frequent words for every emotion, in order of first appearance.
for emotion in full_df_cleaned['emotion'].unique():
    has_emotion = full_df_cleaned['emotion'] == emotion
    emotion_texts = full_df_cleaned.loc[has_emotion, 'text_processed'].tolist()
    print(f"\n{emotion.upper()}:")
    for word, count in get_top_words(emotion_texts, 10):
        print(f"  {word}: {count}")

### 3.5 Visualización con Nubes de Palabras

In [None]:
emotions = full_df_cleaned['emotion'].unique()
n_emotions = len(emotions)
cols = 3
rows = -(-n_emotions // cols)  # ceiling division: enough rows for every emotion

plt.figure(figsize=(18, 6 * rows))

for panel_number, emotion in enumerate(emotions, start=1):
    plt.subplot(rows, cols, panel_number)

    # All processed tweets of this emotion joined into one corpus string.
    has_emotion = full_df_cleaned['emotion'] == emotion
    emotion_texts = ' '.join(full_df_cleaned.loc[has_emotion, 'text_processed'].dropna())

    if not emotion_texts.strip():
        # Nothing to draw: show a placeholder message in the panel instead.
        plt.text(0.5, 0.5, f'No hay suficientes palabras para\n "{emotion.upper()}"',
                 horizontalalignment='center', verticalalignment='center',
                 fontsize=12, color='gray')
    else:
        wordcloud = WordCloud(
            width=400, height=300,
            background_color='white',
            stopwords=spanish_stopwords,
            max_words=50,
            colormap='viridis'
        ).generate(emotion_texts)
        plt.imshow(wordcloud, interpolation='bilinear')

    # Title and hidden axes are shared by both cases.
    plt.title(f'Nube de Palabras: {emotion.upper()}')
    plt.axis('off')

plt.tight_layout()
plt.show()

## 4. Preprocesamiento de Datos para Modelado

Preparo los datos para el entrenamiento del modelo, incluyendo tokenización, codificación de etiquetas y división de conjuntos.

In [None]:
from nltk.corpus import stopwords

# Spanish stopwords for modelling: negation words are kept out of the list
# because they matter for sentiment, and placeholder tokens for URL, MENTION,
# etc. are added because they are still worth removing.
# NOTE(review): `advanced_preprocess` below does not apply this set — confirm where it is used.
negations = {'no', 'ni', 'ninguno', 'nada'}
additional_stopwords = {'rt', 'via', 'q', 'si', 'ya', 'ser', 'estar', 'tener', 'hacer', 'url', 'mention', 'hashtag', 'exclamation', 'question'}
spanish_stopwords = (set(stopwords.words('spanish')) - negations) | additional_stopwords

def advanced_preprocess(text):
    """Lightly clean a tweet for the neural models.

    Lowercases the text, removes URLs and shortens any run of three or more
    identical characters to two (``"holaaaa"`` -> ``"holaa"``). Whitespace and
    punctuation are left untouched.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Inside a raw string the class must be written `\S`; the doubled
    # backslash matched a literal "\S", so no URL was ever removed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return text

# Combine the train and dev splits before preprocessing.
train_combined = pd.concat([train_df, dev_df], ignore_index=True)

# Replace missing texts with '' BEFORE casting to str: casting first would turn
# NaN into the literal text 'nan', which fillna can no longer detect.
train_combined['text_clean'] = train_combined['text'].fillna('').astype(str).apply(advanced_preprocess)
test_df['text_clean'] = test_df['text'].fillna('').astype(str).apply(advanced_preprocess)

In [None]:
# Rows without an emotion label cannot be encoded; drop them from both sets so
# the label encoder never sees a missing value.
test_df_cleaned = test_df.dropna(subset=['emotion']).copy()
train_combined = train_combined.dropna(subset=['emotion']).reset_index(drop=True)

label_encoder = LabelEncoder()
train_combined['emotion_encoded'] = label_encoder.fit_transform(train_combined['emotion'])
test_df_cleaned['emotion_encoded'] = label_encoder.transform(test_df_cleaned['emotion'])

emotion_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
num_classes = len(label_encoder.classes_)


def _build_additional_features(frame):
    """Return one-hot `event` columns plus a 0/1 `is_offensive` column as integers."""
    event_dummies = pd.get_dummies(frame['event'], prefix='event', dtype=int)
    # NOTE(review): assumes offensive tweets are marked with the value 'yes' — confirm against the data.
    offensive_flag = (frame['is_offensive'] == 'yes').astype(int).rename('is_offensive')
    return pd.concat([event_dummies, offensive_flag], axis=1)


# Feature engineering. The feature schema comes from the training data only:
# the test matrix is re-indexed to the training columns, so categories missing
# from test become zero columns and test-only categories are dropped. This also
# makes the column order deterministic.
additional_features = _build_additional_features(train_combined)
additional_features_test = _build_additional_features(test_df_cleaned).reindex(
    columns=additional_features.columns, fill_value=0
)

X_train_text = train_combined['text_clean'].values
X_train_features = additional_features.values
y_train = train_combined['emotion_encoded'].values
X_test_text = test_df_cleaned['text_clean'].values
X_test_features = additional_features_test.values
y_test = test_df_cleaned['emotion_encoded'].values

# Stratified 80/20 hold-out split of the training data.
X_train_text_split, X_val_text_split, X_train_features_split, X_val_features_split, y_train_split, y_val_split = train_test_split(
    X_train_text, X_train_features, y_train, test_size=0.2, random_state=42, stratify=y_train
)

## 5. Tokenización y Preparación de Secuencias

In [None]:
MAX_VOCAB_SIZE = 15000
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300  # matches the dimensionality of the FastText vectors

# Word-level tokenizer: no character filtering and no lowercasing here, so the
# text is used exactly as produced by the preprocessing step.
tokenizer = Tokenizer(
    num_words=MAX_VOCAB_SIZE,
    oov_token='<OOV>',
    filters='',
    lower=False
)

tokenizer.fit_on_texts(X_train_text_split)

X_train_seq = tokenizer.texts_to_sequences(X_train_text_split)
X_val_seq = tokenizer.texts_to_sequences(X_val_text_split)
X_test_seq = tokenizer.texts_to_sequences(X_test_text)

# Pad/truncate at the end so every sequence has MAX_SEQUENCE_LENGTH ids.
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# +1 because index 0 is reserved for padding.
vocab_size = len(tokenizer.word_index) + 1

y_train_cat = tf.keras.utils.to_categorical(y_train_split, num_classes)
y_val_cat = tf.keras.utils.to_categorical(y_val_split, num_classes)
y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes)

# --- Load FastText embeddings and build the embedding matrix ---
print('Cargando vectores de palabras de FastText...')
embeddings_index = {}
vocabulary = tokenizer.word_index
with open('cc.es.300.vec', 'r', encoding='utf-8') as f:
    # The first line of a .vec file holds the word count and the dimension.
    next(f)
    for line in f:
        # Split on the first plain space only: a bare `split()` also breaks on
        # Unicode whitespace inside a token and corrupts the numeric fields.
        word, _, vector_text = line.rstrip('\n').partition(' ')
        # Only vectors for words in the tokenizer vocabulary are needed;
        # skipping the rest avoids parsing and storing the whole file.
        if word in vocabulary:
            embeddings_index[word] = np.asarray(vector_text.split(), dtype='float32')
print(f'Se encontraron {len(embeddings_index)} vectores de palabras.')

# Weight matrix indexed by tokenizer id; words without a pretrained vector
# keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in vocabulary.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## 6. Construcción del Modelo Multi-Entrada: BiLSTM + Attention

### 6.1 Definición de la Arquitectura del Modelo

In [None]:
from tensorflow.keras.layers import concatenate

def create_multi_input_model(vocab_size, embedding_dim, max_length, num_classes, num_additional_features):
    """Build the two-input classifier: a BiLSTM + attention text branch joined with extra features.

    The embedding layer is initialised from the module-level ``embedding_matrix``
    and frozen. NOTE(review): this presumably requires ``embedding_matrix`` to
    have shape ``(vocab_size, embedding_dim)`` — confirm at the call site.

    Args:
        vocab_size: Number of rows of the embedding layer.
        embedding_dim: Size of each embedding vector.
        max_length: Length of the padded token-id sequences.
        num_classes: Number of output classes (softmax units).
        num_additional_features: Width of the second, non-text input.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``[input_text, input_features]``.
    """
    # Text input branch
    input_text = Input(shape=(max_length,), name='input_text')
    embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
        name='embedding'
    )(input_text)
    embedding_dropout = Dropout(0.2, name='embedding_dropout')(embedding)

    # BiLSTM layer (returns the full sequence so attention can be applied over time steps)
    bilstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
    )(embedding_dropout)

    # Attention layer: the BiLSTM output is passed as both query and value
    attention = Attention(name='attention')([bilstm, bilstm])

    # Pooling: reduce the attended sequence to one vector
    pool = GlobalMaxPooling1D(name='maxpool')(attention)

    # Additional features input branch
    input_features = Input(shape=(num_additional_features,), name='input_features')

    # Concatenate text features with additional features
    concatenated = concatenate([pool, input_features])

    # Dense layers for classification
    dense1 = Dense(128, activation='relu', name='dense_1')(concatenated)
    dense1_dropout = Dropout(0.5, name='dense_dropout_1')(dense1)

    output = Dense(num_classes, activation='softmax', name='output')(dense1_dropout)

    model = Model(inputs=[input_text, input_features], outputs=output, name='BiLSTM_Attention_Features_Model')
    return model

# Build one model instance sized from the prepared data and print its layer summary.
n_extra_features = X_train_features.shape[1]
model = create_multi_input_model(
    vocab_size,
    EMBEDDING_DIM,
    MAX_SEQUENCE_LENGTH,
    num_classes,
    n_extra_features,
)
model.summary()

### 6.2 Compilación del Modelo

In [None]:
import tensorflow.keras.backend as K

def focal_loss(gamma=2., alpha=.25):
    """Create a focal-loss function usable as a Keras ``loss``.

    Args:
        gamma: Exponent applied to the modulating factor.
        alpha: Weight of the positive term; the negative term uses ``1 - alpha``.

    Returns:
        A function ``(y_true, y_pred) -> scalar`` that sums the mean positive
        and mean negative focal terms.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Predictions at positions labelled 1 (1 elsewhere, which contributes ~0).
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # Predictions at positions labelled 0 (0 elsewhere, which contributes ~0).
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        positive_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1 + K.epsilon()))
        negative_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0 + K.epsilon()))
        return -positive_term - negative_term
    return focal_loss_fixed

# Stop when the validation loss has not improved for 3 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
callbacks = [early_stopping]

# --- Class imbalance handling: manually chosen weight per encoded class id ---
# NOTE(review): the keys are label-encoder ids; confirm they line up with `label_encoder.classes_`.
class_weights_dict = {0: 1.4, 1: 3.0, 2: 4.0, 3: 0.7, 4: 0.5, 5: 1.2, 6: 2.0}

initial_learning_rate = 0.001
optimizer = Adam(learning_rate=initial_learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-7)
model.compile(optimizer=optimizer, loss=focal_loss(), metrics=['accuracy'])

print("Pesos de clase definidos manualmente:")
print(class_weights_dict)

### 6.3 Entrenamiento del Modelo

In [None]:
from sklearn.model_selection import StratifiedKFold

BATCH_SIZE = 100
EPOCHS = 30
N_SPLITS = 5

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

histories = []
scores = []


def _texts_to_padded(texts):
    """Convert raw texts to fixed-length id sequences with the shared tokenizer."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')


# The tokenizer fitted earlier defines the word -> id mapping that
# `embedding_matrix` was built from. Fitting it again inside a fold rebuilds
# that mapping from accumulated counts, so the frozen embedding rows would
# belong to the wrong words and new ids could exceed the embedding size.
# It is therefore reused unchanged, and the test inputs are prepared once.
X_test_seq = tokenizer.texts_to_sequences(X_test_text)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_test_features_float = X_test_features.astype(np.float32)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_text, y_train)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")

    # Split data for this fold
    X_train_pad_fold = _texts_to_padded(X_train_text[train_idx])
    X_val_pad_fold = _texts_to_padded(X_train_text[val_idx])
    X_train_features_fold = X_train_features[train_idx].astype(np.float32)
    X_val_features_fold = X_train_features[val_idx].astype(np.float32)

    # Convert labels to one-hot vectors
    y_train_cat_fold = tf.keras.utils.to_categorical(y_train[train_idx], num_classes)
    y_val_cat_fold = tf.keras.utils.to_categorical(y_train[val_idx], num_classes)

    # A fresh model per fold so no weights carry over between folds
    model = create_multi_input_model(
        vocab_size=vocab_size,
        embedding_dim=EMBEDDING_DIM,
        max_length=MAX_SEQUENCE_LENGTH,
        num_classes=num_classes,
        num_additional_features=X_train_features.shape[1]
    )
    model.compile(optimizer=Adam(learning_rate=0.001), loss=focal_loss(), metrics=['accuracy'])

    # Train the model
    history = model.fit(
        [X_train_pad_fold, X_train_features_fold],
        y_train_cat_fold,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=([X_val_pad_fold, X_val_features_fold], y_val_cat_fold),
        callbacks=callbacks,
        class_weight=class_weights_dict,
        verbose=1
    )
    histories.append(history)

    # Score this fold's model on the held-out test set
    y_pred_proba = model.predict([X_test_pad, X_test_features_float], batch_size=BATCH_SIZE, verbose=0)
    y_pred_classes = np.argmax(y_pred_proba, axis=1)
    report = classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_, output_dict=True)
    scores.append(report)

# --- Aggregate and display results ---
# These are test-set metrics averaged over the models trained on each fold.
avg_accuracy = np.mean([s['accuracy'] for s in scores])
avg_precision = np.mean([s['weighted avg']['precision'] for s in scores])
avg_recall = np.mean([s['weighted avg']['recall'] for s in scores])
avg_f1 = np.mean([s['weighted avg']['f1-score'] for s in scores])

print("\n--- Cross-Validation Results ---")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Weighted Precision: {avg_precision:.4f}")
print(f"Average Weighted Recall: {avg_recall:.4f}")
print(f"Average Weighted F1-Score: {avg_f1:.4f}")

## 7. Modelo 2: DeBERTa-v3 con Fine-tuning Avanzado

Ahora, implementaremos un modelo de Transformer de última generación, DeBERTa-v3, y lo afinaremos para nuestra tarea de clasificación de emociones.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, ClassLabel
import numpy as np

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# NOTE(review): the tweets are Spanish; confirm this checkpoint is the intended one for that language.
MODEL_NAME = 'microsoft/deberta-v3-base'

# Tokenizer that belongs to the chosen checkpoint.
deberta_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Hugging Face datasets built from the previously cleaned text and encoded labels.
model_columns = ['text_clean', 'emotion_encoded']
train_dataset_hf = Dataset.from_pandas(train_combined[model_columns])
test_dataset_hf = Dataset.from_pandas(test_df_cleaned[model_columns])

def tokenize_function(examples):
    """Tokenize a batch's `text_clean` field, padded/truncated to MAX_SEQUENCE_LENGTH tokens."""
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': MAX_SEQUENCE_LENGTH,
    }
    return deberta_tokenizer(examples['text_clean'], **encoding_options)

class_names = label_encoder.classes_.tolist()
num_classes_deberta = len(class_names)


def _tokenize_and_label(dataset):
    """Tokenize a dataset, rename the target to `labels` and cast it to ClassLabel.

    The ClassLabel cast is what allows stratified splitting on the label column.
    """
    tokenized = dataset.map(tokenize_function, batched=True)
    tokenized = tokenized.rename_column("emotion_encoded", "labels")
    label_feature = ClassLabel(num_classes=num_classes_deberta, names=class_names)
    return tokenized.cast_column("labels", label_feature)


train_tokenized_dataset = _tokenize_and_label(train_dataset_hf)
test_tokenized_dataset = _tokenize_and_label(test_dataset_hf)

# Stratified 80/20 split of the training data into train and validation sets.
train_val_split = train_tokenized_dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column="labels")
train_dataset_deberta, val_dataset_deberta = train_val_split['train'], train_val_split['test']

print("Data prepared for DeBERTa model.")
for split_label, split_dataset in (
    ('Train', train_dataset_deberta),
    ('Validation', val_dataset_deberta),
    ('Test', test_tokenized_dataset),
):
    print(f"{split_label} dataset size: {len(split_dataset)}")

In [None]:
# Ejecuta esta celda primero
import os
from huggingface_hub import logout

# Log out of the Hugging Face Hub. Any failure is reported as "not logged in",
# but KeyboardInterrupt/SystemExit are no longer swallowed by a bare except.
try:
    logout()
    print("Desconectado de Hugging Face Hub")
except Exception:
    print("No estaba conectado")

# Environment flags that request offline mode from the Hugging Face libraries.
# NOTE(review): these may only take effect if set before the libraries are imported — confirm.
for offline_flag in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE"):
    os.environ[offline_flag] = "1"

print("Modo offline configurado")

In [None]:
# Configurar modo offline y desactivar todas las conexiones
import os
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["WANDB_DISABLED"] = "true"

# Label mappings with native Python types, derived from the label encoder.
id2label = {int(i): str(label) for i, label in enumerate(label_encoder.classes_)}
label2id = {str(label): int(i) for i, label in enumerate(label_encoder.classes_)}


def _load_deberta_classifier(**load_kwargs):
    """Load the classification model, move it to `device` and attach the label maps.

    `token=False` replaces the deprecated `use_auth_token=False`: no
    authentication token is sent. The label maps are set after loading,
    rather than passed as configuration arguments.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_classes_deberta,
        token=False,
        **load_kwargs,
    ).to(device)
    classifier.config.id2label = id2label
    classifier.config.label2id = label2id
    return classifier


# Prefer the locally cached files; if they are not available, retry without
# the local-only restriction.
# NOTE(review): with the offline flags above active the retry may not be able to download — confirm.
try:
    model_deberta = _load_deberta_classifier(local_files_only=True)
except (OSError, ValueError) as e:
    print(f"Error cargando modelo: {e}")
    model_deberta = _load_deberta_classifier()

# Training arguments.
# Warmup is expressed as a fraction of the total optimizer steps instead of a
# fixed 500 steps, so it scales with the dataset size and batch size.
# NOTE(review): with batch size 32 and 3 epochs, 500 warmup steps could cover most
# or all of training on a dataset of a few thousand tweets — confirm the step count.
training_args = TrainingArguments(
    output_dir='./results_deberta',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs_deberta',
    logging_steps=10,
    eval_strategy="epoch",  # current name of the former `evaluation_strategy` argument
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False,
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer evaluation.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    from sklearn.metrics import precision_recall_fscore_support, accuracy_score

    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    weighted_scores = precision_recall_fscore_support(labels, predicted_classes, average='weighted')
    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': weighted_scores[2],
        'precision': weighted_scores[0],
        'recall': weighted_scores[1]
    }

# Trainer wiring: model, arguments, train/validation data and the metrics callback.
trainer_options = {
    'model': model_deberta,
    'args': training_args,
    'train_dataset': train_dataset_deberta,
    'eval_dataset': val_dataset_deberta,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_options)

# Fine-tune the model.
print("Starting DeBERTa model training...")
trainer.train()
print("Training finished.")

In [None]:
# Evaluate on the test set
print("Evaluating DeBERTa model on the test set...")
test_results_deberta = trainer.predict(test_tokenized_dataset)

# Predicted class = argmax over the logits returned by the trainer.
y_true_deberta = test_tokenized_dataset['labels']
y_pred_deberta = np.argmax(test_results_deberta.predictions, axis=-1)
emotion_names = list(label_encoder.classes_)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print("\n--- Classification Report for DeBERTa ---")
print(classification_report(y_true_deberta, y_pred_deberta, target_names=emotion_names))

# Confusion matrix normalised per true class (each row sums to 1).
cm_deberta = confusion_matrix(y_true_deberta, y_pred_deberta)
cm_normalized_deberta = cm_deberta.astype('float') / cm_deberta.sum(axis=1)[:, np.newaxis]

plt.figure(figsize=(12, 10))
sns.heatmap(cm_normalized_deberta, annot=True, fmt='.3f', cmap='Blues', xticklabels=emotion_names, yticklabels=emotion_names)
plt.title('Matriz de Confusión Normalizada - DeBERTa')
plt.xlabel('Predicción')
plt.ylabel('Valor Real')
plt.show()

In [None]:
# Per-event accuracy of the DeBERTa predictions on the test set.
test_results_df_deberta = test_df_cleaned.copy()
test_results_df_deberta['predicted_emotion'] = label_encoder.inverse_transform(y_pred_deberta)
test_results_df_deberta['correct_prediction'] = (
    test_results_df_deberta['emotion'] == test_results_df_deberta['predicted_emotion']
)

# count = tweets per event, sum = correct predictions, mean = accuracy.
event_performance_deberta = (
    test_results_df_deberta.groupby('event')['correct_prediction']
    .agg(['count', 'sum', 'mean'])
    .round(4)
    .rename(columns={'count': 'total_tweets', 'sum': 'correct_predictions', 'mean': 'accuracy'})
    .sort_values('accuracy', ascending=False)
)

# Keep only events with at least two test tweets.
has_enough_tweets = event_performance_deberta['total_tweets'] >= 2
significant_events_deberta = event_performance_deberta[has_enough_tweets]

if significant_events_deberta.empty:
    print("No hay eventos con 2 o más tweets en el conjunto de prueba para un análisis significativo.")
else:
    print("=== RENDIMIENTO POR EVENTO (≥2 tweets) - DeBERTa ===")
    print(f"Eventos analizados: {len(significant_events_deberta)}")
    display(significant_events_deberta)

## 8. Modelo 3: Instruction-Tuning de un LLM (Flan-T5)

Para el tercer modelo, exploraremos el fine-tuning de un Large Language Model (LLM) que ha sido pre-entrenado con instrucciones. Usaremos `google/flan-t5-base`. La idea es formatear nuestras muestras de entrenamiento como instrucciones que el modelo debe seguir.

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import Dataset

LLM_MODEL_NAME = 'google/flan-t5-base'

# Tokenizer and sequence-to-sequence model from the same checkpoint; the model
# is moved to the selected device after loading.
llm_tokenizer = T5Tokenizer.from_pretrained(LLM_MODEL_NAME)
llm_model = T5ForConditionalGeneration.from_pretrained(LLM_MODEL_NAME)
llm_model = llm_model.to(device)

def create_instructional_dataset(df):
    """Turn a dataframe into an instruction dataset for the seq2seq model.

    A short, direct prompt is used: every row becomes
    ``{'text': "Classify emotion: <text_clean>", 'label': <emotion>}``.

    Args:
        df: DataFrame with the columns ``text_clean`` and ``emotion``.

    Returns:
        A ``datasets.Dataset`` with the columns ``text`` and ``label``.
    """
    records = [
        {'text': f"Classify emotion: {clean_text}", 'label': emotion}
        for clean_text, emotion in zip(df['text_clean'], df['emotion'])
    ]
    return Dataset.from_list(records)

# Instruction-formatted versions of the training (train + dev) and test data.
train_instruction_dataset, test_instruction_dataset = (
    create_instructional_dataset(frame) for frame in (train_combined, test_df_cleaned)
)

def tokenize_t5(examples):
    """Tokenize a batch of prompts and their target labels for seq2seq training.

    Args:
        examples: Batch dict with the list fields ``text`` (prompts) and
            ``label`` (target strings).

    Returns:
        The tokenizer output for the prompts (truncated to 64 tokens, not
        padded) with an added ``labels`` entry holding the target token ids
        (truncated to 8 tokens). Padding is left to the data collator.
    """
    # Plain Python lists are returned (return_tensors=None); batching into
    # tensors happens later in the collator.
    model_inputs = llm_tokenizer(
        examples['text'],
        max_length=64,
        padding=False,
        truncation=True,
        return_tensors=None
    )

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = llm_tokenizer(
        text_target=examples['label'],
        max_length=8,
        padding=False,
        truncation=True,
        return_tensors=None
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

def _tokenize_instructions(dataset):
    """Tokenize an instruction dataset in batches and drop its original text columns."""
    return dataset.map(
        tokenize_t5,
        batched=True,
        remove_columns=dataset.column_names
    )


train_tokenized_t5 = _tokenize_instructions(train_instruction_dataset)
test_tokenized_t5 = _tokenize_instructions(test_instruction_dataset)

# 80/20 split of the training data into train and validation sets.
train_val_split_t5 = train_tokenized_t5.train_test_split(test_size=0.2, seed=42)
train_dataset_t5, val_dataset_t5 = train_val_split_t5['train'], train_val_split_t5['test']

print("Data prepared for Flan-T5 model.")
for split_label, split_dataset in (
    ('Train', train_dataset_t5),
    ('Validation', val_dataset_t5),
    ('Test', test_tokenized_t5),
):
    print(f"{split_label} dataset size: {len(split_dataset)}")

# Sanity check: decode the first two tokenized training samples back to text.
print("\n--- Verificación de muestras tokenizadas ---")
for i in (0, 1):
    sample = train_dataset_t5[i]
    decoded_input = llm_tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
    decoded_label = llm_tokenizer.decode(sample['labels'], skip_special_tokens=True)
    print(f"Ejemplo {i+1}:")
    print(f"Input IDs shape: {len(sample['input_ids'])}")
    print(f"Labels shape: {len(sample['labels'])}")
    print(f"Input text (decoded): {decoded_input}")
    print(f"Label text (decoded): {decoded_label}")
    print("-" * 50)

In [None]:
from transformers import TrainingArguments

# Training configuration for fine-tuning Flan-T5.
training_args_t5 = TrainingArguments(
    output_dir='./results_t5',
    eval_strategy='steps',
    eval_steps=500,
    # With load_best_model_at_end=True, save_steps has to stay a round
    # multiple of eval_steps.
    save_steps=1000,
    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,
    save_total_limit=2,
    # Restore the checkpoint with the lowest validation loss after training.
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    push_to_hub=False,
    # The string "none" explicitly disables logging integrations; passing
    # None falls back to the library default, which may enable every
    # installed integration.
    report_to="none",
    seed=42,
)

In [None]:
from transformers import DataCollatorForSeq2Seq, Trainer
import torch

# Collator that pads inputs per batch and pads labels with -100, the value
# the loss ignores, returning PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=llm_tokenizer,
    model=llm_model,
    label_pad_token_id=-100,
    padding=True,
    return_tensors="pt"
)

# Build the trainer.
# NOTE(review): recent transformers releases deprecate Trainer's `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed
# version before switching.
trainer_t5 = Trainer(
    model=llm_model,
    args=training_args_t5,
    train_dataset=train_dataset_t5,
    eval_dataset=val_dataset_t5,
    data_collator=data_collator,
    tokenizer=llm_tokenizer
)

# Smoke-test the collator on two samples before starting a long training run.
print("Verificando formato de datos...")
sample_batch = [train_dataset_t5[i] for i in range(2)]
try:
    collated = data_collator(sample_batch)
except Exception as e:
    print(f"✗ Error en data collator: {e}")
    # Bare `raise` re-raises with the original traceback intact.
    raise
else:
    print("✓ Data collator funciona correctamente")
    print(f"Batch input_ids shape: {collated['input_ids'].shape}")
    print(f"Batch labels shape: {collated['labels'].shape}")

# Fine-tune the model.
print("Iniciando entrenamiento del modelo Flan-T5...")
trainer_t5.train()

# Persist the fine-tuned weights for the later prediction cells.
print("Entrenamiento completado. Guardando modelo...")
trainer_t5.save_model('./flan_t5_emotion_classifier')
print("Modelo guardado exitosamente.")

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import torch

def evaluate_t5_model():
    """Evaluate the Flan-T5 model on ``test_instruction_dataset``.

    Generates a label for every test example in batches, maps each generated
    text onto the known emotion labels, prints sample predictions, the label
    distributions and a classification report, and plots a row-normalized
    confusion matrix.

    Returns:
        tuple[list[str], list[str]]: ``(predictions, true_labels)``, aligned
        by position.
    """
    print("Evaluando modelo Flan-T5 en el conjunto de prueba...")

    # Put the model in evaluation mode before generating.
    llm_model.eval()

    # NOTE(review): predictions are always mapped into this list and the
    # report is restricted to it, so a true label spelled differently (for
    # example a catch-all named 'others') could never be matched — confirm
    # against the dataset's label vocabulary.
    valid_emotions = ['anger', 'sadness', 'joy', 'disgust', 'fear', 'surprise', 'offensive', 'other']

    def clean_prediction(pred_text):
        """Map raw generated text onto one of ``valid_emotions``."""
        pred_clean = pred_text.strip().lower()
        # An exact match wins.
        if pred_clean in valid_emotions:
            return pred_clean
        # Otherwise take the first valid emotion (in list order) that occurs
        # as a substring of the generated text.
        for emotion in valid_emotions:
            if emotion in pred_clean:
                return emotion
        # Nothing recognizable was generated.
        return 'other'

    batch_size = 16
    progress_every = 100  # report progress roughly every this many samples
    total_samples = len(test_instruction_dataset)

    all_texts = []
    all_predictions = []
    all_true_labels = []
    failed_batches = 0
    next_report = progress_every

    for start in range(0, total_samples, batch_size):
        end_idx = min(start + batch_size, total_samples)

        # Slicing the dataset returns a mapping of column name -> list of
        # values, so the whole batch is fetched in one access.
        batch = test_instruction_dataset[start:end_idx]
        batch_texts = batch['text']
        batch_labels = batch['label']

        try:
            inputs = llm_tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=64
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = llm_model.generate(
                    **inputs,
                    max_length=8,
                    num_beams=2,
                    do_sample=False,
                    early_stopping=True,
                    pad_token_id=llm_tokenizer.pad_token_id,
                    eos_token_id=llm_tokenizer.eos_token_id
                )

            decoded = llm_tokenizer.batch_decode(outputs, skip_special_tokens=True)
            batch_predictions = [clean_prediction(pred_raw) for pred_raw in decoded]
        except Exception as e:
            # Best effort: keep evaluating, but label the whole batch with
            # the fallback class and remember that it failed.
            print(f"Error en batch {start // batch_size + 1}: {e}")
            failed_batches += 1
            batch_predictions = ['other'] * len(batch_texts)

        # Texts, predictions and labels are appended together so the three
        # lists can never drift out of alignment.
        all_texts.extend(batch_texts)
        all_predictions.extend(batch_predictions)
        all_true_labels.extend(batch_labels)

        # end_idx advances in multiples of batch_size, so compare against a
        # moving threshold instead of testing divisibility by 100.
        if end_idx >= next_report or end_idx == total_samples:
            print(f"Procesado {end_idx}/{total_samples} muestras...")
            next_report = (end_idx // progress_every + 1) * progress_every

    if failed_batches:
        print(f"Advertencia: {failed_batches} lote(s) fallaron y se etiquetaron como 'other'")

    if not all_true_labels:
        # Nothing to score; skip the report and the plot.
        print("No hay muestras para evaluar.")
        return all_predictions, all_true_labels

    # Show a handful of example predictions.
    print("\n--- Ejemplos de predicciones ---")
    for text, real, predicted in zip(all_texts[:10], all_true_labels, all_predictions):
        print(f"Texto: {text[:100]}...")
        print(f"Real: {real} | Predicción: {predicted}")
        print("-" * 50)

    print("\n--- Reporte de Clasificación para Flan-T5 ---")
    print(f"Total de muestras procesadas: {len(all_true_labels)}")

    # Compare the predicted and true label distributions.
    from collections import Counter
    pred_dist = Counter(all_predictions)
    true_dist = Counter(all_true_labels)

    print(f"\nDistribución de predicciones: {dict(pred_dist)}")
    print(f"Distribución de etiquetas reales: {dict(true_dist)}")

    print("\n" + "="*60)
    report = classification_report(
        all_true_labels,
        all_predictions,
        labels=valid_emotions,
        target_names=valid_emotions,
        zero_division=0
    )
    print(report)

    # Plotting is best effort: a failure here must not discard the results.
    try:
        cm = confusion_matrix(all_true_labels, all_predictions, labels=valid_emotions)

        # Row-normalize; the epsilon avoids dividing by zero for labels that
        # never occur among the true labels.
        cm_normalized = cm.astype('float') / (cm.sum(axis=1)[:, np.newaxis] + 1e-8)

        plt.figure(figsize=(12, 10))
        sns.heatmap(cm_normalized,
                   annot=True,
                   fmt='.3f',
                   cmap='Blues',
                   xticklabels=valid_emotions,
                   yticklabels=valid_emotions)
        plt.title('Matriz de Confusión Normalizada - Flan-T5')
        plt.xlabel('Predicción')
        plt.ylabel('Etiqueta Real')
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Error al crear matriz de confusión: {e}")

    return all_predictions, all_true_labels

# Run the evaluation and keep the aligned predictions and true labels
predictions, true_labels = evaluate_t5_model()

In [None]:
import torch
from transformers import pipeline
import numpy as np
from tqdm import tqdm

print("Generando predicciones con el modelo Flan-T5 entrenado...")

# Same directory the training cell passes to `trainer_t5.save_model(...)`.
trained_model_path = './flan_t5_emotion_classifier'

try:
    # Prefer the checkpoint saved on disk.
    trained_model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
    trained_tokenizer = T5Tokenizer.from_pretrained(trained_model_path)
except Exception as load_error:
    # Fall back to the in-memory model from the trainer. `Exception` rather
    # than a bare `except` so that KeyboardInterrupt/SystemExit still
    # propagate, and the reason for the fallback is shown.
    print(f"No se pudo cargar el modelo guardado: {load_error}")
    trained_model = trainer_t5.model
    trained_tokenizer = llm_tokenizer
    print("✓ Usando modelo del trainer actual")
else:
    print("✓ Modelo entrenado cargado exitosamente")

# Move the model to the target device and switch to evaluation mode.
trained_model = trained_model.to(device)
trained_model.eval()

def predict_emotion_t5(text, model, tokenizer, max_length=64):
    """Predict the emotion of a single text with a fine-tuned T5 model.

    Args:
        text: Text to classify.
        model: Seq2seq model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer paired with ``model``.
        max_length: Maximum number of input tokens; longer prompts are
            truncated.

    Returns:
        The decoded generation with surrounding whitespace stripped.
    """
    # NOTE(review): the prompt is meant to match the template used to build
    # the training examples — confirm against the dataset-construction cell.
    prompt = f"Classify emotion: {text}"

    # Place the inputs on the device of the model that was passed in, so the
    # function does not depend on a notebook-level `device` variable.
    inputs = tokenizer(
        prompt,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=8,  # generation cap for the short emotion labels
            num_beams=4,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Only the top sequence is used.
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction.strip()

# Run the fine-tuned model over every cleaned test tweet.
print("Generando predicciones para el conjunto de prueba...")
final_predictions_t5 = []

# tqdm wraps the iteration to show a progress bar.
# NOTE(review): the "neutral" fallback below is not among the labels that
# evaluate_t5_model works with — confirm it is the intended placeholder.
clean_texts = test_df_cleaned['text_clean']
for i, clean_text in tqdm(clean_texts.items(), total=len(test_df_cleaned), desc="Predicciones"):
    try:
        final_predictions_t5.append(
            predict_emotion_t5(clean_text, trained_model, trained_tokenizer)
        )
    except Exception as e:
        # Keep one prediction per row even when generation fails.
        print(f"Error en predicción {i}: {e}")
        final_predictions_t5.append("neutral")

print(f"✓ Predicciones completadas: {len(final_predictions_t5)}")

# Spot-check the first few rows against their true emotion.
print("\n--- Verificación de predicciones ---")
preview_rows = test_df_cleaned.head(5)
for (_, preview_row), predicted in zip(preview_rows.iterrows(), final_predictions_t5):
    print(f"Texto: {preview_row['text_clean'][:100]}...")
    print(f"Emoción real: {preview_row['emotion']}")
    print(f"Predicción: {predicted}")
    print("-" * 50)

# Frequency of each predicted string, most common first.
from collections import Counter
prediction_counts = Counter(final_predictions_t5)
total_predictions = len(final_predictions_t5)
print("\n--- Distribución de predicciones ---")
for emotion, count in prediction_counts.most_common():
    percentage = count / total_predictions * 100
    print(f"{emotion}: {count} ({percentage:.1f}%)")

print(f"\nPredicciones únicas encontradas: {len(prediction_counts)}")
print("¡Predicciones generadas exitosamente! Ahora puedes ejecutar el análisis por eventos.")

In [None]:
# Per-event accuracy of the Flan-T5 predictions on the test set.
test_results_df_t5 = test_df_cleaned.copy()
test_results_df_t5['predicted_emotion'] = final_predictions_t5
# A prediction counts as correct only on an exact string match.
test_results_df_t5['correct_prediction'] = test_results_df_t5['emotion'].eq(
    test_results_df_t5['predicted_emotion']
)

# Tweets, hits and accuracy per event, best-performing events first.
event_performance_t5 = (
    test_results_df_t5
    .groupby('event')['correct_prediction']
    .agg(total_tweets='count', correct_predictions='sum', accuracy='mean')
    .round(4)
    .sort_values('accuracy', ascending=False)
)

# Only events with at least two tweets are reported.
has_enough_tweets = event_performance_t5['total_tweets'] >= 2
significant_events_t5 = event_performance_t5.loc[has_enough_tweets]

if significant_events_t5.empty:
    print("No hay eventos con 2 o más tweets en el conjunto de prueba para un análisis significativo.")
else:
    print("=== RENDIMIENTO POR EVENTO (≥2 tweets) - Flan-T5 ===")
    print(f"Eventos analizados: {len(significant_events_t5)}")
    display(significant_events_t5)

## 9. Evaluación de los modelos