In [1]:
import polars as pl
import numpy as np
import spacy

In [2]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}

In [3]:
df_train = pl.read_parquet('hf://datasets/Jsevisal/go_emotions_wheel/' + splits['train'])
df_test = pl.read_parquet('hf://datasets/Jsevisal/go_emotions_wheel/' + splits['test'])

In [4]:
nlp = spacy.load('en_core_web_lg')

In [5]:
docs_train = list(nlp.pipe(df_train.select('text').to_series().to_list()))
docs_test = list(nlp.pipe(df_test.select('text').to_series().to_list()))

In [6]:
def preprocess_text(doc):
    return [token.text.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space]

In [7]:
processed_texts_train = [' '.join(preprocess_text(doc)) for doc in docs_train]
processed_texts_test = [' '.join(preprocess_text(doc)) for doc in docs_test]

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

In [22]:
y_train_raw = df_train['labels'].to_list()
y_test_raw = df_test['labels'].to_list()

all_labels_combined = y_train_raw + y_test_raw
all_unique_label_ids = sorted(list(set(item for sublist in all_labels_combined for item in sublist)))
mlb = MultiLabelBinarizer(classes=all_unique_label_ids)
mlb.fit(all_labels_combined)

y_train_multilabel = mlb.transform(y_train_raw)
y_test_multilabel = mlb.transform(y_test_raw)

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
tfidf = TfidfVectorizer(max_features=5000)

In [25]:
x_train = tfidf.fit_transform(processed_texts_train)
x_test = tfidf.transform(processed_texts_test)

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiLabelNN(nn.Module):
    def __init__(self, input_dim, output_dim, dropout_rate=0.5):
        super(MultiLabelNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, output_dim)
        self.dropout_rate = nn.Dropout(dropout_rate)        
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout_rate(x)
        x = F.relu(self.fc2(x))
        x = self.dropout_rate(x)
        return self.fc3(x)

In [27]:
model = MultiLabelNN(input_dim=5000, output_dim=y_train_multilabel.shape[1])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

MultiLabelNN(
  (fc1): Linear(in_features=5000, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=9, bias=True)
  (dropout_rate): Dropout(p=0.5, inplace=False)
)

In [28]:
len(y_train_raw)

43410

In [29]:
counts = np.sum(y_train_multilabel, axis=0) 
total = y_train_multilabel.shape[0]

counts[counts == 0] = 1
pos_weight = torch.tensor((total - counts) / counts, dtype=torch.float32)

In [30]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2, pos_weight=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduction = reduction
        self.pos_weight = pos_weight

    def forward(self, inputs, targets):
        # BCE with logits
        bce_loss = F.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none', pos_weight=self.pos_weight
        )
        # Probabilities
        probs = torch.sigmoid(inputs)
        pt = torch.where(targets == 1, probs, 1 - probs)
        focal_weight = (1 - pt) ** self.gamma

        loss = focal_weight * bce_loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        return loss

In [31]:
focal_criterion = FocalLoss(gamma=2, pos_weight=pos_weight.to(device)).to(device)

In [32]:
from torch.utils.data import TensorDataset, DataLoader

X_tensor = torch.tensor(x_train.toarray(), dtype=torch.float32)
Y_tensor = torch.tensor(y_train_multilabel, dtype=torch.float32)

train_dataset = TensorDataset(X_tensor, Y_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [33]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5): 
    model.train()
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = focal_criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")

Epoch 1 | Loss: 0.2369
Epoch 2 | Loss: 0.1910
Epoch 3 | Loss: 0.2003
Epoch 4 | Loss: 0.1248
Epoch 5 | Loss: 0.1162


In [34]:
model.eval()
with torch.no_grad():
    test_tensor = torch.tensor(x_test.toarray(), dtype=torch.float32).to(device)
    logits = model(test_tensor)
    probs = torch.sigmoid(logits).cpu().numpy()
    preds = (probs >= 0.5).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test_multilabel, preds))

              precision    recall  f1-score   support

           0       0.76      0.80      0.78      1650
           1       0.20      0.58      0.29       474
           2       0.27      0.76      0.39        98
           3       0.24      0.60      0.35       677
           4       0.27      0.67      0.39       379
           5       0.53      0.79      0.63      1787
           6       0.16      0.54      0.25        83
           7       0.28      0.66      0.39       726
           8       0.13      0.65      0.22       123

   micro avg       0.38      0.73      0.50      5997
   macro avg       0.32      0.67      0.41      5997
weighted avg       0.47      0.73      0.55      5997
 samples avg       0.46      0.74      0.54      5997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Optuna

In [28]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[K     |████████████████████████████████| 395 kB 6.1 MB/s eta 0:00:01
[?25hCollecting alembic>=1.5.0
  Downloading alembic-1.16.2-py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 85.8 MB/s eta 0:00:01
Collecting sqlalchemy>=1.4.2
  Downloading sqlalchemy-2.0.41-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 61.4 MB/s eta 0:00:01
Collecting colorlog
  Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Collecting Mako
  Downloading mako-1.3.10-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 9.0 MB/s  eta 0:00:01
Collecting greenlet>=1
  Downloading greenlet-3.2.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (580 kB)
[K     |████████████████████████████████| 580 kB 72.8 MB/s eta 0:00:01
Installing collected packages: greenlet, sqlalchemy, Mako, colorlog, alembic, optuna
Successfully

In [64]:
import optuna
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import f1_score

In [49]:
train_dataset = TensorDataset(X_tensor, Y_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

X_tensor_test = torch.tensor(x_test.toarray(), dtype=torch.float32)
Y_tensor_test = torch.tensor(y_test_multilabel, dtype=torch.float32)
val_dataset = TensorDataset(X_tensor_test, Y_tensor_test)

In [65]:
def objective(trial):
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-3)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-1)
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.1, 0.5)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32, 64])
    early_stop_patience = trial.suggest_int('early_stop_patience', 2, 6)
    max_length = trial.suggest_categorical('max_length', [32, 64, 128, 256])
    gradient_clip = trial.suggest_uniform('gradient_clip', 0.5, 5.0)
    eps = trial.suggest_loguniform('eps', 1e-10, 1e-6)
    correct_bias = trial.suggest_categorical('correct_bias', [True, False])
    
    focal_gamma = trial.suggest_float('focal_gamma', 0.5, 5.0) 
    pos_weight = trial.suggest_categorical('pos_weight', [None, 1.0, 2.0, 5.0])
        
    model = MultiLabelNN(input_dim=5000, output_dim=y_train_multilabel.shape[1], dropout_rate=dropout_rate)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay, eps=eps)
    
    criterion = FocalLoss(gamma=focal_gamma, pos_weight=pos_weight)
    
    X_tensor = torch.tensor(x_train.toarray(), dtype=torch.float32)
    Y_tensor = torch.tensor(y_train_multilabel, dtype=torch.float32)
    train_dataset = TensorDataset(X_tensor, Y_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    model.train()
    
    for epoch in range(5):
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = focal_criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    model.eval()
    all_val_predictions = []
    all_val_labels = []
    with torch.no_grad():
        for data, labels in val_loader:
            outputs = model(data)
            probabilities = torch.sigmoid(outputs)
            predicted = (probabilities > 0.5).int()
            all_val_predictions.extend(predicted.cpu().numpy())
            all_val_labels.extend(labels.cpu().numpy())

    # Convert lists to numpy arrays for sklearn metrics
    y_true = np.array(all_val_labels)
    y_pred = np.array(all_val_predictions)

    # Choose an appropriate metric to maximize
    # For multi-label with imbalance, macro F1 is often preferred.
    macro_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

    return macro_f1    

In [66]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=3)

[I 2025-07-08 00:13:48,239] A new study created in memory with name: no-name-33fbc557-f52c-4c9f-8241-09c3a4a648bc
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-1)
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.1, 0.5)
  gradient_clip = trial.suggest_uniform('gradient_clip', 0.5, 5.0)
  eps = trial.suggest_loguniform('eps', 1e-10, 1e-6)
[I 2025-07-08 00:14:52,572] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 0.00012239584462084627, 'weight_decay': 0.005034300558684191, 'dropout_rate': 0.21217268470157413, 'batch_size': 64, 'early_stop_patience': 6, 'max_length': 32, 'gradient_clip': 3.8686292419697765, 'eps': 9.744377779783682e-09, 'correct_bias': True, 'focal_gamma': 2.0421382548477567, 'pos_weight': 2.0}. Best is trial 0 with value: 0.0.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-3)
  weight_decay = trial.suggest_loguniform('weight_deca

In [67]:
print("Mejores hiperparámetros:", study.best_params)
print("Mejor valor de la métrica:", study.best_value)

Mejores hiperparámetros: {'learning_rate': 0.00012239584462084627, 'weight_decay': 0.005034300558684191, 'dropout_rate': 0.21217268470157413, 'batch_size': 64, 'early_stop_patience': 6, 'max_length': 32, 'gradient_clip': 3.8686292419697765, 'eps': 9.744377779783682e-09, 'correct_bias': True, 'focal_gamma': 2.0421382548477567, 'pos_weight': 2.0}
Mejor valor de la métrica: 0.0


In [68]:
best_params ={'learning_rate': 0.00012239584462084627, 'weight_decay': 0.005034300558684191, 'dropout_rate': 0.21217268470157413, 'batch_size': 64, 'early_stop_patience': 6, 'max_length': 32, 'gradient_clip': 3.8686292419697765, 'eps': 9.744377779783682e-09, 'correct_bias': True, 'focal_gamma': 2.0421382548477567, 'pos_weight': 2.0}

In [69]:
model = MultiLabelNN(input_dim=5000, output_dim=y_train_multilabel.shape[1], dropout_rate=best_params['dropout_rate'])

In [70]:
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=best_params['learning_rate'],
    weight_decay=best_params['weight_decay'],
    eps=best_params['eps']
)

In [72]:
criterion = FocalLoss(gamma=best_params['focal_gamma'])

In [73]:
train_loader = DataLoader(
    train_dataset, 
    batch_size=best_params['batch_size'], 
    shuffle=True
)

In [74]:
val_loader = DataLoader(
    val_dataset, 
    batch_size=best_params['batch_size'], 
    shuffle=False
)

In [75]:
model.train()
for epoch in range(int(best_params['eps'])+1):
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(
            model.parameters(), 
            best_params['gradient_clip']
        )
        
        optimizer.step()

In [76]:
model.eval()

MultiLabelNN(
  (fc1): Linear(in_features=5000, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=9, bias=True)
  (dropout_rate): Dropout(p=0.21217268470157413, inplace=False)
)

In [77]:
all_predictions = []
all_labels = []
with torch.no_grad():
    for data, labels in val_loader:
        outputs = model(data)
        
        probabilities = torch.sigmoid(outputs)
        predicted = (probabilities > 0.5).int()

        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        
print(classification_report(np.array(all_labels), np.array(all_predictions)))

              precision    recall  f1-score   support

           0       0.98      0.07      0.13      1650
           1       0.00      0.00      0.00       474
           2       0.00      0.00      0.00        98
           3       0.00      0.00      0.00       677
           4       0.00      0.00      0.00       379
           5       0.00      0.00      0.00      1787
           6       0.00      0.00      0.00        83
           7       0.00      0.00      0.00       726
           8       0.00      0.00      0.00       123

   micro avg       0.98      0.02      0.04      5997
   macro avg       0.11      0.01      0.01      5997
weighted avg       0.27      0.02      0.04      5997
 samples avg       0.02      0.02      0.02      5997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
