In [3]:
import polars as pl
import numpy as np
import spacy

In [4]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}

In [5]:
df_train = pl.read_parquet('hf://datasets/Jsevisal/go_emotions_wheel/' + splits['train'])
df_test = pl.read_parquet('hf://datasets/Jsevisal/go_emotions_wheel/' + splits['test'])

In [6]:
nlp = spacy.load('en_core_web_lg')

In [8]:
docs_train = list(nlp.pipe(df_train.select('text').to_series().to_list()))
docs_test = list(nlp.pipe(df_test.select('text').to_series().to_list()))

In [7]:
def preprocess_text(doc):
    return [token.text.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space]

In [8]:
processed_texts_train = [' '.join(preprocess_text(doc)) for doc in docs_train]
processed_texts_test = [' '.join(preprocess_text(doc)) for doc in docs_test]

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

In [10]:
y_train_raw = df_train['labels'].to_list()
y_test_raw = df_test['labels'].to_list()

all_labels_combined = y_train_raw + y_test_raw
all_unique_label_ids = sorted(list(set(item for sublist in all_labels_combined for item in sublist)))
mlb = MultiLabelBinarizer(classes=all_unique_label_ids)
mlb.fit(all_labels_combined)

y_train_multilabel = mlb.transform(y_train_raw)
y_test_multilabel = mlb.transform(y_test_raw)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf = TfidfVectorizer(max_features=5000)

In [13]:
x_train = tfidf.fit_transform(processed_texts_train)
x_test = tfidf.transform(processed_texts_test)

In [84]:
from sklearn.linear_model import LogisticRegression

In [85]:
lreg = LogisticRegression(max_iter=1000, multi_class='multinomial', class_weight = 'balanced')

In [86]:
y_train_multilabel.shape

(43410, 9)

In [20]:
from sklearn.multiclass import OneVsRestClassifier

In [88]:
model = OneVsRestClassifier(lreg)

In [89]:
model.fit(x_train, y_train_multilabel)



In [90]:
y_pred = model.predict(x_test)

In [91]:
from sklearn.metrics import classification_report

In [92]:
print(classification_report(y_test_multilabel, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1650
           1       0.19      0.54      0.28       474
           2       0.34      0.74      0.47        98
           3       0.26      0.57      0.35       677
           4       0.31      0.64      0.42       379
           5       0.53      0.78      0.63      1787
           6       0.17      0.54      0.26        83
           7       0.33      0.63      0.43       726
           8       0.23      0.66      0.34       123

   micro avg       0.42      0.71      0.53      5997
   macro avg       0.35      0.66      0.44      5997
weighted avg       0.49      0.71      0.57      5997
 samples avg       0.49      0.72      0.56      5997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


class weight

In [110]:
from sklearn.utils.class_weight import compute_class_weight

# Calcular los pesos por clase para cada etiqueta
class_weights_list = []
for i in range(y_train_multilabel.shape[1]):
    # Etiquetas binarias para la i-ésima clase
    y_i = y_train_multilabel[:, i]
    classes = np.unique(y_i)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_i)
    class_weights_list.append(dict(zip(classes, weights)))


In [111]:
from sklearn.base import clone

base_estimator = LogisticRegression(max_iter=1000)
estimators = []

for i, class_weight in enumerate(class_weights_list):
    clf = clone(base_estimator)
    clf.set_params(class_weight=class_weight)
    clf.fit(x_train, y_train_multilabel[:, i])
    estimators.append(clf)


In [112]:
y_pred_weighted = np.column_stack([clf.predict(x_test) for clf in estimators])

In [113]:
from sklearn.metrics import classification_report
print(classification_report(y_test_multilabel, y_pred_weighted))


              precision    recall  f1-score   support

           0       0.80      0.80      0.80      1650
           1       0.20      0.56      0.29       474
           2       0.35      0.74      0.48        98
           3       0.27      0.59      0.37       677
           4       0.32      0.64      0.43       379
           5       0.53      0.80      0.64      1787
           6       0.17      0.58      0.26        83
           7       0.32      0.63      0.43       726
           8       0.23      0.67      0.34       123

   micro avg       0.42      0.72      0.53      5997
   macro avg       0.35      0.67      0.45      5997
weighted avg       0.50      0.72      0.57      5997
 samples avg       0.50      0.73      0.56      5997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Weighted Loss Function with pytorch

In [120]:
import torch

# Calcular el número de ocurrencias por clase
counts = np.sum(y_train_multilabel, axis=0)  # shape: (num_classes,)
total = y_train_multilabel.shape[0]

# Evitar divisiones por 0
counts[counts == 0] = 1

# pos_weight formula: inverse frequency
pos_weight = torch.tensor((total - counts) / counts, dtype=torch.float32)


In [121]:
import torch.nn as nn
import torch.nn.functional as F

class MultiLabelNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MultiLabelNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # logits (no sigmoid)


In [122]:
from torch.utils.data import TensorDataset, DataLoader

X_tensor = torch.tensor(x_train.toarray(), dtype=torch.float32)
Y_tensor = torch.tensor(y_train_multilabel, dtype=torch.float32)

train_dataset = TensorDataset(X_tensor, Y_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [127]:
model = MultiLabelNN(input_dim=5000, output_dim=y_train_multilabel.shape[1])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()
for epoch in range(35):
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch} | Loss: {loss.item():.4f}")


Epoch 0 | Loss: 0.6476
Epoch 1 | Loss: 0.7809
Epoch 2 | Loss: 0.4431
Epoch 3 | Loss: 1.3318
Epoch 4 | Loss: 0.3474
Epoch 5 | Loss: 0.3087
Epoch 6 | Loss: 0.4719
Epoch 7 | Loss: 0.3746
Epoch 8 | Loss: 0.3212
Epoch 9 | Loss: 0.3668
Epoch 10 | Loss: 0.2963
Epoch 11 | Loss: 0.2472
Epoch 12 | Loss: 0.1878
Epoch 13 | Loss: 0.2575
Epoch 14 | Loss: 0.1519
Epoch 15 | Loss: 0.1541
Epoch 16 | Loss: 0.1967
Epoch 17 | Loss: 0.0991
Epoch 18 | Loss: 0.0627
Epoch 19 | Loss: 0.0570
Epoch 20 | Loss: 0.0767
Epoch 21 | Loss: 0.1150
Epoch 22 | Loss: 0.0551
Epoch 23 | Loss: 0.0728
Epoch 24 | Loss: 0.0153
Epoch 25 | Loss: 0.0317
Epoch 26 | Loss: 0.0140
Epoch 27 | Loss: 0.0116
Epoch 28 | Loss: 0.0289
Epoch 29 | Loss: 0.0044
Epoch 30 | Loss: 0.0395
Epoch 31 | Loss: 0.0525
Epoch 32 | Loss: 0.0413
Epoch 33 | Loss: 0.0768
Epoch 34 | Loss: 0.0352


In [128]:
model.eval()
with torch.no_grad():
    test_tensor = torch.tensor(x_test.toarray(), dtype=torch.float32).to(device)
    logits = model(test_tensor)
    probs = torch.sigmoid(logits).cpu().numpy()
    preds = (probs >= 0.5).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test_multilabel, preds))


              precision    recall  f1-score   support

           0       0.77      0.73      0.75      1650
           1       0.25      0.27      0.26       474
           2       0.57      0.50      0.53        98
           3       0.30      0.34      0.32       677
           4       0.41      0.47      0.44       379
           5       0.52      0.57      0.55      1787
           6       0.29      0.22      0.25        83
           7       0.40      0.33      0.36       726
           8       0.40      0.35      0.37       123

   micro avg       0.51      0.52      0.52      5997
   macro avg       0.43      0.42      0.43      5997
weighted avg       0.52      0.52      0.52      5997
 samples avg       0.48      0.53      0.49      5997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


focal loss

In [129]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, gamma=2, pos_weight=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduction = reduction
        self.pos_weight = pos_weight

    def forward(self, inputs, targets):
        # BCE with logits
        bce_loss = F.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none', pos_weight=self.pos_weight
        )
        # Probabilities
        probs = torch.sigmoid(inputs)
        pt = torch.where(targets == 1, probs, 1 - probs)
        focal_weight = (1 - pt) ** self.gamma

        loss = focal_weight * bce_loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        return loss


In [130]:
focal_criterion = FocalLoss(gamma=2, pos_weight=pos_weight.to(device)).to(device)

In [131]:
for epoch in range(35):  # o el número que decidas
    model.train()
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = focal_criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")


Epoch 1 | Loss: 0.0140
Epoch 2 | Loss: 0.0070
Epoch 3 | Loss: 0.0047
Epoch 4 | Loss: 0.0085
Epoch 5 | Loss: 0.0000
Epoch 6 | Loss: 0.0161
Epoch 7 | Loss: 0.0258
Epoch 8 | Loss: 0.0156
Epoch 9 | Loss: 0.0413
Epoch 10 | Loss: 0.0163
Epoch 11 | Loss: 0.0354
Epoch 12 | Loss: 0.0773
Epoch 13 | Loss: 0.0074
Epoch 14 | Loss: 0.0260
Epoch 15 | Loss: 0.0090
Epoch 16 | Loss: 0.0492
Epoch 17 | Loss: 0.0168
Epoch 18 | Loss: 0.0129
Epoch 19 | Loss: 0.0090
Epoch 20 | Loss: 0.0145
Epoch 21 | Loss: 0.0002
Epoch 22 | Loss: 0.0163
Epoch 23 | Loss: 0.0001
Epoch 24 | Loss: 0.0239
Epoch 25 | Loss: 0.0424
Epoch 26 | Loss: 0.0527
Epoch 27 | Loss: 0.0108
Epoch 28 | Loss: 0.0180
Epoch 29 | Loss: 0.0001
Epoch 30 | Loss: 0.0149
Epoch 31 | Loss: 0.0264
Epoch 32 | Loss: 0.0066
Epoch 33 | Loss: 0.0100
Epoch 34 | Loss: 0.0457
Epoch 35 | Loss: 0.0027


In [132]:
model.eval()
with torch.no_grad():
    test_tensor = torch.tensor(x_test.toarray(), dtype=torch.float32).to(device)
    logits = model(test_tensor)
    probs = torch.sigmoid(logits).cpu().numpy()
    preds = (probs >= 0.5).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test_multilabel, preds))


              precision    recall  f1-score   support

           0       0.74      0.75      0.74      1650
           1       0.24      0.24      0.24       474
           2       0.41      0.50      0.45        98
           3       0.29      0.30      0.29       677
           4       0.42      0.42      0.42       379
           5       0.52      0.56      0.54      1787
           6       0.21      0.19      0.20        83
           7       0.42      0.31      0.36       726
           8       0.40      0.33      0.36       123

   micro avg       0.51      0.51      0.51      5997
   macro avg       0.41      0.40      0.40      5997
weighted avg       0.51      0.51      0.51      5997
 samples avg       0.46      0.52      0.47      5997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


update modeldef improved_preprocess_text(doc):
    return [
        token.lemma_.lower() for token in doc 
        if not token.is_stop 
        and not token.is_punct 
        and not token.is_space
        and token.is_alpha  # Solo palabras alfabéticas
        and len(token.text) > 2  # Eliminar palabras muy cortas
    ]

In [7]:
def improved_preprocess_text(doc):
    return [
        token.lemma_.lower() for token in doc 
        if not token.is_stop 
        and not token.is_punct 
        and not token.is_space
        and token.is_alpha  # Solo palabras alfabéticas
        and len(token.text) > 2  # Eliminar palabras muy cortas
    ]

In [9]:
processed_texts_train = [' '.join(improved_preprocess_text(doc)) for doc in docs_train]
processed_texts_test = [' '.join(improved_preprocess_text(doc)) for doc in docs_test]

In [13]:
y_train_raw = df_train['labels'].to_list()
y_test_raw = df_test['labels'].to_list()

all_labels_combined = y_train_raw + y_test_raw
all_unique_label_ids = sorted(list(set(item for sublist in all_labels_combined for item in sublist)))
mlb = MultiLabelBinarizer(classes=all_unique_label_ids)
mlb.fit(all_labels_combined)

y_train_multilabel = mlb.transform(y_train_raw)
y_test_multilabel = mlb.transform(y_test_raw)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),  # Incluir bigramas
    min_df=3,  # Ignorar términos muy raros
    max_df=0.8  # Ignorar términos muy comunes
)

In [16]:
x_train = tfidf.fit_transform(processed_texts_train)
x_test = tfidf.transform(processed_texts_test)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced_subsample',  # Ponderación por clase
    max_depth=10,
    n_jobs=-1
)

model = MultiOutputClassifier(rf)
model.fit(x_train, y_train_multilabel)

In [18]:
from sklearn.calibration import calibration_curve

# Encontrar umbrales óptimos por clase
optimal_thresholds = []
for i in range(y_train_multilabel.shape[1]):
    prob_true, prob_pred = calibration_curve(
        y_train_multilabel[:, i], 
        model.predict_proba(x_train)[i][:, 1],
        n_bins=10
    )
    optimal_thresholds.append(prob_pred[np.argmax(prob_true >= 0.5)])

# Aplicar umbrales óptimos
y_probs = model.predict_proba(x_test)
y_pred_optimal = np.array([
    (y_probs[i][:, 1] > optimal_thresholds[i]).astype(int) 
    for i in range(len(optimal_thresholds))
]).T

In [35]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
# Crear múltiples modelos con diferentes enfoques
model1 = OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=1000))
model2 = MultiOutputClassifier(RandomForestClassifier(class_weight='balanced_subsample'))
model3 = MultiOutputClassifier(HistGradientBoostingClassifier())


# Ensemble voting
ensemble = VotingClassifier(
    estimators=[
        ('lr', model1),
        ('rf', model2),
        ('gb', model3)
    ],
    voting='soft'
)

In [37]:
multi_label_ensemble = MultiOutputClassifier(ensemble)


In [None]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

# Binarizar las etiquetas multilabel
mlb = MultiLabelBinarizer()
mlb.fit(all_labels_combined)

y_train_multilabel = mlb.transform(y_train_raw)  # 2D array
y_test_multilabel = mlb.transform(y_test_raw)    # 2D array

# Clasificadores base
clf1 = LogisticRegression(class_weight='balanced', max_iter=1000)
clf2 = RandomForestClassifier(class_weight='balanced_subsample')
clf3 = HistGradientBoostingClassifier()

# Voting ensemble para una sola etiqueta
voting_clf = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('rf', clf2),
        ('gb', clf3)
    ],
    voting='soft'
)

# MultiOutputClassifier se encargará de entrenar un VotingClassifier por etiqueta
multi_label_ensemble = MultiOutputClassifier(voting_clf)
if hasattr(x_train, 'toarray'):
    x_train = x_train.toarray()
    x_test = x_test.toarray()
# Entrenar
multi_label_ensemble.fit(x_train, y_train_multilabel)

# Predecir
y_pred = multi_label_ensemble.predict(x_test)

# Evaluar
print(classification_report(y_test_multilabel, y_pred, target_names=mlb.classes_))


In [33]:
y_pred = multi_label_ensemble.predict(X_test)


NameError: name 'multi_label_ensemble' is not defined