In [4]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt

In [5]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}

In [6]:
df_train = pl.read_parquet('hf://datasets/Jsevisal/go_emotions_wheel/' + splits['train'])
df_test = pl.read_parquet('hf://datasets/Jsevisal/go_emotions_wheel/' + splits['test'])

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [8]:
def decontract(sentence):
    sentence = re.sub(r"n\'t", " not", sentence)
    sentence = re.sub(r"\'re", " are", sentence)
    sentence = re.sub(r"\'s", " is", sentence)
    sentence = re.sub(r"\'d", " would", sentence)
    sentence = re.sub(r"\'ll", " will", sentence)
    sentence = re.sub(r"\'t", " not", sentence)
    sentence = re.sub(r"\'ve", " have", sentence)
    sentence = re.sub(r"\'m", " am", sentence)
    return sentence

def removePunctuation(sentence):
    sentence = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]',r' ',sentence)
    sentence = sentence.strip()
    sentence = sentence.replace("\n"," ")
    return sentence

def removeNumber(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', '', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def removeStopWords(sentence):
    return stopwords.sub("", sentence)
def stemming(sentence):
    stemmer = SnowballStemmer("english")
    stemmedSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemmedSentence += stem
        stemmedSentence += " "
    stemmedSentence = stemmedSentence.strip()
    return stemmedSentence

In [9]:
x_train, y_train = df_train.select('text').to_series().to_list(), df_train.select('labels').to_series().to_list()
x_test, y_test = df_test.select('text').to_series().to_list(), df_test.select('labels').to_series().to_list()

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

In [12]:
tfidf.fit(x_train)

In [13]:
x_train = tfidf.transform(x_train)
x_test = tfidf.transform(x_test)

In [14]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import cross_val_score

In [15]:
y_train_raw = df_train['labels'].to_list()
y_test_raw = df_test['labels'].to_list()

all_labels_combined = y_train_raw + y_test_raw
all_unique_label_ids = sorted(list(set(item for sublist in all_labels_combined for item in sublist)))

In [16]:
mlb = MultiLabelBinarizer(classes=all_unique_label_ids)
mlb.fit(all_labels_combined)

y_train_multilabel = mlb.transform(y_train_raw)
y_test_multilabel = mlb.transform(y_test_raw)

In [14]:
y_train_multilabel

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [15]:
x_train[0].shape

(1, 605291)

# Random Forest

In [24]:
model = RandomForestClassifier(class_weight='balanced', n_estimators=200, random_state = 42)

In [None]:
model.fit(x_train, y_train_multilabel)

In [20]:
y_pred = model.predict(x_test)

In [24]:
from sklearn.metrics import classification_report

In [22]:
print(classification_report(y_test_multilabel, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.54      0.67      1650
           1       0.55      0.04      0.07       474
           2       0.12      0.04      0.06        98
           3       0.58      0.02      0.04       677
           4       0.91      0.11      0.19       379
           5       0.59      0.46      0.52      1787
           6       0.88      0.08      0.15        83
           7       0.76      0.05      0.10       726
           8       0.64      0.07      0.13       123

   micro avg       0.71      0.31      0.43      5997
   macro avg       0.66      0.16      0.21      5997
weighted avg       0.70      0.31      0.38      5997
 samples avg       0.34      0.32      0.33      5997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Random Forest with Optuna

In [20]:
%pip install optuna



In [21]:
import optuna

In [39]:
def objective(trial):
  params = {
      'n_estimators': trial.suggest_int('n_estimators', 100, 500),
      'max_depth': trial.suggest_int('max_depth', 5, 30),
      'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
      'class_weight': 'balanced'
  }
  model = RandomForestClassifier(**params)
  score = cross_val_score(model, x_train, y_train_multilabel, scoring='f1_micro', cv=3).mean()
  return score

In [40]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 5)

[I 2025-07-08 02:32:48,641] A new study created in memory with name: no-name-50f593c2-d53e-4efb-bf0a-55511c9052dc
[I 2025-07-08 02:43:51,459] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 472, 'max_depth': 22, 'min_samples_split': 5}. Best is trial 0 with value: 0.0.
[I 2025-07-08 02:46:08,774] Trial 1 finished with value: 0.0 and parameters: {'n_estimators': 149, 'max_depth': 15, 'min_samples_split': 6}. Best is trial 0 with value: 0.0.
[I 2025-07-08 02:49:57,658] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 311, 'max_depth': 12, 'min_samples_split': 6}. Best is trial 0 with value: 0.0.
[I 2025-07-08 02:56:44,827] Trial 3 finished with value: 0.0 and parameters: {'n_estimators': 493, 'max_depth': 14, 'min_samples_split': 8}. Best is trial 0 with value: 0.0.
[I 2025-07-08 03:00:56,030] Trial 4 finished with value: 0.0 and parameters: {'n_estimators': 451, 'max_depth': 9, 'min_samples_split': 7}. Best is trial 0 with value: 0.0.


In [41]:
print("Mejores hiperparámetros:", study.best_params)
print("Mejor valor de la métrica:", study.best_value)

Mejores hiperparámetros: {'n_estimators': 472, 'max_depth': 22, 'min_samples_split': 5}
Mejor valor de la métrica: 0.0


In [42]:
best_params = {'n_estimators': 472, 'max_depth': 22, 'min_samples_split': 5}

In [51]:
model = RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], min_samples_split=best_params['min_samples_split'])

In [54]:
model.fit(x_train, y_train_multilabel)

In [55]:
y_pred = model.predict(x_test)

In [58]:
print(classification_report(y_test_multilabel, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1650
           1       0.00      0.00      0.00       474
           2       0.00      0.00      0.00        98
           3       0.00      0.00      0.00       677
           4       0.00      0.00      0.00       379
           5       0.00      0.00      0.00      1787
           6       0.00      0.00      0.00        83
           7       0.00      0.00      0.00       726
           8       0.00      0.00      0.00       123

   micro avg       0.00      0.00      0.00      5997
   macro avg       0.00      0.00      0.00      5997
weighted avg       0.00      0.00      0.00      5997
 samples avg       0.00      0.00      0.00      5997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [59]:
from sklearn.feature_selection import SelectKBest, chi2
selector = SelectKBest(chi2, k=2000)
X_train_selected = selector.fit_transform(x_train, y_train_multilabel)

In [61]:
from sklearn.multioutput import MultiOutputClassifier
model = MultiOutputClassifier(RandomForestClassifier(class_weight='balanced'))

In [None]:
model.fit(x_train, y_train_multilabel)

In [None]:
y_pred = model.predict(x_test)

In [None]:
print(classification_report(y_test_multilabel, y_pred))

# Log Reg

In [17]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [18]:
lreg = LogisticRegression(max_iter=1000, multi_class='multinomial', class_weight = 'balanced')

In [19]:
from sklearn.multiclass import OneVsRestClassifier

In [20]:
model = OneVsRestClassifier(lreg)

In [21]:
model.fit(x_train, y_train_multilabel)



In [22]:
y_pred = model.predict(x_test)

In [25]:
print(classification_report(y_test_multilabel, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.82      0.81      1650
           1       0.27      0.47      0.34       474
           2       0.57      0.58      0.58        98
           3       0.41      0.63      0.49       677
           4       0.46      0.54      0.50       379
           5       0.59      0.63      0.61      1787
           6       0.37      0.49      0.42        83
           7       0.41      0.57      0.48       726
           8       0.44      0.46      0.45       123

   micro avg       0.54      0.65      0.59      5997
   macro avg       0.48      0.58      0.52      5997
weighted avg       0.56      0.65      0.60      5997
 samples avg       0.56      0.66      0.58      5997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
