# Análisis de sentimientos con distintas versiones de BERT

## Load

In [1]:
! pip install datasets
! pip install evaluate

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [3]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
def preprocess_text(filepath):
    labels, texts = [], []
    with open(filepath) as f:
        for line in f.readlines():
            test_text = line.split("\t")
            labels.append(test_text[1])
            texts.append(test_text[2][:-1])
    return labels, texts

In [5]:
train_y, train_x = preprocess_text(
    "/content/drive/MyDrive/Proyecto LC/ficheros/training.txt"
)
test_y, test_x = preprocess_text("/content/drive/MyDrive/Proyecto LC/ficheros/test.txt")
val_y, val_x = preprocess_text(
    "/content/drive/MyDrive/Proyecto LC/ficheros/development.txt"
)

In [6]:
label2id = {"NEU": 3, "NONE": 2, "P": 1, "N": 0}
id2label = {0: "N", 1: "P", 2: "NONE", 3: "NEU"}


train_y = [label2id[label] for label in train_y]
val_y = [label2id[label] for label in val_y]

In [7]:
print(f"Train: {train_x[0]}, {train_y[0]}")
print(f"Val: {val_x[0]}, {val_y[0]}")

Train: -Me caes muy bien  -Tienes que jugar más partidas al lol con Russel y conmigo -Por qué tan Otako, deja de ser otako -Haber si me muero, 2
Val: @noseashetero 1000/10 de verdad a ti que voy a decir petarda que te quiero más que a mí mismo  ✨, 1


In [8]:
len(train_x), len(train_y), len(val_x), len(val_y)

(1008, 1008, 506, 506)

## Preprocess


In [9]:
! pip install emoji

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/586.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [10]:
import emoji
import re

In [11]:
def preprocess(text):
    text = emoji.demojize(text, language="es")
    text = re.sub(r"http\S+|www.\S+", "", text)  # Elimina URLs
    text = re.sub(r"@\w+", "", text)  # Elimina menciones

    return text

In [12]:
train_x = [preprocess(text) for text in train_x]
val_x = [preprocess(text) for text in val_x]
test_p = [preprocess(text) for text in test_x]

In [13]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


# Calcular los class weights
class_weights = compute_class_weight(
    class_weight="balanced", classes=np.unique(train_y), y=train_y
)

# Convertir los pesos en un diccionario donde las claves son las clases y los valores son los pesos
class_weight_dict = dict(zip(np.unique(train_y), class_weights))

print(class_weight_dict)

{0: 0.6028708133971292, 1: 0.7924528301886793, 2: 1.8129496402877698, 3: 1.894736842105263}


## Train


### BERT-base en español

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import evaluate
import numpy as np

In [15]:
# model_name="distilbert/distilbert-base-uncased" # F1 0.51
# model_name='distilbert/distilbert-base-multilingual-cased'
# model_name="PlanTL-GOB-ES/roberta-large-bne" #  F1:0.626(lr2e-5) -- 0.651 (lr1e-5)
model_name = "dccuchile/bert-base-spanish-wwm-cased"  # F1: 0.633 -- 0.607 (lr1e-5)
# model_name='vinai/bertweet-base' # 0.4564 (lr1e-5)

In [16]:
ds_tr = Dataset.from_dict({"text": train_x, "label": train_y})
ds_v = Dataset.from_dict({"text": val_x, "label": val_y})
ds_te = Dataset.from_dict({"text": test_x, "label": test_y})
ds = DatasetDict({"train": ds_tr, "validation": ds_v, "test": ds_te})

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(data):
    return tokenizer(data["text"], truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]



In [18]:
tokenized_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/1008 [00:00<?, ? examples/s]

Map:   0%|          | 0/506 [00:00<?, ? examples/s]

Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

In [19]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [None]:

# Cargamos la métrica F1
f1 = evaluate.load("f1")


def compute_metrics(p):
    predictions, labels = p
    # Convierte las predicciones en etiquetas
    predictions = np.argmax(predictions, axis=1)
    # Calcula F1 (para clasificación multiclase, usamos average='weighted')
    return f1.compute(predictions=predictions, references=labels, average="weighted")

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [21]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=4, id2label=id2label, label2id=label2id, from_pt=True
)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_ds["train"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_ds["validation"],
    shuffle=False,
    batch_size=32,
    collate_fn=data_collator,
)

In [23]:
from transformers import create_optimizer
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
import keras

batch_size = 32
num_epochs = 10
batches_per_epoch = len(tokenized_ds["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
    init_lr=5e-6,
    num_warmup_steps=int(total_train_steps * 0.1),
    num_train_steps=total_train_steps,
)
model.compile(optimizer=optimizer)

In [24]:
import numpy as np
from transformers import create_optimizer
import tensorflow as tf

optimizer, schedule = create_optimizer(
    init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps
)
model.compile(optimizer=optimizer)

# Variables para seguimiento del mejor modelo
best_f1_score = 0
best_weights = None
patience = 3
epochs_without_improvement = 0

for epoch in range(10):
    print(f"\nEpoch {epoch + 1}/{10}")

    # Entrenamiento en cada época
    model.fit(
        x=tf_train_set,
        epochs=1,
        batch_size=32,
        validation_data=tf_validation_set,
        class_weight=class_weight_dict,
    )

    # Evaluación en el conjunto de validación
    y_pred = model.predict(tf_validation_set)
    y_pred_classes = np.argmax(y_pred.logits, axis=1)
    val_f1_score = f1.compute(
        predictions=y_pred_classes, references=val_y, average="weighted"
    )["f1"]

    print(f"F1 Score en Validación: {val_f1_score}")

    # Verificación de mejora
    if val_f1_score > best_f1_score:
        best_f1_score = val_f1_score
        best_weights = model.get_weights()  # Guarda los pesos actuales
        epochs_without_improvement = 0  # Reinicia la paciencia
        print("Mejora en el F1, guardando el modelo.")
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print("No hubo mejora durante varias épocas. Parando el entrenamiento.")
            break

# Restaurar los mejores pesos al final del entrenamiento
if best_weights is not None:
    model.set_weights(best_weights)
    print("Restauración de los mejores pesos completada.")


Epoch 1/10
F1 Score en Validación: 0.28186125339529766
Mejora en el F1, guardando el modelo.

Epoch 2/10
F1 Score en Validación: 0.621941334777994
Mejora en el F1, guardando el modelo.

Epoch 3/10
F1 Score en Validación: 0.6236591538872721
Mejora en el F1, guardando el modelo.

Epoch 4/10
F1 Score en Validación: 0.63664573737977
Mejora en el F1, guardando el modelo.

Epoch 5/10
F1 Score en Validación: 0.6416327602455473
Mejora en el F1, guardando el modelo.

Epoch 6/10
F1 Score en Validación: 0.645302797975335
Mejora en el F1, guardando el modelo.

Epoch 7/10
F1 Score en Validación: 0.6269965937766916

Epoch 8/10
F1 Score en Validación: 0.6308424633193771

Epoch 9/10
F1 Score en Validación: 0.6421798330314589
No hubo mejora durante varias épocas. Parando el entrenamiento.
Restauración de los mejores pesos completada.


In [25]:
y_pred = model.predict(tf_validation_set)

# Convierte las probabilidades en etiquetas de clase (multiclase)
y_pred_classes = np.argmax(y_pred.logits, axis=1)

# Calcula el F1 ponderado
f1.compute(predictions=y_pred_classes, references=val_y, average="weighted")



{'f1': 0.645302797975335}

### RoBERT

In [26]:
# model_name="distilbert/distilbert-base-uncased" # F1 0.51
# model_name='distilbert/distilbert-base-multilingual-cased'
model_name = "PlanTL-GOB-ES/roberta-large-bne"  #  F1:0.626(lr2e-5) -- 0.651 (lr1e-5)
# model_name="dccuchile/bert-base-spanish-wwm-cased" # F1: 0.633 -- 0.607 (lr1e-5)
# model_name='vinai/bertweet-base' # 0.4564 (lr1e-5)

In [27]:
ds_tr = Dataset.from_dict({"text": train_x, "label": train_y})
ds_v = Dataset.from_dict({"text": val_x, "label": val_y})
ds_te = Dataset.from_dict({"text": test_x, "label": test_y})
ds = DatasetDict({"train": ds_tr, "validation": ds_v, "test": ds_te})

In [28]:
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(data):
    return tokenizer(data["text"], truncation=True)

tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/858k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/516k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]



In [29]:
tokenized_ds = ds.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

Map:   0%|          | 0/1008 [00:00<?, ? examples/s]

Map:   0%|          | 0/506 [00:00<?, ? examples/s]

Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

In [30]:
import evaluate
import numpy as np

# Cargamos la métrica F1
f1 = evaluate.load("f1")


def compute_metrics(p):
    predictions, labels = p
    # Convierte las predicciones en etiquetas
    predictions = np.argmax(predictions, axis=1)
    # Calcula F1 (para clasificación multiclase, usamos average='weighted')
    return f1.compute(predictions=predictions, references=labels, average="weighted")

In [31]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=4, id2label=id2label, label2id=label2id, from_pt=True
)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [32]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_ds["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_ds["validation"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [33]:
from transformers import create_optimizer
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
import keras

batch_size = 16
num_epochs = 10
batches_per_epoch = len(tokenized_ds["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
    init_lr=5e-6,
    num_warmup_steps=int(total_train_steps * 0.1),
    num_train_steps=total_train_steps,
)
model.compile(optimizer=optimizer)

In [None]:

from transformers import create_optimizer
import tensorflow as tf

optimizer, schedule = create_optimizer(
    init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps
)
model.compile(optimizer=optimizer)

# Variables para seguimiento del mejor modelo
best_f1_score = 0
best_weights = None
patience = 3
epochs_without_improvement = 0

for epoch in range(10):
    print(f"\nEpoch {epoch + 1}/{10}")

    # Entrenamiento en cada época
    model.fit(
        x=tf_train_set,
        epochs=1,
        batch_size=32,
        validation_data=tf_validation_set,
        class_weight=class_weight_dict,
    )

    # Evaluación en el conjunto de validación
    y_pred = model.predict(tf_validation_set)
    y_pred_classes = np.argmax(y_pred.logits, axis=1)
    val_f1_score = f1.compute(
        predictions=y_pred_classes, references=val_y, average="weighted"
    )["f1"]

    print(f"F1 Score en Validación: {val_f1_score}")

    # Verificación de mejora
    if val_f1_score > best_f1_score:
        best_f1_score = val_f1_score
        best_weights = model.get_weights()  # Guarda los pesos actuales
        epochs_without_improvement = 0  # Reinicia la paciencia
        print("Mejora en el F1, guardando el modelo.")
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print("No hubo mejora durante varias épocas. Parando el entrenamiento.")
            break

# Restaurar los mejores pesos al final del entrenamiento
if best_weights is not None:
    model.set_weights(best_weights)
    print("Restauración de los mejores pesos completada.")


Epoch 1/10
F1 Score en Validación: 0.6295466034312599
Mejora en el F1, guardando el modelo.

Epoch 2/10
F1 Score en Validación: 0.6297759234267826
Mejora en el F1, guardando el modelo.

Epoch 3/10
F1 Score en Validación: 0.6219766033629605

Epoch 4/10
F1 Score en Validación: 0.6269056084320803

Epoch 5/10
F1 Score en Validación: 0.626829525417316
No hubo mejora durante varias épocas. Parando el entrenamiento.
Restauración de los mejores pesos completada.


In [38]:
y_pred = model.predict(tf_validation_set)

# Convierte las probabilidades en etiquetas de clase (multiclase)
y_pred_classes = np.argmax(y_pred.logits, axis=1)

# Calcula el F1 ponderado
f1.compute(predictions=y_pred_classes, references=val_y, average="weighted")



{'f1': 0.6297759234267826}

## Outputs

In [36]:
import numpy as np
from collections import Counter

test_identifiers = []
with open("/content/drive/MyDrive/Proyecto LC/ficheros/test.txt", "r") as f:
    for line in f:
        parts = line.split("\t")
        test_identifiers.append(parts[0])

test_encodings = tokenizer(
    test_x, truncation=True, padding=True, max_length=512, return_tensors="tf"
)
y_test_pred = model.predict(test_encodings)

# Obtener las clases predichas
y_test_pred_classes = np.argmax(y_test_pred.logits, axis=1)
print("Predicciones de clases (primeros 10):", y_test_pred_classes[:10])

# Convertir predicciones a etiquetas
id_to_label = {v: k for k, v in label2id.items()}
y_test_labels = [id_to_label[pred] for pred in y_test_pred_classes]

# Mostrar el conteo de etiquetas
print("Conteo de etiquetas:", Counter(y_test_labels))

# Guardar el resultado en Drive
output_path = "/content/drive/MyDrive/Proyecto LC/ficheros/resultado.txt"
with open(output_path, "w") as f:
    for tweet_id, label in zip(test_identifiers, y_test_labels):
        f.write(f"{tweet_id}\t{label}\n")

print(f"Archivo {output_path} generado con las predicciones correctamente.")

Predicciones de clases (primeros 10): [3 1 1 1 0 1 0 0 2 0]
Conteo de etiquetas: Counter({'N': 777, 'P': 699, 'NEU': 226, 'NONE': 197})
Archivo /content/drive/MyDrive/Proyecto LC/ficheros/resultado.txt generado con las predicciones correctamente.
