In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at the path below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Install Hugging Face `datasets` with %pip so the package lands in the kernel's
# own environment (a bare `!pip` may target a different interpreter).
# The recorded run of this notebook resolved datasets 2.20.0.
%pip install -q datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m19.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (6

In [None]:
# Install the modelling stack with %pip so it lands in the kernel's own environment.
# The recorded run of this notebook resolved pysentimiento 0.7.3, accelerate 0.31.0
# and evaluate 0.4.2.
%pip install -q pysentimiento transformers accelerate evaluate

Collecting pysentimiento
  Downloading pysentimiento-0.7.3-py3-none-any.whl (39 kB)
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting emoji>=1.6.1 (from pysentimiento)
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch!=2.0.1,>=2.0.0->pysentimiento)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch!=2.0.1,>=2.0.0->pysentimiento)
  Using cached nvidia_cuda_

In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from pysentimiento.preprocessing import preprocess_tweet

# Load the labelled CSV into a DataFrame.
file_path = "/sentiment_analysis_dataset.csv"  # Replace with the correct path to your CSV file
df = pd.read_csv(file_path)

# Inspect the distinct sentiment labels present in the data.
unique_sentiments = df["sentiment"].unique()
print(f"Sentimientos únicos en el dataset: {unique_sentiments}")

# Map every sentiment name to an integer class id. The ids follow the order in
# which the labels first appear in the CSV, so the mapping is stable for a given file.
sentiment_to_idx = {sentiment: idx for idx, sentiment in enumerate(unique_sentiments)}
df["sentiment_idx"] = df["sentiment"].map(sentiment_to_idx)
print(f"Mapeo de sentimientos a índices: {sentiment_to_idx}")

# Convert the DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Normalise the raw tweets with pysentimiento's Spanish preprocessing.
preprocessed_ds = dataset.map(lambda ex: {"text": preprocess_tweet(ex["text"], lang="es")})

# Tokenize the dataset. Padding is deferred (padding=False) so that it can be
# applied per batch later on.
tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")

tokenized_ds = preprocessed_ds.map(
    lambda batch: tokenizer(batch["text"], padding=False, truncation=True),
    batched=True, batch_size=32
)

# Split into training (80%) and validation (20%) sets. The seed is fixed so that
# the same examples end up in each split on every run; without it the validation
# set changes between runs and metrics are not comparable.
SPLIT_SEED = 42
train_test_split = tokenized_ds.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_ds = train_test_split["train"]
val_ds = train_test_split["test"]

# Attach the integer class id to every example.
def add_labels(example):
    """Store the class id of ``example['sentiment']`` under ``example['labels']``.

    The id is looked up in the module-level ``sentiment_to_idx`` mapping
    (keyed on 'sentiment', not 'emotion'). The example is updated in place
    and returned.
    """
    sentiment_name = example['sentiment']
    example['labels'] = sentiment_to_idx[sentiment_name]
    return example

# Add the 'labels' column to both splits.
train_ds = train_ds.map(add_labels)
val_ds = val_ds.map(add_labels)

# Collect the sentiment names that actually occur in each split.
train_sentiments = set(train_ds["sentiment"])
val_sentiments = set(val_ds["sentiment"])

# NOTE: these are Python sets, so the order they print in is arbitrary and is
# NOT the class-id order; the authoritative mapping is sentiment_to_idx.
print(f"Sentimientos en el conjunto de entrenamiento: {train_sentiments}")
print(f"Sentimientos en el conjunto de validación: {val_sentiments}")

# Make sure every sentiment found in the splits has a class id.
assert train_sentiments.issubset(sentiment_to_idx.keys()), "Hay sentimientos en el conjunto de entrenamiento que no están en sentiment_to_idx"
assert val_sentiments.issubset(sentiment_to_idx.keys()), "Hay sentimientos en el conjunto de validación que no están en sentiment_to_idx"

# Load the model with one output per sentiment. The label names are stored in
# the model config (id2label / label2id) so that a checkpoint written with
# save_pretrained carries its own class-id -> name mapping instead of generic
# LABEL_n placeholders. Keys/values are cast to plain str/int so the config
# stays JSON-serialisable.
num_labels = len(sentiment_to_idx)
label2id = {str(sentiment): int(idx) for sentiment, idx in sentiment_to_idx.items()}
id2label = {idx: sentiment for sentiment, idx in label2id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Training configuration: evaluate once per epoch for 10 epochs.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    per_device_train_batch_size=32,
    output_dir="./results",
    do_eval=True,
    evaluation_strategy="epoch",
    num_train_epochs=10,
    logging_dir='./logs',
)

# Metric function handed to the Trainer.
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class of each row
            is the argmax of the logits over the last axis.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, predicted_ids)
    # The fourth element (support) is not reported.
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_ids, average='weighted'
    )
    return dict(accuracy=acc, f1=f1, precision=precision, recall=recall)

# Build the Trainer. Tokenization was done with padding=False, so padding is
# left to the data collator when batches are assembled.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=padding_collator,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

# Fine-tune the model; kept as the last expression so the cell displays the TrainOutput.
trainer.train()


Sentimientos únicos en el dataset: ['scared' 'mad' 'sad' 'peaceful' 'powerful' 'joyful']
Mapeo de sentimientos a índices: {'scared': 0, 'mad': 1, 'sad': 2, 'peaceful': 3, 'powerful': 4, 'joyful': 5}


Map:   0%|          | 0/2590 [00:00<?, ? examples/s]

Map:   0%|          | 0/2590 [00:00<?, ? examples/s]

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Sentimientos en el conjunto de entrenamiento: {'sad', 'powerful', 'scared', 'peaceful', 'joyful', 'mad'}
Sentimientos en el conjunto de validación: {'powerful', 'sad', 'scared', 'peaceful', 'joyful', 'mad'}




{'eval_loss': 1.2220956087112427, 'eval_accuracy': 0.528957528957529, 'eval_f1': 0.5193608948910898, 'eval_precision': 0.6368372197453831, 'eval_recall': 0.528957528957529, 'eval_runtime': 3.0797, 'eval_samples_per_second': 168.196, 'eval_steps_per_second': 21.106, 'epoch': 1.0}
{'eval_loss': 0.7287326455116272, 'eval_accuracy': 0.747104247104247, 'eval_f1': 0.7389520507236682, 'eval_precision': 0.7736092909582474, 'eval_recall': 0.747104247104247, 'eval_runtime': 2.2868, 'eval_samples_per_second': 226.515, 'eval_steps_per_second': 28.424, 'epoch': 2.0}
{'eval_loss': 0.7528887391090393, 'eval_accuracy': 0.7644787644787645, 'eval_f1': 0.7647747844136856, 'eval_precision': 0.7697422160623094, 'eval_recall': 0.7644787644787645, 'eval_runtime': 3.3105, 'eval_samples_per_second': 156.472, 'eval_steps_per_second': 19.634, 'epoch': 3.0}
{'eval_loss': 0.7713905572891235, 'eval_accuracy': 0.7876447876447876, 'eval_f1': 0.7889378414094056, 'eval_precision': 0.8013733517603621, 'eval_recall': 0.7

TrainOutput(global_step=650, training_loss=0.4078755686833308, metrics={'train_runtime': 354.9689, 'train_samples_per_second': 58.371, 'train_steps_per_second': 1.831, 'train_loss': 0.4078755686833308, 'epoch': 10.0})

In [None]:
# Write the fine-tuned model and the tokenizer files into the same directory.
model_save_path = "./trained_model"
model.save_pretrained(model_save_path)
# Kept as the final expression: the cell displays the tuple of tokenizer file paths it returns.
tokenizer.save_pretrained(model_save_path)

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.txt',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

In [None]:
# torch is already pulled in as a dependency of pysentimiento, so this is
# normally a no-op; %pip keeps the install in the kernel's own environment.
%pip install -q torch



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from pysentimiento.preprocessing import preprocess_tweet

# Reload the fine-tuned checkpoint and its tokenizer from disk.
model_save_path = "./trained_model"
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

# Class-id mapping used during training. It MUST be the mapping printed by the
# training cell ("Mapeo de sentimientos a índices"), i.e. the order in which the
# labels first appear in the CSV. The order of the printed *sets* of sentiments
# is arbitrary and must not be used: with a mismatched mapping the model's
# output ids are decoded to the wrong emotion names.
sentiment_to_idx = {'scared': 0, 'mad': 1, 'sad': 2, 'peaceful': 3, 'powerful': 4, 'joyful': 5}
idx_to_emotion = {idx: emotion for emotion, idx in sentiment_to_idx.items()}

# Guard against loading a checkpoint trained with a different label set.
assert model.config.num_labels == len(sentiment_to_idx), (
    f"Checkpoint has {model.config.num_labels} labels but the mapping defines {len(sentiment_to_idx)}"
)

def predict_emotion(text):
    """Return the emotion name predicted for a single piece of text.

    The text is preprocessed with ``preprocess_tweet`` (Spanish), tokenized,
    run through the module-level ``model`` without gradient tracking, and the
    highest-scoring class id is translated via ``idx_to_emotion``.
    """
    cleaned = preprocess_tweet(text, lang="es")
    encoded = tokenizer(cleaned, return_tensors="pt", padding=True, truncation=True)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    best_idx = torch.argmax(logits, dim=1).item()
    return idx_to_emotion[best_idx]

# Smoke-test the classifier on one sample sentence and print the predicted label.
query = "Puta seleccion de mierda"
predicted_emotion = predict_emotion(query)
print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: powerful


In [None]:
# Score the trained model on the validation split (val_ds).
eval_result = trainer.evaluate(eval_dataset=val_ds)

# Print one "metric: value" line per entry of the returned metrics dict.
print("Results on validation dataset:")
print("\n".join(f"{key}: {value}" for key, value in eval_result.items()))


{'eval_loss': 1.115962028503418, 'eval_accuracy': 0.7007722007722008, 'eval_f1': 0.6981548514346175, 'eval_precision': 0.710242597248353, 'eval_recall': 0.7007722007722008, 'eval_runtime': 3.4063, 'eval_samples_per_second': 152.072, 'eval_steps_per_second': 19.082, 'epoch': 10.0}
Results on validation dataset:
eval_loss: 1.115962028503418
eval_accuracy: 0.7007722007722008
eval_f1: 0.6981548514346175
eval_precision: 0.710242597248353
eval_recall: 0.7007722007722008
eval_runtime: 3.4063
eval_samples_per_second: 152.072
eval_steps_per_second: 19.082
epoch: 10.0
