In [8]:
!pip install evaluate




# Datasets

- https://www.kaggle.com/datasets/aadyasingh55/twitter-emotion-classification-dataset
- https://www.kaggle.com/datasets/pashupatigupta/emotion-detection-from-text

In [9]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import torch.nn as nn
import numpy as np
import evaluate
import torch


# 1. Load both datasets and merge them into one frame with columns `text` and `label`.
# df1 carries its labels as text, df2 as numeric ids, so df2 is decoded through emotion_map.
emotion_map = dict(enumerate(['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']))

df1 = (
    pd.read_csv('tweet_emotions.csv')  # columns: tweet_id, sentiment, content
    .rename(columns={'content': 'text', 'sentiment': 'label'})
)

df2 = pd.read_parquet('train-00000-of-00001.parquet')  # columns: text, label
df2['label'] = df2['label'].map(emotion_map)

# Stack the two sources, keeping only the shared columns and a fresh 0..n-1 index
shared_columns = ['text', 'label']
df = pd.concat([df1[shared_columns], df2[shared_columns]], ignore_index=True)



In [10]:
# Keep a reproducible 10% random subsample (fixed seed) — presumably to shorten training.
# Note: this rebinds `df`, so re-running the cell subsamples the already-reduced frame again.
df = (
    df
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

In [11]:


# 2. Target label set; rows with any other emotion are dropped.
# NOTE(review): emotion_map (used to decode df2) has no 'disgust' or 'neutral' entry, so
# those classes can only come from df1 — confirm they are actually present in the data.
etiquetas_validas = ['joy', 'anger', 'sadness', 'disgust', 'fear', 'neutral', 'surprise']

# 3. Label <-> id lookup tables (ids follow the order of etiquetas_validas)
label2id = {lbl: i for i, lbl in enumerate(etiquetas_validas)}
id2label = {i: lbl for lbl, i in label2id.items()}

# Build a derived frame instead of overwriting `df`: if the text labels in `df` were replaced
# by ids, re-running this cell would filter ids against label names and end up empty.
df_encoded = (
    df[df['label'].isin(etiquetas_validas)]
    .reset_index(drop=True)
    .assign(label=lambda frame: frame['label'].map(label2id))
)
assert not df_encoded.empty, "no rows left after filtering to etiquetas_validas"

# 4. Convert to a HuggingFace Dataset and hold out 20% for evaluation (seeded split)
dataset = Dataset.from_pandas(df_encoded)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# 5. Load the multilingual tokenizer and classifier (XLM-RoBERTa).
# The label maps are handed to from_pretrained so the config carries the readable
# class names from the moment the model is built.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(etiquetas_validas),
    id2label=id2label,
    label2id=label2id,
)

# 6. Tokenization
def preprocess(examples):
    """Tokenize the `text` field of a batch, padding/truncating every entry to 128 tokens."""
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples['text'], **tokenizer_options)

# batched=True passes groups of examples to preprocess in a single call
tokenized = dataset.map(preprocess, batched=True)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/31741 [00:00<?, ? examples/s]

Map:   0%|          | 0/7936 [00:00<?, ? examples/s]

In [12]:

# 7. Metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Return the accuracy metric for a (logits, labels) evaluation pair."""
    scores, references = eval_pred
    # The predicted class is the index of the highest logit along the last axis
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

# 8. Training hyperparameters
training_args = TrainingArguments(
    output_dir="./emotion_xlmr",
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; at the end reload the checkpoint
    # selected by the "accuracy" metric
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# 9. Trainer
# `processing_class` is the current name of the argument formerly called `tokenizer`;
# the old keyword is deprecated and emits a warning when the Trainer is constructed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# 10. Fine-tune
trainer.train()

# 11. Save the model and its tokenizer side by side in the same folder
save_dir = "./emotion_xlmr"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2606,0.191738,0.936492
2,0.1512,0.139803,0.942414
3,0.1042,0.128958,0.946195


RuntimeError: Expected all tensors to be on the same device, but got index is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA__index_select)

In [23]:
# English label name -> Spanish display name
label2spanish = dict(
    joy='alegría',
    anger='ira',
    sadness='tristeza',
    disgust='disgusto',
    fear='miedo',
    neutral='neutral',
    surprise='sorpresa',
)

# Attach the translation table to the model config as an extra attribute
model.config.label2spanish = label2spanish


In [24]:
# Write the model (with its current config) and the tokenizer to the output folder again
export_dir = "./emotion_xlmr"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


('./emotion_xlmr/tokenizer_config.json',
 './emotion_xlmr/special_tokens_map.json',
 './emotion_xlmr/sentencepiece.bpe.model',
 './emotion_xlmr/added_tokens.json',
 './emotion_xlmr/tokenizer.json')

In [22]:

# 12. Inference
test_text = "Estoy feliz con mi trabajo"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# eval() switches off dropout, so the prediction does not depend on whichever
# train/eval mode the model was left in by earlier cells.
model.to(device)
model.eval()

# Tokenize, then move every input tensor to the same device as the model
inputs = tokenizer(test_text, return_tensors="pt", padding=True, truncation=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

pred = logits.argmax(dim=1).item()
print(test_text)
# Resolve the class name through the model's own config rather than a notebook global
print("Predicción:", model.config.id2label[pred])


Estoy feliz con mi trabajo
Predicción: joy


In [21]:
# 12. Inference
test_text = "Estoy muy molesta"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# eval() switches off dropout, so the prediction does not depend on whichever
# train/eval mode the model was left in by earlier cells.
model.to(device)
model.eval()

# Tokenize, then move every input tensor to the same device as the model
inputs = tokenizer(test_text, return_tensors="pt", padding=True, truncation=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

pred = logits.argmax(dim=1).item()
print(test_text)
# Resolve the class name through the model's own config rather than a notebook global
print("Predicción:", model.config.id2label[pred])


Estoy muy molesta
Predicción: anger


In [25]:
# Uso del modelo
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("./emotion_xlmr")
tokenizer = AutoTokenizer.from_pretrained("./emotion_xlmr")

label2spanish = model.config.label2spanish


The tokenizer you are loading from './emotion_xlmr' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


In [28]:

test_text = "Estoy muy molesta"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Put the model on the target device in eval mode (dropout disabled) before predicting
model.to(device)
model.eval()

# Tokenize, then move every input tensor to the same device as the model
inputs = tokenizer(test_text, return_tensors="pt", padding=True, truncation=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

pred = logits.argmax(dim=1).item()
print(test_text)

# Both lookups go through the reloaded model's config, so this cell works with only the
# saved folder and does not need the `id2label` global created by the training cells.
pred_eng = model.config.id2label[pred]
pred_es = model.config.label2spanish[pred_eng]

print("Predicción:", pred_es)


Estoy muy molesta
Predicción: ira


In [29]:
!zip -r emotion_xlmr.zip emotion_xlmr


  adding: emotion_xlmr/ (stored 0%)
  adding: emotion_xlmr/sentencepiece.bpe.model (deflated 49%)
  adding: emotion_xlmr/special_tokens_map.json (deflated 52%)
  adding: emotion_xlmr/config.json (deflated 56%)
  adding: emotion_xlmr/tokenizer.json (deflated 76%)
  adding: emotion_xlmr/training_args.bin (deflated 53%)
  adding: emotion_xlmr/model.safetensors (deflated 25%)
  adding: emotion_xlmr/checkpoint-1984/ (stored 0%)
  adding: emotion_xlmr/checkpoint-1984/trainer_state.json (deflated 60%)
  adding: emotion_xlmr/checkpoint-1984/rng_state.pth (deflated 26%)
  adding: emotion_xlmr/checkpoint-1984/sentencepiece.bpe.model (deflated 49%)
  adding: emotion_xlmr/checkpoint-1984/special_tokens_map.json (deflated 52%)
  adding: emotion_xlmr/checkpoint-1984/scheduler.pt (deflated 61%)
  adding: emotion_xlmr/checkpoint-1984/config.json (deflated 53%)
  adding: emotion_xlmr/checkpoint-1984/tokenizer.json (deflated 76%)
  adding: emotion_xlmr/checkpoint-1984/training_args.bin (deflated 53%)
  

In [30]:
# Send the archive to the browser as a download (uses the google.colab helper module)
from google.colab import files

archive_path = "emotion_xlmr.zip"
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>