In [69]:
import transformers
from datasets import Dataset, DatasetDict, ClassLabel
import pandas as pd
import numpy as np
import evaluate
import torch
from transformers import pipeline

In [70]:
df = pd.read_parquet("../data/cleaned/isear_cleaned.parquet")
df.head(3)

Unnamed: 0,text,emotion
0,when a boy tried to fool me so he would be ok ...,anger
1,i felt anger when i saw that i was being misle...,anger
2,once a friend had pushed me and i had fallen o...,anger


In [71]:
# Convert the pandas DataFrame into a Dataset
df_dict = Dataset.from_pandas(df)
df_dict = df_dict.class_encode_column("emotion")


# Realizar la primera división estratificada por la columna 'emotion' para obtener el conjunto de train y test+valid
train_testvalid = df_dict.train_test_split(
    test_size=0.2, stratify_by_column="emotion", seed=46
)

# Dividir el conjunto de test+valid en validación y prueba (50% validación, 50% prueba)
test_valid = train_testvalid["test"].train_test_split(
    test_size=0.5, stratify_by_column="emotion", seed=46
)

# Crear el DatasetDict con train, validation y test
dataset_dict = DatasetDict(
    {
        "train": train_testvalid["train"],
        "validation": test_valid["train"],
        "test": test_valid["test"],
    }
)

# Remover la columna '__index_level_0__' que no es necesaria
dataset_dict = dataset_dict.remove_columns(["__index_level_0__"])

# Verificar el resultado
dataset_dict


Casting to class labels:   0%|          | 0/7534 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'emotion'],
        num_rows: 6027
    })
    validation: Dataset({
        features: ['text', 'emotion'],
        num_rows: 753
    })
    test: Dataset({
        features: ['text', 'emotion'],
        num_rows: 754
    })
})

In [72]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)


def preprocess_function(examples, func_tokenizer):
    return func_tokenizer(examples["text"], truncation=True)


tokenized_dataset = dataset_dict.map(
    preprocess_function,
    batched=True,
    # num_proc=20,
    fn_kwargs={"func_tokenizer": tokenizer},
)

tokenized_dataset = tokenized_dataset.rename_column("emotion", "label")
print(tokenized_dataset)

Map:   0%|          | 0/6027 [00:00<?, ? examples/s]

Map:   0%|          | 0/753 [00:00<?, ? examples/s]

Map:   0%|          | 0/754 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 6027
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 753
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 754
    })
})


In [73]:
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification

id2label = {
    0: "anger",
    1: "disgust",
    2: "fear",
    3: "guilt",
    4: "joy",
    5: "sadness",
    6: "shame",
}
label2id = {
    "anger": 0,
    "disgust": 1,
    "fear": 2,
    "guilt": 3,
    "joy": 4,
    "sadness": 5,
    "shame": 6,
}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=7, id2label=id2label, label2id=label2id
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
training_args = transformers.TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    num_train_epochs=20,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
)



In [75]:
recall = evaluate.load("recall")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    pre = precision.compute(
        predictions=predictions, references=labels, average="weighted"
    )["precision"]
    rec = recall.compute(
        predictions=predictions, references=labels, average="weighted"
    )["recall"]
    f1_score = f1.compute(
        predictions=predictions, references=labels, average="weighted"
    )["f1"]
    acc = accuracy.compute(predictions=predictions, references=labels)["accuracy"]

    return {"precision": pre, "recall": rec, "f1": f1_score, "accuracy": acc}


trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.21211,0.660411,0.623342,0.611819,0.623342
2,No log,0.940745,0.679804,0.679045,0.675059,0.679045
3,No log,0.861924,0.709336,0.706897,0.707321,0.706897
4,No log,0.863319,0.723961,0.717507,0.718522,0.717507
5,No log,0.847826,0.724313,0.720159,0.72044,0.720159
6,0.923500,0.877021,0.728411,0.720159,0.720292,0.720159
7,0.923500,0.913028,0.726986,0.724138,0.723103,0.724138
8,0.923500,0.948107,0.717772,0.712202,0.712689,0.712202
9,0.923500,1.036212,0.707482,0.70557,0.703256,0.70557
10,0.923500,1.050014,0.718512,0.718833,0.717487,0.718833


TrainOutput(global_step=1900, training_loss=0.3340263527318051, metrics={'train_runtime': 638.6378, 'train_samples_per_second': 188.745, 'train_steps_per_second': 2.975, 'total_flos': 1902658917438270.0, 'train_loss': 0.3340263527318051, 'epoch': 20.0})

In [79]:
trainer.evaluate()

{'eval_loss': 0.8478260040283203,
 'eval_precision': 0.7243129801351008,
 'eval_recall': 0.7201591511936339,
 'eval_f1': 0.7204401267960703,
 'eval_accuracy': 0.7201591511936339,
 'eval_runtime': 2.1809,
 'eval_samples_per_second': 345.721,
 'eval_steps_per_second': 5.502,
 'epoch': 20.0}

In [80]:
trainer.save_model("models/distilbert-base-uncased-finetuned")

In [81]:
# trainer.save_model("./my_model")
text = "You shouldnt said that. I hate you!"
classifier = pipeline(
    "text-classification",
    model="models/distilbert-base-uncased-finetuned/",
    device="cuda",
)
print(classifier(text))

[{'label': 'guilt', 'score': 0.5378169417381287}]
