In [1]:
#Descargamos la libreria de transformers que usaremos para descargar el modelo como su respectivo tokenizador
!pip  install transformers --quiet
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import Trainer
!pip install Datasets --quiet
from datasets import DatasetDict, Dataset
!pip install transformers[torch] --quiet
!pip install accelerate -U --quiet
!pip install transformers[torch]
#Installamos esta librería para el preprocesamiento necesario que requieré el modelo RoBertTuito:
!pip install pysentimiento --quiet
from pysentimiento.preprocessing import preprocess_tweet



In [2]:
# Importamos las dependencias necesarias :
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from google.colab import drive  #Montador de drive
from sklearn.model_selection import train_test_split
import gc

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 2. Importamos el conjunto de datos que vamos a usar para nuestro problema

In [3]:
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/Datasets a limpio /EXIST 2021 dataset_esp.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
df.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2
0,EXIST2021,1,twitter,es,Nadie te va a tratar tan bien como un hombre q...,sexist,sexual-violence
1,EXIST2021,2,twitter,es,"@lindagisela74 Que rica putita obediente, afor...",sexist,stereotyping-dominance
2,EXIST2021,3,twitter,es,@BicireporteraDF Yo lo hice a los 18 años por ...,non-sexist,non-sexist
3,EXIST2021,4,twitter,es,las cosas q sueño son indicios de que yo enrea...,non-sexist,non-sexist
4,EXIST2021,5,twitter,es,"Pero a la niña le gustó desde que lo vió, así ...",non-sexist,non-sexist


In [5]:
# 1. Cambiamos el nombre de las columnas e eliminamos aquellas que no necesitamos:
columns_to_remove = ['test_case', 'id', 'source','language','task1']
df = df.rename(columns = {"task2": "label"}).drop(columns=columns_to_remove, axis=1)
df.head()

Unnamed: 0,text,label
0,Nadie te va a tratar tan bien como un hombre q...,sexual-violence
1,"@lindagisela74 Que rica putita obediente, afor...",stereotyping-dominance
2,@BicireporteraDF Yo lo hice a los 18 años por ...,non-sexist
3,las cosas q sueño son indicios de que yo enrea...,non-sexist
4,"Pero a la niña le gustó desde que lo vió, así ...",non-sexist


In [6]:
#2. Cambiamos los valores nominales de sexista y no sexista a valores numéricos
df['label'] = df['label'].replace(['non-sexist','sexual-violence', 'stereotyping-dominance', 'misogyny-non-sexual-violence', 'ideological-inequality','objectification'],[0, 1, 2 , 3, 4, 5])
df.head()

Unnamed: 0,text,label
0,Nadie te va a tratar tan bien como un hombre q...,1
1,"@lindagisela74 Que rica putita obediente, afor...",2
2,@BicireporteraDF Yo lo hice a los 18 años por ...,0
3,las cosas q sueño son indicios de que yo enrea...,0
4,"Pero a la niña le gustó desde que lo vió, así ...",0


In [7]:
# 3. Eliminamos cualquier fila que haya podido quedar en blanco:
df = df.dropna()

In [8]:
from pysentimiento.preprocessing import preprocess_tweet

In [9]:
df['text'].apply(preprocess_tweet)

0       Nadie te va a tratar tan bien como un hombre q...
1       @usuario Que rica putita obediente, afortunado...
2       @usuario Yo lo hice a los 18 años por la carre...
3       las cosas q sueño son indicios de que yo enrea...
4       Pero a la niña le gustó desde que lo vió, así ...
                              ...                        
5696    @usuario Se llama nota de corte, y es lo que d...
5697    @usuario Osea todo atack of titan parte de una...
5698    @usuario Cuéntame más!!Es por androcentrismo? ...
5699     Que duro es ser tan atractiva como Jaba de Hutt.
5700    @usuario A Pablo es que ya no le hacen caso en...
Name: text, Length: 5701, dtype: object

In [10]:
df.head()

Unnamed: 0,text,label
0,Nadie te va a tratar tan bien como un hombre q...,1
1,"@lindagisela74 Que rica putita obediente, afor...",2
2,@BicireporteraDF Yo lo hice a los 18 años por ...,0
3,las cosas q sueño son indicios de que yo enrea...,0
4,"Pero a la niña le gustó desde que lo vió, así ...",0


In [11]:
# 4. Dividiremos el dataset en el 80% para el entrenamiento, el 10% para el proceso de validation, y un 10% para testear los resultados del modelo.
train_df, valtest_df = train_test_split(df, test_size = 0.2, random_state = 42)
val_df, test_df = train_test_split(valtest_df, test_size = 0.5, random_state = 42)
train_df.shape, val_df.shape, test_df.shape

((4560, 2), (570, 2), (571, 2))

In [12]:
train = Dataset.from_pandas(train_df)
validation = Dataset.from_pandas(val_df)
test = Dataset.from_pandas(test_df)

In [13]:
train

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 4560
})



```
# Funciones para entrenar el modelo
```



In [14]:
# Define a function to train a model v2
class TrainModel:
    """

    Attributes:
      tokenizer:
      modelo:
    """

    def __init__(self, Modelo):
        # Check if execution will be performed on cuda:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(Modelo)
        self.modelo = AutoModelForSequenceClassification.from_pretrained(Modelo, num_labels=6).to(device)

    def tokenizador(self, batch):
        return self.tokenizer(batch["text"], padding=True, max_length=128, truncation=True)

    def compute_metrics(self, pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        f1 = f1_score(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1}

    def train(self, train, val):
        # Prepare the dataset:
        Tweets_Dataset = DatasetDict({'train': train, 'val': val, 'test': test})

        # Apply tokenization to the entire dataset using the map function:
        #Tweets_Dataset = Tweets_Dataset.remove_columns(["__index_level_0__"])
        Tweets_Encoded = Tweets_Dataset.map(self.tokenizador, batched=True, batch_size=None)


        # Ensure objects are of torch type
        Tweets_Encoded.set_format("torch", columns=["label", "input_ids", "attention_mask"])

        # Definimos los hiperparametros de cada modelo
        if self.modelo == 'sdadas/xlm-roberta-large-twitter':
            learning_rate = 2e-5
            per_device_train_batch_size = 16
            num_train_epochs=2
        elif self.modelo == 'Twitter/twhin-bert-base':
            learning_rate = 2e-5
            per_device_train_batch_size = 8
            num_train_epochs=3
        else:
            learning_rate = 2e-5
            per_device_train_batch_size = 16
            num_train_epochs=4

        training_args = TrainingArguments(
              output_dir='./results',
              num_train_epochs= num_train_epochs,
              learning_rate = learning_rate,
              per_device_train_batch_size= per_device_train_batch_size,
              per_device_eval_batch_size=8,
              warmup_steps=500,
              weight_decay=0.01,
              evaluation_strategy="epoch",
              logging_dir='./logs',
              )

        # Create the Trainer and train the model
        trainer = Trainer(
            model=self.modelo,
            args=training_args,
            compute_metrics=self.compute_metrics,
            train_dataset=Tweets_Encoded['train'],
            eval_dataset=Tweets_Encoded['val'],
        )
        trainer.train()

        return self.modelo, self.tokenizer

In [15]:
train = train.remove_columns(["__index_level_0__"])
validation = validation.remove_columns(["__index_level_0__"])
test = test.remove_columns(["__index_level_0__"])

In [16]:
train = Dataset.from_pandas(train_df)
validation = Dataset.from_pandas(val_df)
test = Dataset.from_pandas(test_df)

In [17]:
sdadas_trainer = TrainModel('sdadas/xlm-roberta-large-twitter')
twhin_trainer = TrainModel('Twitter/twhin-bert-base')
robertuito_trainer = TrainModel('pysentimiento/robertuito-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [18]:
Modelo1, tokenizador1 = sdadas_trainer.train(train, validation)
Modelo2, tokenizador2 = twhin_trainer.train(train, validation)
Modelo3, tokenizador3 = robertuito_trainer.train(train, validation)

Map:   0%|          | 0/4560 [00:00<?, ? examples/s]

Map:   0%|          | 0/570 [00:00<?, ? examples/s]

Map:   0%|          | 0/571 [00:00<?, ? examples/s]



{'eval_loss': 0.9257956147193909, 'eval_accuracy': 0.6701754385964912, 'eval_f1': 0.6666343978380339, 'eval_runtime': 14.1797, 'eval_samples_per_second': 40.198, 'eval_steps_per_second': 5.078, 'epoch': 1.0}
{'loss': 1.1415, 'grad_norm': 19.385597229003906, 'learning_rate': 2e-05, 'epoch': 1.7543859649122808}
{'eval_loss': 0.7466591596603394, 'eval_accuracy': 0.7280701754385965, 'eval_f1': 0.7223564431906153, 'eval_runtime': 14.0756, 'eval_samples_per_second': 40.496, 'eval_steps_per_second': 5.115, 'epoch': 2.0}
{'eval_loss': 0.7836528420448303, 'eval_accuracy': 0.7403508771929824, 'eval_f1': 0.7376485641519102, 'eval_runtime': 14.153, 'eval_samples_per_second': 40.274, 'eval_steps_per_second': 5.087, 'epoch': 3.0}
{'loss': 0.4908, 'grad_norm': 8.045537948608398, 'learning_rate': 4.3750000000000005e-06, 'epoch': 3.5087719298245617}
{'eval_loss': 0.9376837611198425, 'eval_accuracy': 0.743859649122807, 'eval_f1': 0.7423651752282984, 'eval_runtime': 14.1233, 'eval_samples_per_second': 40

Map:   0%|          | 0/4560 [00:00<?, ? examples/s]

Map:   0%|          | 0/570 [00:00<?, ? examples/s]

Map:   0%|          | 0/571 [00:00<?, ? examples/s]



{'eval_loss': 1.2703285217285156, 'eval_accuracy': 0.5070175438596491, 'eval_f1': 0.4503540779321628, 'eval_runtime': 4.5321, 'eval_samples_per_second': 125.768, 'eval_steps_per_second': 15.887, 'epoch': 1.0}
{'loss': 1.2712, 'grad_norm': 8.881034851074219, 'learning_rate': 2e-05, 'epoch': 1.7543859649122808}
{'eval_loss': 0.912324845790863, 'eval_accuracy': 0.6614035087719298, 'eval_f1': 0.6512871781745142, 'eval_runtime': 4.8198, 'eval_samples_per_second': 118.262, 'eval_steps_per_second': 14.938, 'epoch': 2.0}
{'eval_loss': 0.8667500019073486, 'eval_accuracy': 0.6929824561403509, 'eval_f1': 0.691787278095, 'eval_runtime': 4.4987, 'eval_samples_per_second': 126.703, 'eval_steps_per_second': 16.005, 'epoch': 3.0}
{'loss': 0.7475, 'grad_norm': 14.246040344238281, 'learning_rate': 4.3750000000000005e-06, 'epoch': 3.5087719298245617}
{'eval_loss': 0.8892265558242798, 'eval_accuracy': 0.7, 'eval_f1': 0.7012567027602489, 'eval_runtime': 4.4518, 'eval_samples_per_second': 128.039, 'eval_ste

Map:   0%|          | 0/4560 [00:00<?, ? examples/s]

Map:   0%|          | 0/570 [00:00<?, ? examples/s]

Map:   0%|          | 0/571 [00:00<?, ? examples/s]



{'eval_loss': 1.1991662979125977, 'eval_accuracy': 0.5350877192982456, 'eval_f1': 0.4319000798534546, 'eval_runtime': 4.0162, 'eval_samples_per_second': 141.926, 'eval_steps_per_second': 17.928, 'epoch': 1.0}
{'loss': 1.2923, 'grad_norm': 9.497695922851562, 'learning_rate': 2e-05, 'epoch': 1.7543859649122808}
{'eval_loss': 0.8342231512069702, 'eval_accuracy': 0.6929824561403509, 'eval_f1': 0.6874142646222234, 'eval_runtime': 4.1218, 'eval_samples_per_second': 138.289, 'eval_steps_per_second': 17.468, 'epoch': 2.0}
{'eval_loss': 0.7949350476264954, 'eval_accuracy': 0.7105263157894737, 'eval_f1': 0.7077522730764589, 'eval_runtime': 4.0167, 'eval_samples_per_second': 141.908, 'eval_steps_per_second': 17.925, 'epoch': 3.0}
{'loss': 0.6338, 'grad_norm': 6.869324684143066, 'learning_rate': 4.3750000000000005e-06, 'epoch': 3.5087719298245617}
{'eval_loss': 0.8184494376182556, 'eval_accuracy': 0.7140350877192982, 'eval_f1': 0.7108312069915662, 'eval_runtime': 4.0293, 'eval_samples_per_second':

In [19]:
modelos = [Modelo1,Modelo2,Modelo3]
tokenizadores = [tokenizador1,tokenizador2,tokenizador3]

In [21]:
def majority_voting(models, tokenizers, texts):
    # Initialize a list to store the majority vote for each text
    majority_votes = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Iterate over each text
    for text in texts:
        # Get the predictions from each model
        predictions = []
        for model, tokenizer in zip(models, tokenizers):
            # Tokenize the input text
            inputs = tokenizer(text, return_tensors='pt', padding=True, max_length=128, truncation=True)

            # Move inputs to the correct device
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

            with torch.no_grad():
                model_logits = model(**inputs).logits
                _, preds = torch.max(model_logits, dim=1)
                predictions.append(preds)

        # Perform majority voting
        predictions = torch.stack(predictions).to(device)  # Ensure predictions tensor is on the correct device
        majority_vote, _ = torch.mode(predictions, dim=0)

        # Append the majority vote to the list
        majority_votes.append(majority_vote.item())

    return majority_votes

In [22]:
predictions = majority_voting(modelos, tokenizadores, validation['text'])

In [23]:
def compute_metrics(true_labels, predictions):
    # Compute the accuracy
    accuracy = accuracy_score(true_labels, predictions)

    # Compute the F1 score
    f1 = f1_score(true_labels, predictions, average='weighted')

    return accuracy, f1

In [24]:
accuracy , f1 = compute_metrics(validation['label'],predictions)

In [25]:
accuracy

0.7543859649122807

In [26]:
f1

0.7512968844677963

In [None]:
 Define a function to train a model v2
class TrainModel:
    """

    Attributes:
      tokenizer:
      modelo:
    """

    def __init__(self, Modelo):
        # Check if execution will be performed on cuda:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(Modelo)
        self.modelo = AutoModelForSequenceClassification.from_pretrained(Modelo, num_labels=2).to(device)

    def tokenizador(self, batch):
        return self.tokenizer(batch["text"], padding=True, max_length=128, truncation=True)

    def compute_metrics(self, pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        f1 = f1_score(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1}

    def train(self, train, val):
        # Prepare the dataset:
        Tweets_Dataset = DatasetDict({'train': train, 'val': val, 'test': test})

        # Apply tokenization to the entire dataset using the map function:
        Tweets_Dataset = Tweets_Dataset.remove_columns(["__index_level_0__"])
        Tweets_Encoded = Tweets_Dataset.map(self.tokenizador, batched=True, batch_size=None)

        # Ensure objects are of torch type
        Tweets_Encoded.set_format("torch", columns=["label", "input_ids", "attention_mask"])

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=2,
            learning_rate = 2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            logging_dir='./logs',
        )

        # Create the Trainer and train the model
        trainer = Trainer(
            model=self.modelo,
            args=training_args,
            compute_metrics=self.compute_metrics,
            train_dataset=Tweets_Encoded['train'],
            eval_dataset=Tweets_Encoded['val'],
        )
        trainer.train()

        return self.modelo, self.tokenizer