# ROBERTA fallacy classification


In [1]:
!pip install datasets
!pip install transformers==4.17

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0

In [17]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split


In [18]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1_macro = f1_score(labels, predictions, average="macro")
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1": f1
    }

In [20]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=6)

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/dfe8f1ad04cb25b61a647e3d13620f9bf0a0f51d277897b232a5735297134132.024cc07195c0ba0b51d4f80061c6115996ff26233f3d04788855b23cdf13fbd5
loading configuratio

In [21]:
def tokenize_function(examples):
    return tokenizer(examples["Texto"], padding="max_length", truncation=True, max_length=128)

train_df = pd.read_csv("/content/drive/MyDrive/UPV master/HAIA/train_afc.csv")

train_df_split, val_df_split = train_test_split(train_df, test_size=0.2, stratify=train_df['Etiqueta'], random_state=42)

train_df_split = train_df_split.rename(columns={"Etiqueta": "labels"})
val_df_split = val_df_split.rename(columns={"Etiqueta": "labels"})

train_dataset = Dataset.from_pandas(train_df_split)
val_dataset = Dataset.from_pandas(val_df_split)


tokenized_train = train_dataset.map(tokenize_function)
tokenized_valid = val_dataset.map(tokenize_function)
tokenized_train.set_format("torch")
tokenized_valid.set_format("torch")

Map:   0%|          | 0/982 [00:00<?, ? examples/s]

Map:   0%|          | 0/246 [00:00<?, ? examples/s]

In [22]:
for name, param in model.named_parameters():
    if any(f"roberta.encoder.layer.{i}." in name for i in range(8, 12)) or "classifier" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Solo mantiene el mejor modelo
    load_best_model_at_end=True,  # Carga el mejor modelo después del entrenamiento
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    num_train_epochs=20,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 982
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2460
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1
1,No log,1.147068,0.617886,0.127303,0.471953
2,No log,1.003883,0.621951,0.213049,0.546149
3,No log,0.990137,0.642276,0.32801,0.616453
4,No log,0.95883,0.695122,0.442154,0.678715
5,0.902200,1.039236,0.707317,0.473922,0.68111
6,0.902200,1.16434,0.695122,0.501824,0.682962
7,0.902200,1.310313,0.691057,0.484488,0.675505
8,0.902200,1.463288,0.658537,0.484628,0.656878
9,0.276400,1.530928,0.703252,0.523805,0.69312
10,0.276400,1.592306,0.703252,0.445754,0.674531


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 246
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-123
Configuration saved in ./results/checkpoint-123/config.json
Model weights saved in ./results/checkpoint-123/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-123/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-123/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-616] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not 

TrainOutput(global_step=2460, training_loss=0.27016127826721686, metrics={'train_runtime': 370.3148, 'train_samples_per_second': 53.036, 'train_steps_per_second': 6.643, 'total_flos': 1291921678725120.0, 'train_loss': 0.27016127826721686, 'epoch': 20.0})

In [23]:
from datasets import concatenate_datasets

# Concatenar los datasets de entrenamiento y validación
full_train_dataset = concatenate_datasets([tokenized_train, tokenized_valid])

In [24]:

# Asegúrate de que el modelo es el mejor modelo obtenido al final del entrenamiento
model = trainer.model

# Reajustar los parámetros de entrenamiento, si lo deseas
final_training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Solo mantiene el mejor modelo
    load_best_model_at_end=True,  # Carga el mejor modelo después del entrenamiento
    metric_for_best_model="f1_macro",  # Métrica para la selección del mejor modelo
    greater_is_better=True,
    num_train_epochs=5,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",  # Coseno de descenso para la tasa de aprendizaje
)

# Crear un nuevo Trainer usando el conjunto de datos combinado
final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=full_train_dataset,  # Usar el conjunto combinado de train y val
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset = tokenized_valid
)

# Entrenar el modelo con todo el conjunto de datos
final_trainer.train()


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1228
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 770
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1
1,No log,0.694945,0.821138,0.617268,0.805262
2,No log,0.457888,0.902439,0.799687,0.894577
3,No log,0.272589,0.930894,0.865339,0.928887
4,0.249800,0.231658,0.930894,0.83804,0.925798
5,0.249800,0.215822,0.934959,0.856232,0.930727


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 246
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-154
Configuration saved in ./results/checkpoint-154/config.json
Model weights saved in ./results/checkpoint-154/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-154/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-154/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-1107] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not

TrainOutput(global_step=770, training_loss=0.1963884279325411, metrics={'train_runtime': 110.0562, 'train_samples_per_second': 55.79, 'train_steps_per_second': 6.996, 'total_flos': 403889974917120.0, 'train_loss': 0.1963884279325411, 'epoch': 5.0})

In [25]:
# Paso 1: Cargar el conjunto de test desde un archivo CSV
test_df = pd.read_csv("/content/drive/MyDrive/UPV master/HAIA/test_afc.csv")

# Paso 2: Renombrar las columnas
test_df = test_df.rename(columns={"Etiqueta": "labels"})  # Si la columna en test también se llama "Etiqueta"

# Paso 3: Convertir a un Dataset de Hugging Face
test_dataset = Dataset.from_pandas(test_df)

# Paso 4: Aplicar la función de tokenización al conjunto de test
tokenized_test = test_dataset.map(tokenize_function)

# Paso 5: Establecer el formato para usarlo con PyTorch
tokenized_test.set_format("torch")

# Ahora puedes usar tokenized_test como entrada para tu modelo.

Map:   0%|          | 0/2160 [00:00<?, ? examples/s]

In [26]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Asegurarse de que el modelo está en el dispositivo correcto
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Paso 1: Preparar los tensores para el DataLoader
# Convertir a tensores y evitar el warning
input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()

# Comprobar si token_type_ids están presentes
if 'token_type_ids' in tokenized_test:
    token_type_ids = torch.tensor(tokenized_test['token_type_ids']).clone().detach()
    dataset = TensorDataset(input_ids, attention_mask, token_type_ids)
else:
    dataset = TensorDataset(input_ids, attention_mask)

# Paso 2: Crear DataLoader
dataloader = DataLoader(dataset, batch_size=8)

# Paso 3: Inferencia loop
model.eval()
predictions = []

with torch.no_grad():
    for batch in dataloader:
        # Mover los tensores al dispositivo (GPU o CPU)
        if 'token_type_ids' in batch:
            input_ids_batch, attention_mask_batch, token_type_ids_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch,
                token_type_ids=token_type_ids_batch
            )
        else:
            input_ids_batch, attention_mask_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch
            )

        # Obtener las predicciones
        logits = outputs.logits
        batch_preds = torch.argmax(logits, dim=-1).cpu().numpy()  # Predecir la clase con mayor probabilidad
        predictions.extend(batch_preds)

# Paso 4: Guardar predicciones en el DataFrame
test_df['predicted_label'] = predictions


  input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
  attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()


In [27]:
test_df

Unnamed: 0,Texto,labels,predicted_label
0,We got to take a look at what I was left when ...,,0
1,We had an economy that was in free fall.,,0
2,The pandemic was so badly handled.,,0
3,Many people were dying.,,0
4,"All he said was, it's not that serious.",,2
...,...,...,...
2155,She gave a lot of it away to the Taliban.,,0
2156,She gave it to Afghanistan.,,0
2157,What these people have done to our country and...,,0
2158,Many of them are criminals and they're destroy...,,0


In [29]:
# Ver el conteo de cada etiqueta en las predicciones
label_counts = test_df['predicted_label'].value_counts()

# Mostrar los resultados
print(label_counts)

predicted_label
0    1264
2     526
1     224
3      56
4      50
5      40
Name: count, dtype: int64


In [30]:
test_df.to_csv("predicciones.csv", index=False)