# RoBERTa fallacy detection


In [1]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.metrics import accuracy_score, f1_score
from transformers import EvalPrediction

def compute_metrics(eval_pred: EvalPrediction):
    # Extraer predicciones y etiquetas reales del objeto EvalPrediction
    logits, labels = eval_pred

    # Convertir los logits en predicciones (usamos argmax para clasificaciÃ³n)
    predictions = logits.argmax(axis=-1)

    # Calcular las mÃ©tricas
    f1 = f1_score(labels, predictions, average="weighted")
    f1_class_0 = f1_score(labels, predictions, pos_label=0, average="binary")
    f1_class_1 = f1_score(labels, predictions, pos_label=1, average="binary")
    accuracy = accuracy_score(labels, predictions)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "f1_class_0": f1_class_0,
        "f1_class_1": f1_class_1
    }


In [3]:
from transformers import AutoModel, AutoTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("data/train_afd.csv")

train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["Etiqueta"], random_state=42)
train_df = train_df.rename(columns={"Etiqueta": "labels"})
val_df = val_df.rename(columns={"Etiqueta": "labels"})

train_df["labels"] = train_df["labels"].astype(int)
val_df["labels"] = val_df["labels"].astype(int)

def tokenize_function(examples):
    return tokenizer(examples["Texto"], padding="max_length", truncation=True, max_length=128)

train_dataset = Dataset.from_pandas(train_df).map(tokenize_function)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_function)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_train = train_dataset.map(tokenize_function)
tokenized_valid = val_dataset.map(tokenize_function)
tokenized_train.set_format("torch")
tokenized_valid.set_format("torch")

Map:   0%|          | 0/13694 [00:00<?, ? examples/s]

Map:   0%|          | 0/3424 [00:00<?, ? examples/s]

Map:   0%|          | 0/13694 [00:00<?, ? examples/s]

Map:   0%|          | 0/3424 [00:00<?, ? examples/s]

In [6]:
for name, param in model.named_parameters():
    if any(f"roberta.encoder.layer.{i}." in name for i in range(8, 12)) or "classifier" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Solo mantiene el mejor modelo
    load_best_model_at_end=True,  # Carga el mejor modelo despuÃ©s del entrenamiento
    metric_for_best_model="f1_class_1",
    greater_is_better=True,
    num_train_epochs=10,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Class 0,F1 Class 1
1,0.3386,0.284539,0.908294,0.865212,0.951929,0.006329
2,0.3515,0.356775,0.908879,0.866071,0.952235,0.012658
3,0.3037,0.31831,0.910631,0.886741,0.952543,0.235
4,0.2546,0.396105,0.903329,0.892044,0.947849,0.339321
5,0.2272,0.541519,0.905666,0.890851,0.949365,0.311301
6,0.1749,0.746537,0.900117,0.887864,0.946159,0.310484
7,0.1245,0.816412,0.892523,0.88181,0.941883,0.286822
8,0.0992,0.950344,0.886682,0.880623,0.938276,0.309609
9,0.0722,1.001355,0.889895,0.881801,0.940225,0.303142
10,0.0487,1.023264,0.88639,0.879363,0.938205,0.296564


TrainOutput(global_step=17120, training_loss=0.19903237814101105, metrics={'train_runtime': 4914.0884, 'train_samples_per_second': 27.867, 'train_steps_per_second': 3.484, 'total_flos': 3.190467022353408e+16, 'train_loss': 0.19903237814101105, 'epoch': 10.0})

In [7]:
from datasets import concatenate_datasets

# Concatenar los datasets de entrenamiento y validaciÃ³n
full_train_dataset = concatenate_datasets([tokenized_train, tokenized_valid])

In [8]:

# AsegÃºrate de que el modelo es el mejor modelo obtenido al final del entrenamiento
model = trainer.model

# Reajustar los parÃ¡metros de entrenamiento, si lo deseas
final_training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Solo mantiene el mejor modelo
    load_best_model_at_end=True,  # Carga el mejor modelo despuÃ©s del entrenamiento
    metric_for_best_model="f1",  # MÃ©trica para la selecciÃ³n del mejor modelo
    greater_is_better=True,
    num_train_epochs=3,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",  # Coseno de descenso para la tasa de aprendizaje
)

# Crear un nuevo Trainer usando el conjunto de datos combinado
final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=full_train_dataset,  # Usar el conjunto combinado de train y val
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset = tokenized_valid
)

# Entrenar el modelo con todo el conjunto de datos
final_trainer.train()

  final_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Class 0,F1 Class 1
1,0.2594,0.272597,0.936332,0.923664,0.96598,0.504545
2,0.2071,0.200754,0.957944,0.955141,0.977143,0.737226
3,0.1434,0.185794,0.963493,0.960882,0.980174,0.769797


TrainOutput(global_step=6420, training_loss=0.20572091693818756, metrics={'train_runtime': 1439.6264, 'train_samples_per_second': 35.672, 'train_steps_per_second': 4.459, 'total_flos': 1.1964600808086528e+16, 'train_loss': 0.20572091693818756, 'epoch': 3.0})

In [10]:
# Paso 1: Cargar el conjunto de test desde un archivo CSV
test_df = pd.read_csv("data/test_afd.csv")
test_df = test_df.rename(columns={"Etiqueta": "labels"}) 
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(tokenize_function)
tokenized_test.set_format("torch")

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [11]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Asegurarse de que el modelo estÃ¡ en el dispositivo correcto
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Paso 1: Preparar los tensores para el DataLoader
# Convertir a tensores y evitar el warning
input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()

# Comprobar si token_type_ids estÃ¡n presentes
if 'token_type_ids' in tokenized_test:
    token_type_ids = torch.tensor(tokenized_test['token_type_ids']).clone().detach()
    dataset = TensorDataset(input_ids, attention_mask, token_type_ids)
else:
    dataset = TensorDataset(input_ids, attention_mask)

# Paso 2: Crear DataLoader
dataloader = DataLoader(dataset, batch_size=8)

# Paso 3: Inferencia loop
model.eval()
predictions = []

with torch.no_grad():
    for batch in dataloader:
        # Mover los tensores al dispositivo (GPU o CPU)
        if 'token_type_ids' in batch:
            input_ids_batch, attention_mask_batch, token_type_ids_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch,
                token_type_ids=token_type_ids_batch
            )
        else:
            input_ids_batch, attention_mask_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch
            )

        # Obtener las predicciones
        logits = outputs.logits
        batch_preds = torch.argmax(logits, dim=-1).cpu().numpy()  # Predecir la clase con mayor probabilidad
        predictions.extend(batch_preds)

# Paso 4: Guardar predicciones en el DataFrame
test_df['predicted_label'] = predictions

  input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
  attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()


In [12]:
test_df

Unnamed: 0,Texto,labels,predicted_label
0,We got to take a look at what I was left when ...,,0
1,We had an economy that was in free fall.,,0
2,The pandemic was so badly handled.,,0
3,Many people were dying.,,1
4,"All he said was, it's not that serious.",,0
...,...,...,...
2170,She gave a lot of it away to the Taliban.,,0
2171,She gave it to Afghanistan.,,0
2172,What these people have done to our country and...,,0
2173,Many of them are criminals and they're destroy...,,1


In [13]:
# Ver el conteo de cada etiqueta en las predicciones
label_counts = test_df['predicted_label'].value_counts()
print(label_counts)

predicted_label
0    1921
1     254
Name: count, dtype: int64


In [14]:
test_df.to_csv("afc_roberta-large_text.csv", index=False)