# SBERT fallacy classification


In [12]:
from transformers import Trainer, TrainingArguments
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1_macro = f1_score(labels, predictions, average="macro")
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1": f1
    }

In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model_name = "microsoft/MiniLM-L12-H384-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def tokenize_function(examples):
    return tokenizer(examples["Texto"], padding="max_length", truncation=True, max_length=256)

train_df = pd.read_csv("data/train_afc.csv")

train_df_split, val_df_split = train_test_split(train_df, test_size=0.2, stratify=train_df['Etiqueta'], random_state=42)

train_df_split = train_df_split.rename(columns={"Etiqueta": "labels"})
val_df_split = val_df_split.rename(columns={"Etiqueta": "labels"})

train_dataset = Dataset.from_pandas(train_df_split)
val_dataset = Dataset.from_pandas(val_df_split)


tokenized_train = train_dataset.map(tokenize_function)
tokenized_valid = val_dataset.map(tokenize_function)
tokenized_train.set_format("torch")
tokenized_valid.set_format("torch")

Map:   0%|          | 0/982 [00:00<?, ? examples/s]

Map:   0%|          | 0/246 [00:00<?, ? examples/s]

In [18]:
# Descongelar las capas del encoder
for param in model.bert.encoder.layer[:].parameters():
    param.requires_grad = True


# También asegurarse de que el clasificador final está entrenable (por si acaso)
for param in model.classifier.parameters():
    param.requires_grad = True
    
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    learning_rate=3e-5,
    lr_scheduler_type="linear",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1
1,No log,1.29979,0.630081,0.250187,0.587309
2,No log,1.752125,0.47561,0.177745,0.463836
3,No log,1.345784,0.589431,0.244844,0.566735
4,No log,1.239057,0.621951,0.251811,0.589763
5,No log,1.266186,0.621951,0.280115,0.59341
6,No log,1.401683,0.617886,0.333521,0.601356
7,No log,1.605194,0.51626,0.321975,0.527196
8,No log,1.394863,0.634146,0.305052,0.603516
9,No log,1.355601,0.630081,0.364119,0.616334
10,No log,1.531561,0.552846,0.292521,0.560105


TrainOutput(global_step=620, training_loss=0.4237736240510018, metrics={'train_runtime': 223.7154, 'train_samples_per_second': 87.79, 'train_steps_per_second': 2.771, 'total_flos': 646916666941440.0, 'train_loss': 0.4237736240510018, 'epoch': 20.0})

In [19]:
from datasets import concatenate_datasets

# Concatenar los datasets de entrenamiento y validación
full_train_dataset = concatenate_datasets([tokenized_train, tokenized_valid])

In [None]:
# Asegúrate de que el modelo es el mejor modelo obtenido al final del entrenamiento
model = trainer.model

# Reajustar los parámetros de entrenamiento, si lo deseas
final_training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Solo mantiene el mejor modelo
    load_best_model_at_end=True,  # Carga el mejor modelo después del entrenamiento
    metric_for_best_model="f1_macro",  # Métrica para la selección del mejor modelo
    greater_is_better=True,
    num_train_epochs=5,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",  # Coseno de descenso para la tasa de aprendizaje
)

# Crear un nuevo Trainer usando el conjunto de datos combinado
final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=full_train_dataset,  # Usar el conjunto combinado de train y val
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset = tokenized_valid
)

# Entrenar el modelo con todo el conjunto de datos
final_trainer.train()

  final_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1
1,No log,1.129596,0.686992,0.46503,0.68015
2,No log,0.888984,0.788618,0.555043,0.77077
3,No log,0.656898,0.845528,0.699935,0.838905
4,0.448800,0.532975,0.878049,0.760695,0.874789
5,0.448800,0.523149,0.886179,0.782835,0.882717


TrainOutput(global_step=770, training_loss=0.38857118680879665, metrics={'train_runtime': 80.322, 'train_samples_per_second': 76.442, 'train_steps_per_second': 9.586, 'total_flos': 202243805245440.0, 'train_loss': 0.38857118680879665, 'epoch': 5.0})

In [21]:
test_df = pd.read_csv("data/test_afc.csv")
test_df = test_df.rename(columns={"Etiqueta": "labels"})  # Si la columna en test también se llama "Etiqueta"
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(tokenize_function)
tokenized_test.set_format("torch")

Map:   0%|          | 0/2160 [00:00<?, ? examples/s]

In [22]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Asegurarse de que el modelo está en el dispositivo correcto
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Paso 1: Preparar los tensores para el DataLoader
# Convertir a tensores y evitar el warning
input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()

# Comprobar si token_type_ids están presentes
if 'token_type_ids' in tokenized_test:
    token_type_ids = torch.tensor(tokenized_test['token_type_ids']).clone().detach()
    dataset = TensorDataset(input_ids, attention_mask, token_type_ids)
else:
    dataset = TensorDataset(input_ids, attention_mask)

# Paso 2: Crear DataLoader
dataloader = DataLoader(dataset, batch_size=8)

# Paso 3: Inferencia loop
model.eval()
predictions = []

with torch.no_grad():
    for batch in dataloader:
        # Mover los tensores al dispositivo (GPU o CPU)
        if 'token_type_ids' in batch:
            input_ids_batch, attention_mask_batch, token_type_ids_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch,
                token_type_ids=token_type_ids_batch
            )
        else:
            input_ids_batch, attention_mask_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch
            )

        # Obtener las predicciones
        logits = outputs.logits
        batch_preds = torch.argmax(logits, dim=-1).cpu().numpy()  # Predecir la clase con mayor probabilidad
        predictions.extend(batch_preds)

# Paso 4: Guardar predicciones en el DataFrame
test_df['predicted_label'] = predictions

  input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
  attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()


In [23]:
label_counts = test_df['predicted_label'].value_counts()
print(label_counts)

predicted_label
0    1026
2     770
1     184
5      79
4      67
3      34
Name: count, dtype: int64


In [24]:
test_df.to_csv("afc_sbert_text.csv", index=False)