# ALBERT fallacy classification


In [1]:
from transformers import Trainer, TrainingArguments
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1_macro = f1_score(labels, predictions, average="macro")
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1": f1
    }

In [3]:
from transformers import AlbertTokenizer, AlbertModel, AlbertForSequenceClassification

model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=6) 

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def tokenize_function(examples):
    return tokenizer(examples["Texto"], padding="max_length", truncation=True, max_length=256)

train_df = pd.read_csv("data/train_afc.csv")

train_df_split, val_df_split = train_test_split(train_df, test_size=0.2, stratify=train_df['Etiqueta'], random_state=42)

train_df_split = train_df_split.rename(columns={"Etiqueta": "labels"})
val_df_split = val_df_split.rename(columns={"Etiqueta": "labels"})

train_dataset = Dataset.from_pandas(train_df_split)
val_dataset = Dataset.from_pandas(val_df_split)


tokenized_train = train_dataset.map(tokenize_function)
tokenized_valid = val_dataset.map(tokenize_function)
tokenized_train.set_format("torch")
tokenized_valid.set_format("torch")

Map:   0%|          | 0/982 [00:00<?, ? examples/s]

Map:   0%|          | 0/246 [00:00<?, ? examples/s]

In [8]:
# Congelar capas excepto las del encoder y la capa de clasificación
# Entrenar solo las últimas 2 capas del encoder
for name, param in model.named_parameters():
    if "albert.encoder.layer" in name:
        # Extraer el número de capa
        layer_number = int(name.split(".")[3])

        # Congelar todas las capas excepto las últimas 2
        if layer_number < 10:  # Congelar las primeras 10 capas
            param.requires_grad = False
        else:
            param.requires_grad = True  # Descongelar las últimas 2 capas

    # Descongelar siempre la capa de clasificación
    if "classifier" in name:
        param.requires_grad = True


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Solo mantiene el mejor modelo
    load_best_model_at_end=True,  # Carga el mejor modelo después del entrenamiento
    metric_for_best_model="f1_macro",  # Métrica para la selección del mejor modelo
    greater_is_better=True,
    num_train_epochs=20,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="reduce_lr_on_plateau",  # Coseno de descenso para la tasa de aprendizaje
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1
1,No log,2.821959,0.573171,0.361123,0.576948
2,No log,2.60678,0.686992,0.381433,0.626037
3,No log,2.69457,0.597561,0.393259,0.598504
4,No log,2.943582,0.634146,0.273866,0.571958
5,0.124600,2.433045,0.642276,0.466459,0.623865
6,0.124600,3.007512,0.642276,0.336399,0.607462
7,0.124600,3.03992,0.650407,0.355248,0.61162
8,0.124600,2.737272,0.646341,0.398552,0.614192
9,0.087900,2.765269,0.638211,0.392076,0.627022
10,0.087900,2.947514,0.646341,0.324866,0.602324


TrainOutput(global_step=2460, training_loss=0.08048835924970425, metrics={'train_runtime': 543.7447, 'train_samples_per_second': 36.12, 'train_steps_per_second': 4.524, 'total_flos': 234772153098240.0, 'train_loss': 0.08048835924970425, 'epoch': 20.0})

In [9]:
from datasets import concatenate_datasets

# Concatenar los datasets de entrenamiento y validación
full_train_dataset = concatenate_datasets([tokenized_train, tokenized_valid])

In [10]:
# Asegúrate de que el modelo es el mejor modelo obtenido al final del entrenamiento
model = trainer.model

# Reajustar los parámetros de entrenamiento, si lo deseas
final_training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Solo mantiene el mejor modelo
    load_best_model_at_end=True,  # Carga el mejor modelo después del entrenamiento
    metric_for_best_model="f1_macro",  # Métrica para la selección del mejor modelo
    greater_is_better=True,
    num_train_epochs=5,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",  # Coseno de descenso para la tasa de aprendizaje
)

# Crear un nuevo Trainer usando el conjunto de datos combinado
final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=full_train_dataset,  # Usar el conjunto combinado de train y val
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset = tokenized_valid
)

# Entrenar el modelo con todo el conjunto de datos
final_trainer.train()

  final_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1
1,No log,0.580327,0.833333,0.752173,0.826667
2,No log,0.443749,0.918699,0.862963,0.91485
3,No log,0.227465,0.95935,0.933812,0.958941
4,0.290600,0.165468,0.96748,0.962241,0.967271
5,0.290600,0.157075,0.971545,0.965902,0.971468


TrainOutput(global_step=770, training_loss=0.21569697144743685, metrics={'train_runtime': 164.8006, 'train_samples_per_second': 37.257, 'train_steps_per_second': 4.672, 'total_flos': 73396182282240.0, 'train_loss': 0.21569697144743685, 'epoch': 5.0})

In [11]:
test_df = pd.read_csv("data/test_afc.csv")
test_df = test_df.rename(columns={"Etiqueta": "labels"})  # Si la columna en test también se llama "Etiqueta"
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(tokenize_function)
tokenized_test.set_format("torch")

Map:   0%|          | 0/2160 [00:00<?, ? examples/s]

In [12]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Asegurarse de que el modelo está en el dispositivo correcto
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Paso 1: Preparar los tensores para el DataLoader
# Convertir a tensores y evitar el warning
input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()

# Comprobar si token_type_ids están presentes
if 'token_type_ids' in tokenized_test:
    token_type_ids = torch.tensor(tokenized_test['token_type_ids']).clone().detach()
    dataset = TensorDataset(input_ids, attention_mask, token_type_ids)
else:
    dataset = TensorDataset(input_ids, attention_mask)

# Paso 2: Crear DataLoader
dataloader = DataLoader(dataset, batch_size=8)

# Paso 3: Inferencia loop
model.eval()
predictions = []

with torch.no_grad():
    for batch in dataloader:
        # Mover los tensores al dispositivo (GPU o CPU)
        if 'token_type_ids' in batch:
            input_ids_batch, attention_mask_batch, token_type_ids_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch,
                token_type_ids=token_type_ids_batch
            )
        else:
            input_ids_batch, attention_mask_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch
            )

        # Obtener las predicciones
        logits = outputs.logits
        batch_preds = torch.argmax(logits, dim=-1).cpu().numpy()  # Predecir la clase con mayor probabilidad
        predictions.extend(batch_preds)

# Paso 4: Guardar predicciones en el DataFrame
test_df['predicted_label'] = predictions

  input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
  attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()


In [13]:
label_counts = test_df['predicted_label'].value_counts()
print(label_counts)

predicted_label
0    1528
2     287
1     253
5      35
3      32
4      25
Name: count, dtype: int64


In [14]:
test_df.to_csv("afc_albert_text.csv", index=False)