# ALBERT fallacy detection


In [1]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer, Trainer, TrainingArguments
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average="weighted")
    f1_class_0 = f1_score(labels, predictions, pos_label=0, average="binary")
    f1_class_1 = f1_score(labels, predictions, pos_label=1, average="binary")
    accuracy = accuracy_score(labels, predictions)

    return {"accuracy": accuracy,
            "f1": f1,
            "f1_class_0": f1_class_0,
            "f1_class_1": f1_class_1}

In [4]:
from transformers import AlbertTokenizer, AlbertModel

model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Ajusta num_labels según tu tarea

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

In [5]:
def tokenize_function(examples):
    return tokenizer(examples["Texto"], padding="max_length", truncation=True, max_length=128)

train_df = pd.read_csv("/content/drive/MyDrive/UPV master/HAIA/train_afd.csv")

train_df_split, val_df_split = train_test_split(train_df, test_size=0.2, stratify=train_df['Etiqueta'], random_state=42)

train_df_split = train_df_split.rename(columns={"Etiqueta": "labels"})
val_df_split = val_df_split.rename(columns={"Etiqueta": "labels"})

train_df_split["labels"] = train_df_split["labels"].astype(int)
val_df_split["labels"] = val_df_split["labels"].astype(int)

train_dataset = Dataset.from_pandas(train_df_split)
val_dataset = Dataset.from_pandas(val_df_split)


tokenized_train = train_dataset.map(tokenize_function)
tokenized_valid = val_dataset.map(tokenize_function)
tokenized_train.set_format("torch")
tokenized_valid.set_format("torch")

tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_valid.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/13694 [00:00<?, ? examples/s]

Map:   0%|          | 0/3424 [00:00<?, ? examples/s]

In [None]:
# === TRAINING ===
for name, param in model.named_parameters():
    if "albert.encoder.layer" in name:
        layer_number = int(name.split(".")[3])
        if layer_number < 10: 
            param.requires_grad = False
        else:
            param.requires_grad = True 

    if "classifier" in name:
        param.requires_grad = True


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  
    load_best_model_at_end=True, 
    metric_for_best_model="f1",  
    greater_is_better=True,
    num_train_epochs=10,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",  
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13694
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 17120
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mchust2902[0m ([33mchust2902-politechnical-university-valencia[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Class 0,F1 Class 1
1,0.3555,0.322785,0.908294,0.864645,0.951944,0.0
2,0.3448,0.377688,0.908294,0.864645,0.951944,0.0
3,0.3032,0.297534,0.88493,0.860952,0.938572,0.092166
4,0.31,0.435531,0.90625,0.879185,0.950302,0.174807
5,0.2962,0.444334,0.8934,0.879414,0.942601,0.253579
6,0.2216,0.506096,0.900409,0.883109,0.946644,0.253829
7,0.167,0.718569,0.882301,0.8759,0.935899,0.28164
8,0.0898,0.88009,0.880257,0.872277,0.934921,0.251825
9,0.0472,0.910597,0.884638,0.873767,0.937569,0.241843
10,0.021,0.911671,0.884638,0.873767,0.937569,0.241843


The following columns in the evaluation set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3424
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-1712
Configuration saved in ./results/checkpoint-1712/config.json
Model weights saved in ./results/checkpoint-1712/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1712/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1712/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore

TrainOutput(global_step=17120, training_loss=0.21379801440183246, metrics={'train_runtime': 3422.8486, 'train_samples_per_second': 40.008, 'train_steps_per_second': 5.002, 'total_flos': 818151513753600.0, 'train_loss': 0.21379801440183246, 'epoch': 10.0})

In [None]:
from datasets import concatenate_datasets

# === TRAIN WITH ALL THE DATASET ===
full_train_dataset = concatenate_datasets([tokenized_train, tokenized_valid])

In [None]:
model = trainer.model
final_training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  
    load_best_model_at_end=True, 
    metric_for_best_model="f1", 
    greater_is_better=True,
    num_train_epochs=2,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
)

final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=full_train_dataset,  
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset = tokenized_valid
)

final_trainer.train()


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 17118
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4280
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Class 0,F1 Class 1
1,0.2475,0.335319,0.925526,0.912589,0.9601,0.442013
2,0.1747,0.258241,0.948306,0.943656,0.972007,0.662857


The following columns in the evaluation set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3424
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-2140
Configuration saved in ./results/checkpoint-2140/config.json
Model weights saved in ./results/checkpoint-2140/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2140/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2140/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-10272] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: Texto, __index_level_0__. If Texto, __index_level_0__ are

TrainOutput(global_step=4280, training_loss=0.22354902374410185, metrics={'train_runtime': 843.5083, 'train_samples_per_second': 40.588, 'train_steps_per_second': 5.074, 'total_flos': 204543852963840.0, 'train_loss': 0.22354902374410185, 'epoch': 2.0})

In [None]:
# === TEST PREPROCESSING ===
test_df = pd.read_csv("/content/drive/MyDrive/UPV master/HAIA/test_afd.csv")
test_df = test_df.rename(columns={"Etiqueta": "labels"})
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(tokenize_function)
tokenized_test.set_format("torch")

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()

if 'token_type_ids' in tokenized_test:
    token_type_ids = torch.tensor(tokenized_test['token_type_ids']).clone().detach()
    dataset = TensorDataset(input_ids, attention_mask, token_type_ids)
else:
    dataset = TensorDataset(input_ids, attention_mask)

dataloader = DataLoader(dataset, batch_size=8)


# === PREDICTION ===
model.eval()
predictions = []
with torch.no_grad():
    for batch in dataloader:
        if 'token_type_ids' in batch:
            input_ids_batch, attention_mask_batch, token_type_ids_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch,
                token_type_ids=token_type_ids_batch)
        else:
            input_ids_batch, attention_mask_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch)

        # Predictions
        logits = outputs.logits
        batch_preds = torch.argmax(logits, dim=-1).cpu().numpy() 
        predictions.extend(batch_preds)

test_df['predicted_label'] = predictions

  input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
  attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()


In [12]:
test_df

Unnamed: 0,Texto,labels,predicted_label
0,We got to take a look at what I was left when ...,,0
1,We had an economy that was in free fall.,,0
2,The pandemic was so badly handled.,,0
3,Many people were dying.,,0
4,"All he said was, it's not that serious.",,0
...,...,...,...
2170,She gave a lot of it away to the Taliban.,,0
2171,She gave it to Afghanistan.,,0
2172,What these people have done to our country and...,,0
2173,Many of them are criminals and they're destroy...,,0


In [None]:
label_counts = test_df['predicted_label'].value_counts()
print(label_counts)

predicted_label
0    1913
1     262
Name: count, dtype: int64


In [14]:
test_df.to_csv("predicciones.csv", index=False)