# BERT fallacy detection


In [1]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer, Trainer, TrainingArguments
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average="weighted")
    f1_class_0 = f1_score(labels, predictions, pos_label=0, average="binary")
    f1_class_1 = f1_score(labels, predictions, pos_label=1, average="binary")
    accuracy = accuracy_score(labels, predictions)

    return {"accuracy": accuracy,
            "f1": f1,
            "f1_class_0": f1_class_0,
            "f1_class_1": f1_class_1}

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification

model_name = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def tokenize_function(examples):
    return tokenizer(examples["Texto"], padding="max_length", truncation=True, max_length=128)

train_df = pd.read_csv("data/train_afd.csv")

train_df_split, val_df_split = train_test_split(train_df, test_size=0.2, stratify=train_df['Etiqueta'], random_state=42)

train_df_split = train_df_split.rename(columns={"Etiqueta": "labels"})
val_df_split = val_df_split.rename(columns={"Etiqueta": "labels"})

train_df_split["labels"] = train_df_split["labels"].astype(int)
val_df_split["labels"] = val_df_split["labels"].astype(int)

train_dataset = Dataset.from_pandas(train_df_split)
val_dataset = Dataset.from_pandas(val_df_split)


tokenized_train = train_dataset.map(tokenize_function)
tokenized_valid = val_dataset.map(tokenize_function)
tokenized_train.set_format("torch")
tokenized_valid.set_format("torch")

tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_valid.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/13694 [00:00<?, ? examples/s]

Map:   0%|          | 0/3424 [00:00<?, ? examples/s]

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

model_name = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

for name, param in model.named_parameters():
    if "encoder.layer" in name:
        layer_number = int(name.split(".")[3])

        if layer_number < 23: 
            param.requires_grad = False
        else:
            param.requires_grad = True  

    if "classifier" in name:
        param.requires_grad = True

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from transformers import Trainer
from torch.nn import CrossEntropyLoss

class_weights = torch.tensor([0.2, 0.8]).to(model.device) 


def custom_loss_fn(outputs, labels):
    loss_fct = CrossEntropyLoss(weight=class_weights)
    return loss_fct(outputs.logits, labels)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = custom_loss_fn(outputs, labels)
        return (loss, outputs) if return_outputs else loss
  
    
# === TRAINING === 
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  
    load_best_model_at_end=True, 
    metric_for_best_model="f1", 
    greater_is_better=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",)


trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Class 0,F1 Class 1
1,0.6152,0.50173,0.852512,0.852617,0.918797,0.197138
2,0.5969,0.8603,0.909171,0.867329,0.952366,0.025078
3,0.6348,0.658382,0.906542,0.883942,0.950264,0.227053
4,0.5641,0.709165,0.899825,0.885156,0.946162,0.280922
5,0.5516,0.795564,0.904206,0.886308,0.94875,0.267857
6,0.5389,0.830487,0.893984,0.883532,0.942663,0.297872
7,0.4743,0.81232,0.901285,0.884834,0.947072,0.268398
8,0.4648,0.834349,0.890187,0.882005,0.940393,0.303704
9,0.4203,0.884464,0.895736,0.883564,0.943753,0.287425
10,0.467,0.886983,0.89632,0.883972,0.944086,0.288577


TrainOutput(global_step=17120, training_loss=0.5342671256199061, metrics={'train_runtime': 3908.3696, 'train_samples_per_second': 35.038, 'train_steps_per_second': 4.38, 'total_flos': 3.190467022353408e+16, 'train_loss': 0.5342671256199061, 'epoch': 10.0})

In [None]:
from datasets import concatenate_datasets

# === TRAIN WITH ALL THE DATASET ===
full_train_dataset = concatenate_datasets([tokenized_train, tokenized_valid])

In [None]:
model = trainer.model

final_training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  
    load_best_model_at_end=True,  
    metric_for_best_model="f1_class_1",  
    greater_is_better=True,
    num_train_epochs=2,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",  
)

final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=full_train_dataset,  
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset = tokenized_valid
)

final_trainer.train()

  final_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Class 0,F1 Class 1
1,0.267,0.291142,0.912091,0.896568,0.952917,0.338462
2,0.2283,0.289348,0.918224,0.901954,0.956305,0.363636


TrainOutput(global_step=4280, training_loss=0.2524292366526951, metrics={'train_runtime': 1004.5463, 'train_samples_per_second': 34.081, 'train_steps_per_second': 4.261, 'total_flos': 7976400538724352.0, 'train_loss': 0.2524292366526951, 'epoch': 2.0})

In [None]:
# === TEST PREPROCESSING ===
test_df = pd.read_csv("data/test_afd.csv")
test_df = test_df.rename(columns={"Etiqueta": "labels"}) 
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(tokenize_function)
tokenized_test.set_format("torch")

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()

if 'token_type_ids' in tokenized_test:
    token_type_ids = torch.tensor(tokenized_test['token_type_ids']).clone().detach()
    dataset = TensorDataset(input_ids, attention_mask, token_type_ids)
else:
    dataset = TensorDataset(input_ids, attention_mask)

dataloader = DataLoader(dataset, batch_size=8)

# === PREDICTION ===
model.eval()
predictions = []
with torch.no_grad():
    for batch in dataloader:
        if 'token_type_ids' in batch:
            input_ids_batch, attention_mask_batch, token_type_ids_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch,
                token_type_ids=token_type_ids_batch)
        else:
            input_ids_batch, attention_mask_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch)


        logits = outputs.logits
        batch_preds = torch.argmax(logits, dim=-1).cpu().numpy()  # Predecir la clase con mayor probabilidad
        predictions.extend(batch_preds)

test_df['predicted_label'] = predictions

  input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
  attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()


In [13]:
test_df

Unnamed: 0,Texto,labels,predicted_label
0,We got to take a look at what I was left when ...,,0
1,We had an economy that was in free fall.,,0
2,The pandemic was so badly handled.,,0
3,Many people were dying.,,0
4,"All he said was, it's not that serious.",,0
...,...,...,...
2170,She gave a lot of it away to the Taliban.,,0
2171,She gave it to Afghanistan.,,0
2172,What these people have done to our country and...,,0
2173,Many of them are criminals and they're destroy...,,0


In [None]:
label_counts = test_df['predicted_label'].value_counts()
print(label_counts)

predicted_label
0    2023
1     152
Name: count, dtype: int64


In [None]:
test_df.to_csv("afd_bert_text.csv", index=False)