# SBERT fallacy detection


In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch
from torch import nn

In [27]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.metrics import accuracy_score, f1_score
from transformers import EvalPrediction

def compute_metrics(eval_pred: EvalPrediction):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    f1 = f1_score(labels, predictions, average="weighted")
    f1_class_0 = f1_score(labels, predictions, pos_label=0, average="binary")
    f1_class_1 = f1_score(labels, predictions, pos_label=1, average="binary")
    accuracy = accuracy_score(labels, predictions)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "f1_class_0": f1_class_0,
        "f1_class_1": f1_class_1}

In [None]:
from transformers import AutoModel, AutoTokenizer

model_name = "microsoft/MiniLM-L12-H384-uncased"
base_model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from c

In [None]:
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("/content/drive/MyDrive/UPV master/HAIA/train_afd.csv")

train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["Etiqueta"], random_state=42)
train_df = train_df.rename(columns={"Etiqueta": "labels"})
val_df = val_df.rename(columns={"Etiqueta": "labels"})

def tokenize_function(examples):
    return tokenizer(examples["Texto"], padding="max_length", truncation=True, max_length=128)

train_dataset = Dataset.from_pandas(train_df).map(tokenize_function)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_function)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/13694 [00:00<?, ? examples/s]

Map:   0%|          | 0/3424 [00:00<?, ? examples/s]

In [None]:
class MiniLMClassifier(nn.Module):
    def __init__(self, base_model, hidden_size=384, num_labels=2):
        super(MiniLMClassifier, self).__init__()
        self.base_model = base_model
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        #[CLS]
        cls_output = outputs.last_hidden_state[:, 0, :]  
        logits = self.classifier(cls_output)

        if labels is not None:
            labels = labels.to(torch.long)  
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        else:
            loss = None

        return {"loss": loss, "logits": logits}

In [None]:
for name, param in base_model.named_parameters():
    if "encoder.layer" in name:
        layer_num = int(name.split("encoder.layer.")[-1].split(".")[0])
        param.requires_grad = layer_num >= 8  

    else:
        param.requires_grad = False  

model = MiniLMClassifier(base_model)

# === TRAINING ===
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `MiniLMClassifier.forward` and have been ignored: Texto, token_type_ids, __index_level_0__. If Texto, token_type_ids, __index_level_0__ are not expected by `MiniLMClassifier.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13694
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4280
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Class 0,F1 Class 1
1,No log,0.267233,0.908879,0.867718,0.952191,0.031056
2,0.292800,0.274737,0.908586,0.865921,0.952075,0.012618
3,0.267500,0.272336,0.903621,0.888355,0.948276,0.294872
4,0.253200,0.268146,0.911507,0.883385,0.95319,0.192
5,0.240300,0.270145,0.910047,0.887331,0.952174,0.245098
6,0.231500,0.270249,0.90771,0.887886,0.950763,0.265116
7,0.231500,0.278492,0.899825,0.886433,0.946078,0.295688
8,0.225500,0.278751,0.903037,0.885776,0.948076,0.268722
9,0.217000,0.280755,0.901869,0.884962,0.947418,0.266376
10,0.210300,0.280928,0.901285,0.885379,0.947039,0.274678


The following columns in the evaluation set  don't have a corresponding argument in `MiniLMClassifier.forward` and have been ignored: Texto, token_type_ids, __index_level_0__. If Texto, token_type_ids, __index_level_0__ are not expected by `MiniLMClassifier.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3424
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-428
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./results/checkpoint-428/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-428/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `MiniLMClassifier.forward` and have been ignored: Texto, token_type_ids, __index_level_0__. If Texto, token_type_ids, __index_level_0__ are not expected by `MiniLMClassifier.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num exam

TrainOutput(global_step=4280, training_loss=0.2402760915667097, metrics={'train_runtime': 615.2988, 'train_samples_per_second': 222.559, 'train_steps_per_second': 6.956, 'total_flos': 0.0, 'train_loss': 0.2402760915667097, 'epoch': 10.0})

In [None]:
from datasets import concatenate_datasets

# === TRAIN WITH ALL THE DATASET ===
full_train_dataset = concatenate_datasets([train_dataset, val_dataset])

In [None]:
model = trainer.model

final_training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  
    load_best_model_at_end=True,  
    metric_for_best_model="f1", 
    greater_is_better=True,
    num_train_epochs=3,
    weight_decay=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",  
)

final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=full_train_dataset,  
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset = val_dataset
)

final_trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `MiniLMClassifier.forward` and have been ignored: Texto, token_type_ids, __index_level_0__. If Texto, token_type_ids, __index_level_0__ are not expected by `MiniLMClassifier.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 17118
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6420
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Class 0,F1 Class 1
1,0.2656,0.257076,0.915888,0.897269,0.955154,0.323944
2,0.2559,0.24907,0.923773,0.904662,0.959466,0.361858
3,0.2613,0.247684,0.922605,0.905086,0.958755,0.373522


The following columns in the evaluation set  don't have a corresponding argument in `MiniLMClassifier.forward` and have been ignored: Texto, token_type_ids, __index_level_0__. If Texto, token_type_ids, __index_level_0__ are not expected by `MiniLMClassifier.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3424
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-2140
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./results/checkpoint-2140/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2140/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-4280] due to args.save_total_limit
Deleting older checkpoint [results/checkpoint-6420] due to args.save_total_limit
Deleting older checkpoint [results/checkpoint-428] due to args.save_total_limit
Deleting older checkpoint [results/checkpoint-856] due to args.save_total_limit
Deleting older checkpo

TrainOutput(global_step=6420, training_loss=0.2613012450506383, metrics={'train_runtime': 280.8095, 'train_samples_per_second': 182.878, 'train_steps_per_second': 22.862, 'total_flos': 0.0, 'train_loss': 0.2613012450506383, 'epoch': 3.0})

In [None]:
# === TEST PREPROCESSING ===
test_df = pd.read_csv("/content/drive/MyDrive/UPV master/HAIA/test_afd.csv")
test_df = test_df.rename(columns={"Etiqueta": "labels"})  
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(tokenize_function)
tokenized_test.set_format("torch")

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()

if 'token_type_ids' in tokenized_test:
    token_type_ids = torch.tensor(tokenized_test['token_type_ids']).clone().detach()
    dataset = TensorDataset(input_ids, attention_mask, token_type_ids)
else:
    dataset = TensorDataset(input_ids, attention_mask)


dataloader = DataLoader(dataset, batch_size=8)

# === PREDICTION ===
model.eval()
predictions = []
with torch.no_grad():
    for batch in dataloader:
        if 'token_type_ids' in batch:
            input_ids_batch, attention_mask_batch, token_type_ids_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch,
                token_type_ids=token_type_ids_batch)
        else:
            input_ids_batch, attention_mask_batch = [t.to(device) for t in batch]
            outputs = model(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch)

        logits = outputs['logits']  
        batch_preds = torch.argmax(logits, dim=-1).cpu().numpy()  
        predictions.extend(batch_preds)

test_df['predicted_label'] = predictions

  input_ids = torch.tensor(tokenized_test['input_ids']).clone().detach()
  attention_mask = torch.tensor(tokenized_test['attention_mask']).clone().detach()


In [37]:
test_df

Unnamed: 0,Texto,labels,predicted_label
0,We got to take a look at what I was left when ...,,0
1,We had an economy that was in free fall.,,0
2,The pandemic was so badly handled.,,0
3,Many people were dying.,,0
4,"All he said was, it's not that serious.",,0
...,...,...,...
2170,She gave a lot of it away to the Taliban.,,0
2171,She gave it to Afghanistan.,,0
2172,What these people have done to our country and...,,0
2173,Many of them are criminals and they're destroy...,,1


In [None]:
label_counts = test_df['predicted_label'].value_counts()
print(label_counts)

predicted_label
0    1996
1     179
Name: count, dtype: int64


In [39]:
test_df.to_csv("predicciones.csv", index=False)