# Fine-tuning XLM-RoBERTa for gender bias detection in Spanish

In [None]:
!pip install datasets numpy pandas transformers[torch] wandb

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting wandb
  Downloading wandb-0.16.4-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import Dataset, DatasetDict, load_metric
import numpy as np
import pandas as pd
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DefaultDataCollator,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    set_seed,
)
import wandb

In [None]:
# Get model and tokenizer as global variables
model_name = "FacebookAI/xlm-roberta-base"
project_name = "XLM-RoBERTa-Gender-Bias-Detection"
run_name = "final-run-finetuning"
seed = 42

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# The classification head is not stored in the base checkpoint and is
# initialised randomly, so fix the RNG state first to make runs repeatable.
set_seed(seed)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # binary classification
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Start the W&B run; the run name is only forwarded when one is configured.
init_kwargs = {"project": project_name}
if run_name is not None:
    init_kwargs["name"] = run_name
wandb.init(**init_kwargs);  # trailing ';' keeps the cell from echoing the run object

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▃▆▇█▁
eval/f1,▁███▇
eval/loss,▁▁▁▂█
eval/precision,▁▇██▇
eval/recall,▁█▆▇█
eval/runtime,█▃▄▂▁
eval/samples_per_second,▁▅▅▇█
eval/steps_per_second,▁▅▅▇█
train/epoch,▁▃▅▆██
train/global_step,▁▃▅▆██

0,1
eval/accuracy,0.8125
eval/f1,0.71922
eval/loss,0.82626
eval/precision,0.69799
eval/recall,0.76535
eval/runtime,1.7212
eval/samples_per_second,167.324
eval/steps_per_second,10.458
train/epoch,5.0
train/global_step,420.0


In [None]:
def tokenize_function(examples, max_length=100):
    """Tokenize the ``"text"`` entry of a batch with the global ``tokenizer``.

    Sequences are padded and truncated to ``max_length`` tokens.

    Args:
        examples: Mapping whose ``"text"`` entry holds the raw input text(s).
        max_length: Target sequence length passed to the tokenizer
            (defaults to 100, the value previously hard-coded here).

    Returns:
        The tokenizer's output for ``examples["text"]``.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )

In [None]:
def compute_metrics(eval_pred):
    """Compute macro precision, recall, F1 and accuracy for one evaluation pass.

    The scores are calculated directly with NumPy instead of through
    ``datasets.load_metric``: that loader is deprecated (and removed in
    ``datasets`` 3.0), and loading four metric scripts on every evaluation
    call was both wasteful and the source of repeated deprecation warnings.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            argmax of ``logits`` over its last axis; ``labels`` holds the
            reference class ids.

    Returns:
        dict with float entries ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``. Precision, recall and F1 are unweighted means over
        every class that occurs in the labels or in the predictions. A class
        that is never predicted contributes a precision of 0; a class with no
        reference samples contributes a recall of 0. All values are 0.0 when
        there are no samples.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    if labels.size == 0:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "accuracy": 0.0}

    precisions, recalls, f1_scores = [], [], []
    for cls in np.union1d(labels, predictions):
        is_predicted = predictions == cls
        is_actual = labels == cls
        true_pos = np.count_nonzero(is_predicted & is_actual)
        n_predicted = np.count_nonzero(is_predicted)
        n_actual = np.count_nonzero(is_actual)

        precisions.append(true_pos / n_predicted if n_predicted else 0.0)
        recalls.append(true_pos / n_actual if n_actual else 0.0)
        # cls occurs in the labels or the predictions, so this sum is never 0.
        f1_scores.append(2 * true_pos / (n_predicted + n_actual))

    return {
        "precision": float(np.mean(precisions)),
        "recall": float(np.mean(recalls)),
        "f1": float(np.mean(f1_scores)),
        "accuracy": float(np.mean(predictions == labels)),
    }


In [None]:
# Read the train, test and validation CSV files into DataFrames
train_data, test_data, valid_data = (
    pd.read_csv(csv_path)
    for csv_path in ("train_data.csv", "test_data.csv", "val_data.csv")
)

In [None]:
# Rename the CSV columns to "text" / "label" and keep only those two columns
train_data = (
    train_data
    .rename(columns={"Contents": "text", "Label": "label"})
    [["text", "label"]]
)
test_data = (
    test_data
    .rename(columns={"Contents": "text", "Label": "label"})
    [["text", "label"]]
)
valid_data = (
    valid_data
    .rename(columns={"Contents": "text", "Label": "label"})
    [["text", "label"]]
)

# One HuggingFace Dataset per split, keyed by split name
dataset_dict = {
    split: Dataset.from_pandas(frame)
    for split, frame in (
        ("train", train_data),
        ("test", test_data),
        ("valid", valid_data),
    )
}

# Wrap the splits in a DatasetDict and display its summary
dataset = DatasetDict(dataset_dict)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1339
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 287
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 288
    })
})

In [None]:
# Apply the tokenizer to every split (batched), then show one training example
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
tokenized_dataset["train"][0]


Map:   0%|          | 0/1339 [00:00<?, ? examples/s]

Map:   0%|          | 0/287 [00:00<?, ? examples/s]

Map:   0%|          | 0/288 [00:00<?, ? examples/s]

{'text': '@marioem_95 aún le queda un poquillo, pero por supuesto. aunque tengo lista de rivales pendientes ',
 'label': 0,
 'input_ids': [0,
  1374,
  39,
  6723,
  195,
  454,
  8821,
  37419,
  95,
  20856,
  51,
  160,
  85205,
  365,
  4,
  1788,
  196,
  85151,
  5,
  18953,
  30412,
  5875,
  8,
  43876,
  90,
  170924,
  7,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [None]:
# Defining the Data Collator
data_collator = DefaultDataCollator()

# Training
training_args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Log the training loss once per epoch as well: an epoch is much shorter
    # than the default 500-step logging interval, which left the training
    # loss column as "No log" for most epochs.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    report_to="wandb",
    # Checkpoints are compared on validation macro-F1 and the best one is
    # restored once training ends (also required by early stopping).
    metric_for_best_model="eval_f1",
    load_best_model_at_end=True,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        # Stop once the monitored metric has not improved for 3 evaluations.
        EarlyStoppingCallback(
            early_stopping_patience=3, early_stopping_threshold=0.00
        )
    ],
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.436771,0.414931,0.5,0.45351,0.829861
2,No log,0.374122,0.922261,0.55102,0.550447,0.847222
3,No log,0.384498,0.728111,0.759841,0.741745,0.84375
4,No log,0.380354,0.728007,0.680813,0.699288,0.847222
5,No log,0.4601,0.711096,0.737341,0.722521,0.833333
6,0.328400,0.496984,0.735882,0.739689,0.737755,0.850694


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory output/checkpoint-84 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=504, training_loss=0.3271509738905089, metrics={'train_runtime': 276.9663, 'train_samples_per_second': 48.345, 'train_steps_per_second': 3.033, 'total_flos': 412858245852000.0, 'train_loss': 0.3271509738905089, 'epoch': 6.0})

In [None]:
# Score the test split. The "test" prefix keeps these metrics apart from the
# per-epoch validation metrics, which are reported under the "eval" prefix.
trainer.evaluate(tokenized_dataset["test"], metric_key_prefix="test")