# Fine-tuning BETO

Importing libraries.

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, DataCollatorForTokenClassification, Trainer
from sklearn.preprocessing import LabelEncoder
import warnings, evaluate, pickle, json
from tqdm import tqdm
import numpy as np
warnings.filterwarnings("ignore")

from datasets import disable_caching
disable_caching()

  from .autonotebook import tqdm as notebook_tqdm
2024-05-24 16:53:16.259792: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-24 16:53:16.309757: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Dataset tokenization

We load the dataset using the `datasets` library function `load_dataset`. This is the format used by Hugging Face.

In [2]:
# Read the three JSON splits into one dataset object keyed by split name.
dataset = load_dataset(
    "json",
    data_files={
        "train": "data/train_data.json",
        "validation": "data/val_data.json",
        "test": "data/test_data.json",
    },
)

Using the model id we can load both the model and its tokenizer. Since our data contains the placeholder token `@@PADDING@@`, we add it to the tokenizer's vocabulary so that it is kept as a single token.

In [3]:
# Hub id of the cased BETO checkpoint; reused further down to load the matching model.
model_id = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Register the placeholder as an added token. The call is left as the last
# expression of the cell so that its return value is displayed.
tokenizer.add_tokens(new_tokens = ["@@PADDING@@"])

1

Let us quickly check how the tokenizer transforms our words. Two things stand out:

- It adds special tokens to the beginning and end of the sentence, `[CLS]` and `[SEP]`. We will have to let the model know that these are special tokens.
- It breaks some words into sub-word pieces, marking every continuation piece with two leading hashes, for example `##subword`. When realigning the labels, we must make sure to label only the first piece of each word.

In [4]:
# Tokenize the first training example to inspect how words are split into pieces.
example = dataset["train"][0]
# `modified_words` is passed as pre-split words (is_split_into_words=True).
tokenized_input = tokenizer(example["modified_words"], is_split_into_words=True)
# Map the ids back to token strings so the sub-word pieces are visible.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 '@@PADDING@@',
 'Aun',
 'así',
 'no',
 'hemos',
 'mi',
 'favorito',
 'de',
 'los',
 'poca',
 'que',
 'AN',
 '##TE',
 'el',
 'momento',
 'SER',
 '##Á',
 '##N',
 'podido',
 'escuchar',
 'de',
 'Pe',
 '##er',
 '##G',
 '##yn',
 '##t',
 'Lobo',
 '##gri',
 '##s',
 'fuimos',
 '[SEP]']

The next function tokenizes each sentence and realigns its labels accordingly. In particular, it assigns the label -100 to both `[CLS]` and `[SEP]`, and it labels only the first sub-token of each word, assigning -100 to the remaining pieces so that the model ignores them.

In [5]:
def _align_labels_to_tokens(word_ids, word_labels):
    """Expand the per-word labels of one example to per-token labels.

    Args:
        word_ids: For each token, the index of the word it came from, or None
            for tokens that belong to no word (e.g. [CLS] / [SEP]).
        word_labels: One label id per original word.

    Returns:
        A list with one entry per token: the word's label on the first token
        of each word, and -100 everywhere else.
    """
    aligned = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None or word_idx == previous_word_idx:
            # Special token, or a continuation piece of the word already labelled.
            aligned.append(-100)
        else:
            aligned.append(word_labels[word_idx])
        previous_word_idx = word_idx
    return aligned


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align their labels to tokens.

    Args:
        examples: Batch mapping with "modified_words" (one list of words per
            example) and "ner_tags" (one list of label ids per example).

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding,
        per example, one label per token (-100 on tokens without a label).
    """
    tokenized_inputs = tokenizer(examples["modified_words"], truncation=True, is_split_into_words=True)
    tokenized_inputs["labels"] = [
        _align_labels_to_tokens(tokenized_inputs.word_ids(batch_index=i), word_labels)
        for i, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

We map our dataset with this function.

In [6]:
# batched=True hands the function a batch of examples per call, which is what
# tokenize_and_align_labels expects (it looks up word_ids by batch_index).
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|████████████████████| 819832/819832 [01:06<00:00, 12329.88 examples/s]
Map: 100%|████████████████████| 234237/234237 [00:19<00:00, 12140.06 examples/s]
Map: 100%|████████████████████| 117120/117120 [00:09<00:00, 12267.51 examples/s]


With it, we can define the `data_collator` for training.

In [7]:
# Batch collator for token classification, built from the same tokenizer; it is
# passed to the Trainer as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

With our previously saved `LabelEncoder` we define the dictionaries `id2label` and `label2id` to convert between integer ids and label names. This information is also passed to the model configuration.

In [10]:
# NOTE: pickle.load can execute arbitrary code; only load an encoder file that
# was produced by a trusted source.
with open("data/labelencoder.pkl", "rb") as f:
    le = pickle.load(f)

# Class index -> label, followed by the inverse mapping.
id2label = dict(enumerate(le.classes_))
label2id = {label: idx for idx, label in id2label.items()}

We load the model and resize its embedding matrix to match the tokenizer, since we added one more token.

In [11]:
# Token-classification model whose label count and label maps come from the
# saved label encoder.
model = AutoModelForTokenClassification.from_pretrained(
    model_id,
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id,
)
# The tokenizer gained a token, so the embedding matrix is resized to its new length.
model.resize_token_embeddings(len(tokenizer))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(31003, 768)

We compute metrics with the help of the `seqeval` library. We also define a function that preprocesses the logits before the metrics are calculated, reducing them to predicted label ids; this helps optimize the training and evaluation loop.

In [12]:
# Load the seqeval metric once; compute_metrics calls seqeval.compute on each evaluation.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Score predicted label ids against the gold labels with seqeval.

    Args:
        p: Evaluation output exposing `predictions` (predicted label ids per
            token) and `label_ids` (gold label ids per token, -100 where a
            token carries no label).

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by seqeval.
    """
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(p.predictions, p.label_ids):
        # Keep only the positions that carry a real label.
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([id2label[pred_id] for pred_id, _ in kept])
        true_labels.append([id2label[label_id] for _, label_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw logits to predicted label ids before they are passed on for metrics.

    Args:
        logits: Tensor of per-token class scores with the class dimension last,
            or a tuple/list whose first element is that tensor.
        labels: Gold labels supplied alongside the logits; unused here.

    Returns:
        A CPU tensor of predicted class ids (argmax over the last dimension).
    """
    # Some models return extra outputs alongside the logits; keep only the logits.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    # Take the argmax on the tensor's own device and move only the resulting ids
    # to the CPU, instead of copying the full logits tensor across first.
    return logits.argmax(dim=-1).cpu()

Now, the `TrainingArguments` definition. We train for 3 epochs using `fp16` mixed precision, and we keep only the best model according to the F1 score.

In [13]:
training_args = TrainingArguments(
    output_dir="test_model_bert",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    # Evaluation and checkpointing both run once per epoch, so every saved
    # checkpoint has a matching set of validation metrics.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    save_total_limit=1,
    # "f1" is one of the keys returned by compute_metrics; higher is better.
    metric_for_best_model="f1",
    greater_is_better=True
)

# compute_metrics indexes id2label with the predictions it receives, so it
# relies on preprocess_logits_for_metrics having turned logits into label ids.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

# Long-running cell: the recorded run reports a train_runtime of about 15090 s.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2375,0.210232,0.875939,0.832718,0.853782,0.946279
2,0.1756,0.18333,0.886929,0.855005,0.870674,0.952287
3,0.1294,0.181506,0.888872,0.863606,0.876057,0.954033


TrainOutput(global_step=76860, training_loss=0.22168434643354573, metrics={'train_runtime': 15090.5712, 'train_samples_per_second': 162.982, 'train_steps_per_second': 5.093, 'total_flos': 1.21867179207792e+17, 'train_loss': 0.22168434643354573, 'epoch': 3.0})

Then, we'll save the best model.

In [14]:
# load_best_model_at_end=True is set in the TrainingArguments, so the trainer
# presumably holds the best-F1 checkpoint at this point (confirm before relying on it).
trainer.save_model("bert_ner_model")