# Fine-tuning BERT

This notebook fine-tunes the multilingual BERT model.

For faster processing, it is recommended to run this notebook in Google Colab.

This step utilises code from the following GitHub repository: [bert4srl](https://github.com/angel-daza/bert4srl)

In [2]:
# Install the system package python3.10-venv; presumably required by the
# `python -m venv` call in the following cell.
! apt install python3.10-venv

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  python3-pip-whl python3-setuptools-whl
The following NEW packages will be installed:
  python3-pip-whl python3-setuptools-whl python3.10-venv
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 2,473 kB of archives.
After this operation, 2,884 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 python3-pip-whl all 22.0.2+dfsg-1ubuntu0.4 [1,680 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 python3-setuptools-whl all 59.6.0-1.2ubuntu0.22.04.1 [788 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 python3.10-venv amd64 3.10.12-1~22.04.3 [5,716 B]
Fetched 2,473 kB in 3s (989 kB/s)
Selecting previously unselected package python3-pip-whl.
(Reading database ... 121925 files and directories currently installed.)
Prep

In [3]:
# Create a virtual environment named `myvenve` in the current working directory.
! python -m venv myvenve
# NOTE(review): every `!` line runs in its own short-lived subshell, so this
# activation does not carry over to later cells or to the notebook kernel.
# The subsequent `pip install` cells therefore presumably install into the
# default runtime environment, not into `myvenve` -- confirm whether the venv
# is needed at all.
! source myvenve/bin/activate

In [4]:
%%writefile requirements.txt
keras==2.8.0
seqeval==1.2.2
tabulate==0.8.9
tensorflow==2.8.4
torch==1.11.0
transformers==4.17.0

Writing requirements.txt


In [5]:
! pip install -r requirements.txt

Collecting keras==2.8.0 (from -r requirements.txt (line 1))
  Downloading keras-2.8.0-py2.py3-none-any.whl (1.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m0.9/1.4 MB[0m [31m28.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval==1.2.2 (from -r requirements.txt (line 2))
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tabulate==0.8.9 (from -r requirements.txt (line 3))
  Downloading tabulate-0.8.9-py3-none-any.whl (2

In [6]:
"""
    This BERT training code is based on the script here: https://mccormickml.com/2019/07/22/BERT-fine-tuning/
    We adapted it for the TokenClassification task, specifically for Named Entity Recognition.
"""

from typing import List, Dict, Tuple
import random, time, os
import torch
from torch.nn import CrossEntropyLoss
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, AutoModelForTokenClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import logging, sys, argparse

In [7]:
! pip install seqeval



In [9]:
# Our code behind the scenes!
# Mount Google Drive so that the bert4srl checkout stored there becomes reachable.
from google.colab import drive
drive.mount('/content/drive')

import sys

# Make the project-local modules importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate sys.path entry.
BERT4SRL_DIR = '/content/drive/MyDrive/Colab Notebooks/bert4srl-master'
if BERT4SRL_DIR not in sys.path:
    sys.path.append(BERT4SRL_DIR)

# Project-local helpers used below for dataset loading, evaluation and checkpoint I/O.
import utils_srl

Mounted at /content/drive


In [10]:
# Set arguments

class Args:
    """Static run configuration used in place of command-line arguments.

    Every setting is a class attribute, so ``Args()`` takes no constructor
    arguments. ``repr()`` lists all settings, which makes ``print(args)``
    useful in the notebook output instead of showing a bare object address.
    """

    # Training / development data files (.jsonl) and the checkpoint output directory.
    train_path = '/content/drive/MyDrive/Colab Notebooks/bert4srl-master/data/train_by_sentence.jsonl'
    dev_path = '/content/drive/MyDrive/Colab Notebooks/bert4srl-master/data/annotation_dev_processed.jsonl'
    save_model_dir = '/content/drive/MyDrive/Colab Notebooks/bert4srl-master/saved_models_tuned/TRIAL_BERT_SRL'
    bert_model = "bert-base-multilingual-cased"  # name of the pretrained checkpoint to fine-tune
    recover_epoch = None  # epoch to resume from; None means start from scratch
    gpu = 0  # GPU index
    seed_val = 1373  # seed for the random number generators
    epochs = 10
    batch_size = 16
    info_every = 100  # report progress every this many training batches
    max_len = 256  # maximum sequence length passed to the dataset loader
    learning_rate = 3e-5
    gradient_clip = 1.0  # maximum gradient norm

    def __repr__(self) -> str:
        """Return ``Args(name=value, ...)`` for every public, non-callable setting."""
        settings = {
            name: getattr(self, name)
            for name in dir(self)
            if not name.startswith('_') and not callable(getattr(self, name))
        }
        listing = ', '.join(f'{name}={value!r}' for name, value in settings.items())
        return f'Args({listing})'

args = Args()

# Initialise the params
# A falsy recover_epoch (e.g. None) means a fresh run: start counting at epoch 0
# and do not reload a checkpoint.
START_EPOCH = int(args.recover_epoch) if args.recover_epoch else 0
RECOVER_CHECKPOINT = bool(args.recover_epoch)
EPOCHS = args.epochs
BERT_MODEL_NAME = args.bert_model
DO_LOWERCASE = False
GPU_RUN_IX = args.gpu

SEED_VAL = args.seed_val
SEQ_MAX_LEN = args.max_len
PRINT_INFO_EVERY = args.info_every
GRADIENT_CLIP = args.gradient_clip
LEARNING_RATE = args.learning_rate
BATCH_SIZE = args.batch_size

# Input data and output artefact locations; the loss and label files live
# inside the checkpoint directory.
TRAIN_DATA_PATH = args.train_path
DEV_DATA_PATH = args.dev_path
MODEL_DIR = args.save_model_dir
LOSS_FILENAME = f"{MODEL_DIR}/Losses_{START_EPOCH}_{EPOCHS}.json"
LABELS_FILENAME = f"{MODEL_DIR}/label2index.json"

# Label id that CrossEntropyLoss skips when computing the loss.
PAD_TOKEN_LABEL_ID = CrossEntropyLoss().ignore_index # -100

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check, so the cell can be re-run safely.
os.makedirs(MODEL_DIR, exist_ok=True)


In [11]:
# =====================================================================================
#                    LOGGING INFO ...
# =====================================================================================
# Log both to the cell output (stdout) and to a per-run log file inside MODEL_DIR.
console_hdlr = logging.StreamHandler(sys.stdout)
file_hdlr = logging.FileHandler(filename=f"{MODEL_DIR}/BERT_TokenClassifier_train_{START_EPOCH}_{EPOCHS}.log")
# force=True (Python 3.8+) removes and closes handlers already attached to the
# root logger before installing ours. Without it basicConfig() silently does
# nothing whenever the root logger is already configured -- for example when
# this cell is run a second time -- and the two handlers above are never used.
logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr], force=True)
print("Start Logging")
print(args)

# Initialize Random seeds and validate if there's a GPU available...
device, USE_CUDA = utils_srl.get_torch_device(GPU_RUN_IX)
# Seed every random number generator in use so that runs are repeatable.
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed_all(SEED_VAL)

Start Logging
<__main__.Args object at 0x7b5a5911ecb0>
There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [12]:
# ==========================================================================================
#               LOAD TRAIN & DEV DATASETS
# ==========================================================================================
# Tokenizer matching the chosen checkpoint; basic tokenization is switched off.
tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL_NAME,
    do_lower_case=DO_LOWERCASE,
    do_basic_tokenize=False,
)

# Training split. No label mapping is supplied here (label2index=None); the
# mapping returned by the loader is saved and reused for the dev split below.
(train_label2index,
 train_inputs,
 train_masks,
 train_labels,
 train_lens,
 train_preds) = utils_srl.load_srl_dataset(
    TRAIN_DATA_PATH,
    tokenizer,
    max_len=SEQ_MAX_LEN,
    include_labels=True,
    label2index=None,
)
utils_srl.save_label_dict(train_label2index, filename=LABELS_FILENAME)
# Inverse mapping: label index -> label string.
index2label = {index: label for label, index in train_label2index.items()}

# Wrap the training tensors in a randomly sampled, batched DataLoader.
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_preds)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=BATCH_SIZE,
)

# Development split (optional): loaded with the training label mapping.
if DEV_DATA_PATH:
    (_,
     dev_inputs,
     dev_masks,
     dev_labels,
     dev_lens,
     dev_preds) = utils_srl.load_srl_dataset(
        DEV_DATA_PATH,
        tokenizer,
        max_len=SEQ_MAX_LEN,
        include_labels=True,
        label2index=train_label2index,
    )
    # Same DataLoader construction as for the training split.
    dev_data = TensorDataset(dev_inputs, dev_masks, dev_labels, dev_preds)
    dev_sampler = RandomSampler(dev_data)
    dev_dataloader = DataLoader(
        dev_data,
        sampler=dev_sampler,
        batch_size=BATCH_SIZE,
    )


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [13]:
# ==========================================================================================
#              LOAD MODEL & OPTIMIZER
# ==========================================================================================
if RECOVER_CHECKPOINT:
    # Resume: reload model and tokenizer from the checkpoint saved for START_EPOCH.
    model, tokenizer = utils_srl.load_model(AutoModelForTokenClassification, BertTokenizer, f"{MODEL_DIR}/EPOCH_{START_EPOCH}")
else:
    # Fresh run: pretrained encoder plus a token-classification head with one
    # output per training label; record the label mappings in the model config.
    model = AutoModelForTokenClassification.from_pretrained(BERT_MODEL_NAME, num_labels=len(train_label2index))
    model.config.finetuning_task = 'token-classification'
    model.config.id2label = index2label
    model.config.label2id = train_label2index
# Put the model on the same device the training batches are moved to. A bare
# model.cuda() targets the current default CUDA device, which is not necessarily
# the GPU selected through GPU_RUN_IX; on a CPU device this call changes nothing.
model.to(device)

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * EPOCHS

# Create optimizer and the learning rate scheduler.
# NOTE(review): when resuming from a checkpoint the optimizer and scheduler are
# still created fresh here, so the learning-rate schedule restarts from the
# beginning -- confirm that this is intended.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)


Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [14]:
# ==========================================================================================
#                          TRAINING ...
# ==========================================================================================
# Mean training loss of every epoch, kept for plotting and written to disk at the end.
loss_values = []

for epoch_i in range(START_EPOCH+1, EPOCHS+1):
    # ---------------- one full pass over the training set ----------------
    print("")
    print(f'======== Epoch {epoch_i} / {EPOCHS} ========')
    print('Training...')

    t0 = time.time()
    total_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Each batch holds four tensors in TensorDataset order: input ids,
        # attention mask, labels and the predicate tensor (fed as token_type_ids).
        b_input_ids, b_input_mask, b_labels, b_predicates = (tensor.to(device) for tensor in batch)

        # Drop gradients left over from the previous step.
        model.zero_grad()

        # Forward pass; because labels are supplied, the first output is the loss.
        outputs = model(
            b_input_ids,
            token_type_ids=b_predicates,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss = outputs[0]
        total_loss += loss.item()

        # Backward pass, gradient-norm clipping, then parameter and LR-schedule updates.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        optimizer.step()
        scheduler.step()

        # Periodic progress report (never on the very first batch).
        if step % PRINT_INFO_EVERY == 0 and step != 0:
            elapsed = utils_srl.format_time(time.time() - t0)
            print(f'  Batch {step:>5,}  of  {len(train_dataloader):>5,}.    Elapsed: {elapsed}.    Loss: {loss.item()}.')

    # Epoch summary: mean loss over all training batches.
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("")
    print(f"  Average training loss: {avg_train_loss:.4f}")
    print(f"  Training Epoch took: {utils_srl.format_time(time.time() - t0)}")

    # ========================================
    #               Validation
    # ========================================
    if DEV_DATA_PATH:
        # Score the model on the development set once the epoch has finished.
        t0 = time.time()
        results, preds_list = utils_srl.evaluate_bert_model(
            dev_dataloader,
            BATCH_SIZE,
            model,
            tokenizer,
            index2label,
            PAD_TOKEN_LABEL_ID,
            prefix="Validation Set",
        )
        print(f"  Validation Loss: {results['loss']:.2f}")
        print(f"  Precision: {results['precision']*100:.2f} || Recall: {results['recall']*100:.2f} || F1: {results['f1']*100:.2f}")
        print(f"  Validation took: {utils_srl.format_time(time.time() - t0)}")


    # ================================================
    #               Save Checkpoint for this Epoch
    # ================================================
    utils_srl.save_model(f"{MODEL_DIR}/EPOCH_{epoch_i}", {"args":[]}, model, tokenizer)

# Persist the per-epoch training losses once all epochs are done.
utils_srl.save_losses(loss_values, filename=LOSS_FILENAME)

print("")
print("Training complete!")



Training...
  Batch   100  of    257.    Elapsed: 0:01:18.    Loss: 0.02754659578204155.
  Batch   200  of    257.    Elapsed: 0:02:37.    Loss: 0.024045070633292198.

  Average training loss: 0.0616
  Training Epoch took: 0:03:21




  Validation Loss: 0.02
  Precision: 74.35 || Recall: 67.54 || F1: 70.78
  Validation took: 0:00:09
Saving model to /content/drive/MyDrive/Colab Notebooks/bert4srl-master/saved_models_tuned/TRIAL_BERT_SRL/EPOCH_1

Training...
  Batch   100  of    257.    Elapsed: 0:01:20.    Loss: 0.021825041621923447.
  Batch   200  of    257.    Elapsed: 0:02:40.    Loss: 0.011068419553339481.

  Average training loss: 0.0164
  Training Epoch took: 0:03:24
  Validation Loss: 0.02
  Precision: 74.02 || Recall: 71.15 || F1: 72.56
  Validation took: 0:00:09
Saving model to /content/drive/MyDrive/Colab Notebooks/bert4srl-master/saved_models_tuned/TRIAL_BERT_SRL/EPOCH_2

Training...
  Batch   100  of    257.    Elapsed: 0:01:20.    Loss: 0.019524484872817993.
  Batch   200  of    257.    Elapsed: 0:02:40.    Loss: 0.01869874820113182.

  Average training loss: 0.0130
  Training Epoch took: 0:03:24
  Validation Loss: 0.03
  Precision: 71.50 || Recall: 70.90 || F1: 71.20
  Validation took: 0:00:09
Saving mo