In [2]:
from ast import literal_eval
import numpy as np
import pandas as pd

import torch
from datasets import load_dataset, load_metric, ClassLabel, Dataset, Features, Sequence, Value
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# Load Dataset

In [5]:
label_names = ["O", "B"]

In [6]:
data_path = "../data/fine-tuning/"

data_files = {"train": data_path + "mwb-texts_NER_train.tsv", "test": data_path + "mwb-texts_NER_test.tsv"}

label_names = ['O', 'B']
features = Features({"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=label_names))})
dataset = load_dataset("csv", data_files=data_files, features=features, delimiter="\t", converters={0: literal_eval, 1: literal_eval})

dataset

Generating train split: 0 examples [00:00, ? examples/s]

  for batch_idx, df in enumerate(csv_file_reader):
  for batch_idx, df in enumerate(csv_file_reader):


Generating test split: 0 examples [00:00, ? examples/s]

  for batch_idx, df in enumerate(csv_file_reader):
  for batch_idx, df in enumerate(csv_file_reader):


DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 600
    })
})

In [7]:
dataset["train"][0]

{'tokens': ['sulen',
  'ouch',
  'Glorîône',
  'und',
  'dem',
  'stolzen',
  'Fausabrê',
  'und',
  'dem',
  'künige',
  'Tampastê',
  'und',
  'dem',
  'herzogen',
  'Môrant',
  'benennen,',
  'daz',
  'der',
  'sehser',
  'hant',
  'vil'],
 'labels': [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]}

# Init Tokenizer from Base Model

In [8]:
base_model = "deepset/gbert-base"

tokenizer = AutoTokenizer.from_pretrained(base_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

# Fine-tune Model

In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [10]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 600
    })
})

In [11]:
data_collator = DataCollatorForTokenClassification(tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=BertTokenizerFast(name_or_path='deepset/gbert-base', vocab_size=31102, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

In [12]:
metric = load_metric('seqeval')

def compute_metrics(p):

    predictions, labels = p

    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric('seqeval')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [13]:
# init model
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in enumerate(label_names)}

model = AutoModelForTokenClassification.from_pretrained(
    base_model, num_labels=len(label_names), id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized

In [14]:
tokenized_dataset = tokenized_dataset.remove_columns(['tokens'])
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 600
    })
})

In [15]:
model_path = "drive/MyDrive/Colab Notebooks/models/gbert-base-mwb-NER"

training_args = TrainingArguments(
    output_dir=model_path + "_trainer",
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    optim="adamw_torch",
    load_best_model_at_end=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=.01,
    seed=0
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [16]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.025192,0.916399,0.966102,0.940594,0.991427
2,No log,0.020001,0.958333,0.961582,0.959955,0.994364
3,No log,0.019574,0.958097,0.955932,0.957014,0.993967


TrainOutput(global_step=450, training_loss=0.026549621158176, metrics={'train_runtime': 111.2685, 'train_samples_per_second': 64.708, 'train_steps_per_second': 4.044, 'total_flos': 218713550911680.0, 'train_loss': 0.026549621158176, 'epoch': 3.0})

In [17]:
trainer.save_model(model_path)

# Inspect Fine-tuned Model

In [18]:
print(model.config)
model

BertConfig {
  "_name_or_path": "deepset/gbert-base",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B": 1,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}



BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el