## Dataset preparation

In [1]:
from datasets import load_dataset

dataset = load_dataset('conll2003')  # download a dataset

# now you can preview a few examples
dataset['test'].to_pandas().head(5)

Found cached dataset conll2003 (/home/emukans/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[21, 8, 22, 37, 22, 22, 6, 22, 15, 12, 21, 7]","[11, 0, 11, 21, 11, 12, 0, 11, 13, 11, 12, 0]","[0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,1,"[Nadim, Ladki]","[22, 22]","[11, 12]","[1, 2]"
2,2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[22, 6, 22, 22, 23, 11]","[11, 0, 11, 12, 12, 12]","[5, 0, 5, 6, 6, 0]"
3,3,"[Japan, began, the, defence, of, their, Asian,...","[22, 38, 12, 21, 15, 29, 16, 22, 21, 15, 12, 1...","[11, 21, 11, 12, 13, 11, 12, 12, 12, 13, 11, 1...","[5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,"[But, China, saw, their, luck, desert, them, i...","[10, 22, 38, 29, 21, 37, 28, 15, 12, 21, 21, 1...","[0, 11, 21, 11, 12, 21, 11, 13, 11, 12, 12, 13...","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [2]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [3]:
label_list = dataset["train"].features[f"pos_tags"].feature.names

In [4]:
example = dataset["train"][4]

tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"pos_tags"][i] for i in word_ids]
print(aligned_labels, tokenized_input["input_ids"])
print(len(aligned_labels), len(tokenized_input["input_ids"]))

['[CLS]', 'germany', "'", 's', 'representative', 'to', 'the', 'european', 'union', "'", 's', 'veterinary', 'committee', 'werner', 'z', '##wing', '##mann', 'said', 'on', 'wednesday', 'consumers', 'should', 'buy', 'sheep', '##me', '##at', 'from', 'countries', 'other', 'than', 'britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.', '[SEP]']
[-100, 22, 27, 27, 21, 35, 12, 22, 22, 27, 27, 16, 21, 22, 22, 22, 22, 38, 15, 22, 24, 20, 37, 21, 21, 21, 15, 24, 16, 15, 22, 15, 12, 16, 21, 38, 17, 7, -100] [101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]
39 39


In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"pos_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenize_and_align_labels(dataset['train'][:5])

{'input_ids': [[101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], [101, 2848, 13934, 102], [101, 9371, 2727, 1011, 5511, 1011, 2570, 102], [101, 1996, 2647, 3222, 2056, 2006, 9432, 2009, 18335, 2007, 2446, 6040, 2000, 10390, 2000, 18454, 2078, 2329, 12559, 2127, 6529, 5646, 3251, 5506, 11190, 4295, 2064, 2022, 11860, 2000, 8351, 1012, 102], [101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 22, 42, 16, 21, 35, 37, 16, 21, 7, -100], [-100, 22, 22, -100], [-1

In [6]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Loading cached processed dataset at /home/emukans/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-eaf3312fba82cd57.arrow
Loading cached processed dataset at /home/emukans/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-f9fca18b1e3311fd.arrow
Loading cached processed dataset at /home/emukans/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-2c0c351f8b46192b.arrow


## Model training

### Existing model loading

In [7]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [8]:
from transformers import TrainingArguments, Trainer

batch_size = 128
early_stopping_patience = 5

args = TrainingArguments(
    f"{model_checkpoint}-finetuned-pos",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [9]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [10]:
from datasets import load_metric

metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [11]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [32]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [17]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: pos_tags, ner_tags, id, chunk_tags, tokens. If pos_tags, ner_tags, id, chunk_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 330


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.619398,0.819528,0.827005,0.82325,0.8714
2,No log,0.44255,0.869425,0.871746,0.870584,0.900106
3,No log,0.416922,0.876825,0.882931,0.879868,0.905571


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: pos_tags, ner_tags, id, chunk_tags, tokens. If pos_tags, ner_tags, id, chunk_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 128
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: pos_tags, ner_tags, id, chunk_tags, tokens. If pos_tags, ner_tags, id, chunk_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forw

TrainOutput(global_step=330, training_loss=0.8422676780007102, metrics={'train_runtime': 88.5726, 'train_samples_per_second': 475.576, 'train_steps_per_second': 3.726, 'total_flos': 671518763395206.0, 'train_loss': 0.8422676780007102, 'epoch': 3.0})

In [18]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: pos_tags, ner_tags, id, chunk_tags, tokens. If pos_tags, ner_tags, id, chunk_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 128


{'eval_loss': 0.41692161560058594,
 'eval_precision': 0.8768252132427353,
 'eval_recall': 0.8829309717335921,
 'eval_f1': 0.8798675000302235,
 'eval_accuracy': 0.9055713536784914,
 'eval_runtime': 4.2722,
 'eval_samples_per_second': 760.731,
 'eval_steps_per_second': 6.086,
 'epoch': 3.0}

In [19]:
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: pos_tags, ner_tags, id, chunk_tags, tokens. If pos_tags, ner_tags, id, chunk_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3250
  Batch size = 128


{"'": {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 11},
 'B': {'precision': 0.7939240506329114,
  'recall': 0.8222338751966439,
  'f1': 0.8078310149407523,
  'number': 1907},
 'BD': {'precision': 0.926283457656867,
  'recall': 0.9491906474820144,
  'f1': 0.9375971574505886,
  'number': 2224},
 'BG': {'precision': 0.8772727272727273,
  'recall': 0.8283261802575107,
  'f1': 0.8520971302428255,
  'number': 699},
 'BN': {'precision': 0.8486238532110092,
  'recall': 0.7974137931034483,
  'f1': 0.8222222222222222,
  'number': 928},
 'BP': {'precision': 0.8672566371681416,
  'recall': 0.8054794520547945,
  'f1': 0.8352272727272728,
  'number': 365},
 'BR': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 53},
 'BS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 18},
 'BZ': {'precision': 0.9166666666666666,
  'recall': 0.9076620825147348,
  'f1': 0.912142152023692,
  'number': 509},
 'C': {'precision': 0.9967845659163987,
  'recall': 0.9978540772532188,
  'f1': 0.99731

### Training own adapter layer

In [12]:
from transformers import AutoAdapterModel, AdapterConfig


model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

adapter_config = AdapterConfig.load("pfeiffer")

model.add_adapter("pos", config=adapter_config)
model.set_active_adapters("pos")
model.train_adapter("pos")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [13]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
)

In [14]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: chunk_tags, id, ner_tags, tokens, pos_tags. If chunk_tags, id, ner_tags, tokens, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 330


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,3.280522,0.061359,0.038069,0.046986,0.209351
2,No log,2.6786,0.079088,0.025767,0.03887,0.246858
3,No log,2.558509,0.119166,0.039646,0.059497,0.268734


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: chunk_tags, id, ner_tags, tokens, pos_tags. If chunk_tags, id, ner_tags, tokens, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 128
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: chunk_tags, id, ner_tags, tokens, pos_tags. If chunk_tags, id, ner_tags, tokens, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forw

TrainOutput(global_step=330, training_loss=3.0687884706439394, metrics={'train_runtime': 63.9031, 'train_samples_per_second': 659.17, 'train_steps_per_second': 5.164, 'total_flos': 678574955012550.0, 'train_loss': 3.0687884706439394, 'epoch': 3.0})

In [15]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: chunk_tags, id, ner_tags, tokens, pos_tags. If chunk_tags, id, ner_tags, tokens, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 128


{'eval_loss': 2.558509111404419,
 'eval_precision': 0.1191656942823804,
 'eval_recall': 0.03964576003882082,
 'eval_f1': 0.059497150763740965,
 'eval_accuracy': 0.2687340143295153,
 'eval_runtime': 4.5533,
 'eval_samples_per_second': 713.775,
 'eval_steps_per_second': 5.71,
 'epoch': 3.0}

In [16]:
trainer.evaluate(tokenized_datasets["test"])

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: chunk_tags, id, ner_tags, tokens, pos_tags. If chunk_tags, id, ner_tags, tokens, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3453
  Batch size = 128


{'eval_loss': 2.3747849464416504,
 'eval_precision': 0.1593105437761431,
 'eval_recall': 0.06172115115954177,
 'eval_f1': 0.08897212824230707,
 'eval_accuracy': 0.34509011808576756,
 'eval_runtime': 4.3058,
 'eval_samples_per_second': 801.934,
 'eval_steps_per_second': 6.271,
 'epoch': 3.0}

In [17]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: chunk_tags, id, ner_tags, tokens, pos_tags. If chunk_tags, id, ner_tags, tokens, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3453
  Batch size = 128


{"'": {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 14},
 'B': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1625},
 'BD': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1687},
 'BG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 480},
 'BN': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 816},
 'BP': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 331},
 'BR': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 43},
 'BS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 9},
 'BZ': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 501},
 'C': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 765},
 'D': {'precision': 0.171667965705378,
  'recall': 0.25558456628952714,
  'f1': 0.2053852430353188,
  'number': 3447},
 'DT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 115},
 'H': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 7},
 'J': {'precision': 0.020833333333333332,
  'recall': 0.00045167118337850