<a href="https://colab.research.google.com/github/dMeVdok/experiments/blob/master/transformers_ner_transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate seqeval

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [None]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [None]:
import torch

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from datasets import load_dataset, Dataset

# Load the pretrained NER checkpoint together with its matching tokenizer.
checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Work on a private copy of the base model's label map so that adding new
# tags later does not mutate the loaded model's config.
label2id = dict(model.config.label2id)

In [None]:
# Sanity-check the base model on a short sentence before fine-tuning.
example = "I am in Ottawa"
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-LOC', 'score': 0.9987392, 'index': 4, 'word': 'Ottawa', 'start': 8, 'end': 14}]


In [None]:
from google.colab import files

# Ask the user for a local file; the result maps each filename to its content.
uploaded = files.upload()

for fn, payload in uploaded.items():
  print(f'User uploaded file "{fn}" with length {len(payload)} bytes')

Saving Corona2.json to Corona2.json
User uploaded file "Corona2.json" with length 149564 bytes


In [None]:
import json
from tqdm.notebook import tqdm

# Parse the uploaded annotation export into Python objects.
corona_payload = uploaded['Corona2.json']
corona = json.loads(corona_payload)

In [None]:
def _iter_words_with_offsets(text):
    """Yield ``(word, start, end)`` for each whitespace-delimited word of ``text``.

    ``start`` and ``end`` are character offsets into ``text`` (``end`` exclusive).
    """
    cursor = 0
    for word in text.split():
        # split() yields words in document order, so searching from the end of
        # the previous word finds this occurrence, not an earlier duplicate.
        start = text.index(word, cursor)
        cursor = start + len(word)
        yield word, start, cursor


def _tag_words(text, annotations):
    """Split ``text`` into words and tag each one from the annotated spans.

    A word receives ``'I-' + tag_name`` of the first annotation whose
    ``[start, end)`` character span overlaps the word, otherwise ``'O'``.
    Matching by offsets (instead of comparing a word to the whole entity text)
    lets multi-word entities and entities followed by punctuation be tagged,
    and keeps identical words outside an annotated span untagged.

    Args:
        text: The example's raw content string.
        annotations: Iterable of dicts with ``'start'``, ``'end'`` and
            ``'tag_name'`` keys; offsets index into ``text``.

    Returns:
        ``(words, tags)``: two lists of equal length.
    """
    words, tags = [], []
    for word, word_start, word_end in _iter_words_with_offsets(text):
        tag = 'O'
        for annotation in annotations:
            if word_start < annotation['end'] and annotation['start'] < word_end:
                tag = 'I-' + annotation['tag_name']
                break
        words.append(word)
        tags.append(tag)
    return words, tags


data_texts = []
data_tags = []

for example in tqdm(corona['examples']):
    words, tags = _tag_words(example['content'], example['annotations'])
    for t in tags:
        # Register tags the base model does not know yet, using the next free id.
        if t not in label2id:
            label2id[t] = max(label2id.values()) + 1
    data_texts.append(words)
    data_tags.append(tags)

# Inverse mapping (id -> tag name), built once the label set is final.
id2label = {idx: name for name, idx in label2id.items()}

# Column-oriented dataset: word lists alongside their integer tag ids.
data = {
    'tokens': list(data_texts),
    'ner_tags': [[label2id[t] for t in tags] for tags in data_tags],
}

  0%|          | 0/31 [00:00<?, ?it/s]

In [None]:
# Display the label map, including any tags added while building the dataset.
label2id

{'B-LOC': 7,
 'B-MISC': 1,
 'B-ORG': 5,
 'B-PER': 3,
 'I-LOC': 8,
 'I-MISC': 2,
 'I-ORG': 6,
 'I-PER': 4,
 'O': 0,
 'I-Medicine': 9,
 'I-MedicalCondition': 10,
 'I-Pathogen': 11}

In [None]:
# Reference (Hugging Face NLP course, token classification):
# https://huggingface.co/learn/nlp-course/chapter7/2

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
# Wrap the 'tokens' / 'ner_tags' columns collected in `data` as a Dataset.
ft_dataset = Dataset.from_dict(data)

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Sequence of labels, indexed by word position.
        word_ids: For each token, the index of the word it belongs to, or
            ``None`` for a token that belongs to no word.

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` where the
        word id is ``None``, otherwise ``labels[word_id]``. Every token of a
        word therefore carries that word's label.
    """
    return [
        -100 if word_id is None else labels[word_id]
        for word_id in word_ids
    ]

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach per-token labels.

    Args:
        examples: Batch mapping with a "tokens" entry (word lists) and an
            "ner_tags" entry (matching word-level label lists).

    Returns:
        The tokenizer output with an added "labels" entry: for each example,
        its word labels expanded by ``align_labels_with_tokens``.
    """
    encodings = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    encodings["labels"] = [
        align_labels_with_tokens(word_labels, encodings.word_ids(idx))
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encodings

In [None]:
# Tokenize the whole dataset in batches. The original columns are dropped so
# only what tokenize_and_align_labels returns (tokenizer outputs + "labels")
# remains.
ft_dataset_tokenized = ft_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=ft_dataset.column_names
)

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification, built from the same tokenizer;
# handed to the Trainer as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
import evaluate

# Load the "seqeval" metric through the evaluate library; compute_metrics uses it.
metric = evaluate.load("seqeval")

In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Score predictions with the loaded metric, skipping ignored positions.

    Args:
        eval_preds: A ``(logits, labels)`` pair. Positions whose label is
            ``-100`` are excluded from both references and predictions.

    Returns:
        Dict with the metric's overall "precision", "recall", "f1" and
        "accuracy".
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Gold label names, with ignored (-100) positions dropped.
    true_labels = []
    for gold_row in labels:
        true_labels.append([id2label[g] for g in gold_row if g != -100])

    # Predicted label names at exactly the positions kept above.
    true_predictions = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        kept = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id != -100:
                kept.append(id2label[pred_id])
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=scores["overall_precision"],
        recall=scores["overall_recall"],
        f1=scores["overall_f1"],
        accuracy=scores["overall_accuracy"],
    )

In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration: evaluate and checkpoint once per epoch, and keep
# all artifacts local (no Hub upload).
args = TrainingArguments(
    output_dir="bert-finetuned-ner",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
)

In [None]:
# Hold out 30% of the examples for evaluation. The fixed seed keeps the
# train/test membership, and therefore the reported metrics, the same on
# every Restart & Run All.
ft_dataset_tokenized_splitted = ft_dataset_tokenized.train_test_split(
    test_size=0.3, seed=42
)

In [None]:
# Reload the checkpoint with the extended label maps. The label set is larger
# than the checkpoint's classifier head, so ignore_mismatched_sizes=True is
# needed for loading to proceed; the saved output below reports the classifier
# parameters as newly initialized because of the shape mismatch.
model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER",
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([12]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768])

In [None]:
from transformers import Trainer

# Wire the model, training arguments, train/test splits, collator and metric
# function into a Trainer, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ft_dataset_tokenized_splitted["train"],
    eval_dataset=ft_dataset_tokenized_splitted["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored