# Actual dataset: "conllpp" from the Hugging Face Hub
# Model used: "distilbert-base-cased"

In [1]:
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
!pip install transformers datasets tokenizers seqeval -q

In [3]:
import transformers
from transformers import AutoTokenizer
from transformers import TFAutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification



In [4]:
import tensorflow as tf

In [5]:
print(tf.version.VERSION)

2.12.0


In [6]:
import datasets

In [7]:
# Identifiers of the pretrained model and of the dataset used throughout the notebook.
model_checkpoint = "distilbert-base-cased"
dataset_checkpoint = "conllpp"

In [8]:
# Load the dataset identified by dataset_checkpoint.
data = datasets.load_dataset(dataset_checkpoint)

  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [10]:
# Slice the rows first and pick the column afterwards, so the whole "tokens"
# column does not have to be materialised just to look at two sentences.
data['train'][:2]['tokens']

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn']]

In [11]:
# Slice the rows first and pick the column afterwards, so the whole "ner_tags"
# column does not have to be materialised just to look at two sentences.
data['train'][:2]['ner_tags']

[[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]

In [12]:
# Feature object describing the NER tag vocabulary of the training split;
# its `.names` list is what the id/label lookups are built from.
ner_tags_names = data['train'].features['ner_tags'].feature
ner_tags_names

ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)

In [13]:
# Forward (id -> tag name) and reverse (tag name -> id) lookups over the NER tag names.
id2label = dict(enumerate(ner_tags_names.names))
label2id = {tag: idx for idx, tag in enumerate(ner_tags_names.names)}

In [14]:
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [15]:
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [16]:
ner_tags_names.int2str(3)

'B-ORG'

In [17]:
data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [18]:
# Load the tokenizer that belongs to the chosen model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [19]:
tokenizer.is_fast

True

In [20]:
# Index the row first, then the column: same value, without loading the full column.
data['train'][0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [21]:
# Tokenize the first training sentence; it is already split into words.
# Row-first indexing avoids materialising the entire "tokens" column.
tokens = tokenizer(data['train'][0]['tokens'], is_split_into_words=True)
tokens

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
tokens.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [23]:
tokens.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [24]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and a "ner_tags" entry (one list of integer tags per
            sentence, indexed by word position).

    Returns:
        The tokenizer output for the batch with an added "labels" entry
        holding one label list per sentence. The first token of every word
        carries that word's tag; tokens without a word id and the remaining
        tokens of a multi-token word are labelled -100.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for batch_index, word_tags in enumerate(examples["ner_tags"]):
        # Map tokens to their respective word.
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        # Word id of the token preceding each position (None before the first token).
        previous_word_ids = [None, *word_ids[:-1]]
        labels.append([
            # Only the first token of a word keeps the tag; everything else is -100.
            word_tags[word_id] if word_id is not None and word_id != previous_id else -100
            for word_id, previous_id in zip(word_ids, previous_word_ids)
        ])

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [25]:
# Run the alignment function over every split in batches and drop the original
# columns, leaving only what the function returns.
tokenized_data = data.map(tokenize_and_align_labels, batched=True, remove_columns=data['train'].column_names)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [26]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [27]:
tokenized_data['train'][0]

{'input_ids': [101,
  7270,
  22961,
  1528,
  1840,
  1106,
  21423,
  1418,
  2495,
  12913,
  119,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, -100, 0, -100]}

In [28]:
# Token-classification collator built around the tokenizer and configured to return TensorFlow tensors.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors='tf')

In [29]:
!pip install seqeval
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [30]:
import evaluate

In [31]:
# `datasets.load_metric` is deprecated; the `evaluate` package imported in the
# previous cell is its replacement and exposes the same `.compute(...)` interface.
metrics = evaluate.load('seqeval')

In [32]:
# Plain list of tag names, indexable by integer class id.
label_list = ner_tags_names.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [33]:
import numpy as np

# Decode the first training sentence's tag ids into tag names.
# NOTE(review): compute_metrics rebinds `labels` locally, so this module-level
# value does not appear to be read anywhere in the visible notebook.
labels = [label_list[i] for i in data['train'][0]["ner_tags"]]


def compute_metrics(p):
    """Score token-classification predictions with the loaded sequence metric.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with ``argmax`` over axis 2; ``labels``
            holds integer class ids, with -100 at positions to be skipped.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's corresponding "overall_*" entries.
    """
    class_scores, gold_ids = p
    predicted_ids = np.argmax(class_scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                # Skipped position: contributes to neither list.
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metrics.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [34]:
# Size the classification head from the label mapping instead of a hard-coded 9,
# so the head always matches the tag set read from the dataset.
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [35]:
model.config.num_labels

9

In [36]:
# Build the batched training dataset: shuffled, 32 examples per batch,
# batches assembled by data_collator.
tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)

# Validation dataset with the same batch size and collator, but not shuffled.
tf_validation_set = model.prepare_tf_dataset(
    tokenized_data["validation"],
    shuffle=False,
    batch_size=32,
    collate_fn=data_collator,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [37]:
from transformers import create_optimizer

# Training-length settings; batch_size repeats the value passed to prepare_tf_dataset.
batch_size = 32
num_train_epochs = 10
# Total optimizer steps: whole batches per epoch (floor division) times the number of epochs.
num_train_steps = (len(tokenized_data["train"]) // batch_size) * num_train_epochs
# create_optimizer returns the optimizer together with its learning-rate schedule.
# NOTE(review): lr_schedule does not appear to be used in the visible cells.
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [38]:
# Only an optimizer is supplied; no `loss` argument is passed.
# NOTE(review): presumably the model then uses its built-in loss — confirm against the transformers Keras docs.
model.compile(optimizer=optimizer)

In [39]:
from transformers.keras_callbacks import KerasMetricCallback

# Keras callback that evaluates compute_metrics on the validation dataset during training.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [40]:
callbacks = [metric_callback]

In [41]:
# Train for num_train_epochs, the same value used to size the learning-rate
# schedule, so the schedule length and the actual training length cannot drift apart.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_train_epochs, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7a9d80633c70>

In [42]:
# Write the fine-tuned model to the relative directory "ner_model".
model.save_pretrained("ner_model")

In [43]:
# Write the tokenizer files to the relative directory "tokens".
tokenizer.save_pretrained("tokens")

('tokens/tokenizer_config.json',
 'tokens/special_tokens_map.json',
 'tokens/vocab.txt',
 'tokens/added_tokens.json',
 'tokens/tokenizer.json')

In [44]:
# Same relative directory that model.save_pretrained wrote to, so the reload
# works from any working directory rather than only under /kaggle/working.
new_model_path = "ner_model"

In [45]:
# Load the tokenizer from the same relative directory that tokenizer.save_pretrained
# wrote to, instead of a hard-coded /kaggle/working path.
# Note: this rebinds `tokens`, which earlier held a tokenized example, to a tokenizer object.
tokens = AutoTokenizer.from_pretrained("tokens")
new_model = TFAutoModelForTokenClassification.from_pretrained(new_model_path)

Some layers from the model checkpoint at /kaggle/working/ner_model.keras were not used when initializing TFDistilBertForTokenClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at /kaggle/working/ner_model.keras and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
# Run the reloaded model on one raw sentence.
inputs = "The Golden State Warriors are an American professional basketball team based in San Francisco."
# The sentence is wrapped in a list (a batch of one) and encoded as TensorFlow tensors.
inp = tokens([inputs], return_tensors="tf")
# Per-token class scores produced by the model.
out = new_model(**inp).logits
out

<tf.Tensor: shape=(1, 17, 9), dtype=float32, numpy=
array([[[ 4.6105013 , -1.3055582 , -1.4893236 , -0.92955494,
         -0.5084179 , -0.29408753, -0.74029875,  0.24912687,
         -0.8286134 ],
        [ 8.356394  , -1.5863156 , -2.2982218 ,  0.3212613 ,
         -1.5005149 , -1.1988349 , -1.6290014 , -1.064625  ,
         -1.3584299 ],
        [-2.0273747 , -1.5981116 , -0.8203154 ,  7.372251  ,
          1.9549774 , -0.48061478, -1.332257  , -2.3426347 ,
         -2.6489344 ],
        [-0.7967034 , -1.9938397 , -0.07606144, -2.6698425 ,
          7.5475516 , -2.1429923 ,  1.257052  , -1.8631741 ,
         -0.06672986],
        [-1.2376592 , -2.2666426 , -0.7616418 ,  0.14858189,
          7.876708  , -2.480684  , -1.2166831 , -1.130952  ,
         -0.2276679 ],
        [ 9.0570965 , -1.9341048 , -1.9898739 , -1.8230867 ,
         -1.1187083 , -1.5660899 , -1.1242694 , -0.8198882 ,
         -0.9841922 ],
        [ 9.007788  , -2.002184  , -2.2335541 , -1.6296247 ,
         -1.56205

In [47]:
# Highest-scoring class id per token, then id -> tag name for the single input sentence.
predicted_token_class_ids = tf.math.argmax(out, axis=-1)
predicted_token_class = list(
    map(new_model.config.id2label.__getitem__, predicted_token_class_ids[0].numpy().tolist())
)
predicted_token_class

['O',
 'O',
 'B-ORG',
 'I-ORG',
 'I-ORG',
 'O',
 'O',
 'B-MISC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'I-LOC',
 'O',
 'O']

# Thanks