## Finetuning BERT

In [1]:
!pip install -q transformers accelerate tokenizers seqeval evaluate datasets

### Import Libraries

In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

In [3]:
import os
os.environ["WANDB_DISABLE"] = 'true'

In [4]:
from datasets import load_dataset
conll2003 = load_dataset("gaurav98095/ner-dataset")

In [5]:
conll2003.keys()

dict_keys(['train', 'validation', 'test'])

In [6]:
conll2003['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0],
 'chunk_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0],
 'ner_tags': [1, 0, 2, 0, 0, 0, 2, 0, 0]}

In [7]:
conll2003["train"].features

{'id': Value('string'),
 'tokens': List(Value('string')),
 'pos_tags': List(ClassLabel(names=['X'])),
 'chunk_tags': List(ClassLabel(names=['X'])),
 'ner_tags': List(ClassLabel(names=['O', 'B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']))}

In [8]:
conll2003["train"].features['ner_tags']

List(ClassLabel(names=['O', 'B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']))

### Load Tokenizer

In [9]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [10]:
tokenized_id = tokenizer(conll2003["train"][0]['tokens'],is_split_into_words=True)
print(tokenized_id)

{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [11]:
tokens=tokenizer.convert_ids_to_tokens(tokenized_id['input_ids'])
tokens

['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']

In [12]:
['O', 'B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']
[0,1,2,3,4,5,6,7,8]

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [13]:
def tokenize_and_align_labels(examples, label_all_tokens=True):

    #tokeinze ids
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []


    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token.

        previous_word_idx = None
        label_ids = []
        # Special tokens like `` and `<\s>` are originally mapped to None
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids:
            if word_idx is None:
                # set –100 as the label for these special tokens
                label_ids.append(-100)

            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                # mask the subword representations after the first subword

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [14]:
q=tokenize_and_align_labels(conll2003["train"][4:5])
q

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}

In [15]:
# {
#   "0": "O",
#   "1": "B-ORG",
#   "2": "B-MISC",
#   "3": "B-PER",
#   "4": "I-PER",
#   "5": "B-LOC",
#   "6": "I-ORG",
#   "7": "I-MISC",
#   "8": "I-LOC"
# }

In [16]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
germany_________________________________ 5
'_______________________________________ 0
s_______________________________________ 0
representative__________________________ 0
to______________________________________ 0
the_____________________________________ 0
european________________________________ 1
union___________________________________ 6
'_______________________________________ 0
s_______________________________________ 0
veterinary______________________________ 0
committee_______________________________ 0
werner__________________________________ 3
z_______________________________________ 4
##wing__________________________________ 4
##mann__________________________________ 4
said____________________________________ 0
on______________________________________ 0
wednesday_______________________________ 0
consumers_______________________________ 0
should__________________________________ 0
buy_____________________________________ 0
sheep___

In [17]:
tokenized_datasets = conll2003.map(
    tokenize_and_align_labels,
    batched=True
)

In [18]:
tokenized_datasets["train"][4]

{'id': '4',
 'tokens': ['Germany',
  "'s",
  'representative',
  'to',
  'the',
  'European',
  'Union',
  "'s",
  'veterinary',
  'committee',
  'Werner',
  'Zwingmann',
  'said',
  'on',
  'Wednesday',
  'consumers',
  'should',
  'buy',
  'sheepmeat',
  'from',
  'countries',
  'other',
  'than',
  'Britain',
  'until',
  'the',
  'scientific',
  'advice',
  'was',
  'clearer',
  '.'],
 'pos_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'chunk_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'ner_tags': [5,
  0,
  0,
  0,
  0,
  1,
  6,
  0,
  0,
  0,
  3,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [101,
  2762,
  1005,
  1055,
  4387,
  2000,
  1996,
  2647,
  2586,
  1005,
  1055,
  

### Load Model

In [19]:
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=9
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [39]:
label_list=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [40]:
import evaluate
metric = evaluate.load("seqeval")

Downloading builder script: 0.00B [00:00, ?B/s]

### Test

In [52]:
input=['EU','rejects','German','call','to','boycott','British','lamb','.']

actual_lable=['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

pred_lable=['B-ORG', 'O', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [53]:
metrics = metric.compute(predictions=[pred_lable],references=[actual_lable])
print(metrics['overall_precision'])
print(metrics['overall_f1'])
print(metrics['overall_accuracy'])

0.2
0.25
0.2222222222222222


### Finetune

In [56]:
from transformers import TrainingArguments, Trainer

#these are hyperparameter
args=TrainingArguments(
    "test-ner",
    eval_strategy='epoch',
    learning_rate=2e-5,

    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="none"  # Disable wandb logging
)

In [57]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [58]:
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds

    pred_logits = np.argmax(pred_logits, axis=2)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax

    # We remove all the values where the label is -100
    predictions = [
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
       for prediction, label in zip(pred_logits, labels)
    ]
    results = metric.compute(predictions=predictions, references=true_labels)

    return {
          "precision": results["overall_precision"],
          "recall": results["overall_recall"],
          "f1": results["overall_f1"],
          "accuracy": results["overall_accuracy"],
    }

In [59]:
trainer=Trainer(
   model,
   args,
   train_dataset=tokenized_datasets["train"],
   eval_dataset=tokenized_datasets["validation"],
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics
)

In [60]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2178,0.069739,0.920286,0.927777,0.924016,0.980842
2,0.0448,0.059595,0.934483,0.943437,0.938939,0.985244
3,0.0233,0.06109,0.940983,0.94457,0.942773,0.985915
4,0.014,0.064642,0.938499,0.949619,0.944026,0.986363
5,0.0098,0.065632,0.942174,0.950134,0.946137,0.98684


TrainOutput(global_step=4390, training_loss=0.05000546809481054, metrics={'train_runtime': 905.6235, 'train_samples_per_second': 77.521, 'train_steps_per_second': 4.847, 'total_flos': 1789594902451764.0, 'train_loss': 0.05000546809481054, 'epoch': 5.0})

In [61]:
model.save_pretrained("ner_fine_tuned_model")
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

### Update Config

In [62]:
import json
config = json.load(open("ner_fine_tuned_model/config.json"))

In [63]:
config

{'architectures': ['BertForTokenClassification'],
 'attention_probs_dropout_prob': 0.1,
 'classifier_dropout': None,
 'gradient_checkpointing': False,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'id2label': {'0': 'LABEL_0',
  '1': 'LABEL_1',
  '2': 'LABEL_2',
  '3': 'LABEL_3',
  '4': 'LABEL_4',
  '5': 'LABEL_5',
  '6': 'LABEL_6',
  '7': 'LABEL_7',
  '8': 'LABEL_8'},
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'label2id': {'LABEL_0': 0,
  'LABEL_1': 1,
  'LABEL_2': 2,
  'LABEL_3': 3,
  'LABEL_4': 4,
  'LABEL_5': 5,
  'LABEL_6': 6,
  'LABEL_7': 7,
  'LABEL_8': 8},
 'layer_norm_eps': 1e-12,
 'max_position_embeddings': 512,
 'model_type': 'bert',
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'pad_token_id': 0,
 'position_embedding_type': 'absolute',
 'torch_dtype': 'float32',
 'transformers_version': '4.54.0',
 'type_vocab_size': 2,
 'use_cache': True,
 'vocab_size': 30522}

In [64]:
conll2003["train"].features["ner_tags"].feature. names

['O', 'B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']

In [65]:
id2label = {str(i): label for i,label in enumerate(label_list)}
label2id = {label: str(i) for i,label in enumerate(label_list)}

In [66]:
id2label

{'0': 'O',
 '1': 'B-PER',
 '2': 'I-PER',
 '3': 'B-ORG',
 '4': 'I-ORG',
 '5': 'B-LOC',
 '6': 'I-LOC',
 '7': 'B-MISC',
 '8': 'I-MISC'}

In [67]:
label2id

{'O': '0',
 'B-PER': '1',
 'I-PER': '2',
 'B-ORG': '3',
 'I-ORG': '4',
 'B-LOC': '5',
 'I-LOC': '6',
 'B-MISC': '7',
 'I-MISC': '8'}

In [68]:
json.dump(config,open("ner_fine_tuned_model/config.json","w"))

### Test With Pipeline

In [133]:
from transformers import pipeline

model_path = "ner_fine_tuned_model"
tokenizer_path = "tokenizer"

In [134]:
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)
model_fine_tuned = AutoModelForTokenClassification.from_pretrained(model_path)

In [135]:
nlp_pipeline = pipeline(
    "ner",
    model = model_fine_tuned,
    tokenizer=tokenizer
)

Device set to use cuda:0


In [127]:
# {"O": "0",
#  "B-PER": "1",
#  "I-PER": "2",
#  "B-ORG": "3",
#  "I-ORG": "4",
#  "B-LOC": "5",
#  "I-LOC": "6",
#  "B-MISC": "7",
#  "I-MISC": "8"}

In [115]:
example="John is Data Scientist and Generative AI Engineer"

In [116]:
nlp_pipeline(example)

[{'entity': 'B-PER',
  'score': np.float32(0.98988336),
  'index': 1,
  'word': 'john',
  'start': 0,
  'end': 4}]

In [136]:
example2="apple launch mobile phone and i am eating apple."

In [137]:
nlp_pipeline(example2)

[{'entity': 'B-ORG',
  'score': np.float32(0.9983359),
  'index': 1,
  'word': 'apple',
  'start': 0,
  'end': 5},
 {'entity': 'B-MISC',
  'score': np.float32(0.84365207),
  'index': 9,
  'word': 'apple',
  'start': 42,
  'end': 47}]

### Push to HuggingFace Hub [Optional]

In [20]:
model_fine_tuned.push_to_hub("gaurav98095/NER-Model-Fine-Tuned")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpyy39oxta/model.safetensors    :   0%|          | 14.2kB /  436MB            

CommitInfo(commit_url='https://huggingface.co/gaurav98095/NER-Model-Fine-Tuned/commit/4525e66d6e700a9c7b09e7ccce291b1485e054bd', commit_message='Upload BertForTokenClassification', commit_description='', oid='4525e66d6e700a9c7b09e7ccce291b1485e054bd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/gaurav98095/NER-Model-Fine-Tuned', endpoint='https://huggingface.co', repo_type='model', repo_id='gaurav98095/NER-Model-Fine-Tuned'), pr_revision=None, pr_num=None)