#Reference:
https://github.com/rohan-paul/MachineLearning-DeepLearning-Code-for-my-YouTube-Channel/blob/master/NLP/YT_Fine_tuning_BERT_NER_v1.ipynb

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class SentenceGetter(object):
    
    def __init__(self, filenames_list):
        self.sentences = []
        self.words = set()
        self.tags = set()
        for filename in filenames_list:
          with open(filename) as f:
              sentence = []
              for line in f:
                  line = line.strip()
                  if (len(line) == 0 or line.startswith("-DOCSTART-") or line.startswith("......")):
                      if len(sentence) != 0:
                        self.sentences.append(sentence)
                        sentence = []
                      continue
                  else:
                      ls = line.split(' ')
                      word, tag = ls[0],ls[3]
                      self.words.add(word)
                      self.tags.add(tag)
                      sentence.append((word,tag))

In [None]:
#Train
files = ['/content/drive/MyDrive/MasterThesis/coNLL/train.txt']
getter = SentenceGetter(files)
sentences = getter.sentences
words = getter.words
tags = list(getter.tags)

In [None]:
#Valid
files_valid = ['/content/drive/MyDrive/MasterThesis/coNLL/valid.txt']
getter_valid = SentenceGetter(files_valid)
sentences_valid = getter_valid.sentences

In [None]:
#Test
files_test = ['/content/drive/MyDrive/MasterThesis/coNLL/test.txt']
getter_test = SentenceGetter(files_test)
sentences_test = getter_test.sentences

In [None]:
tags

['B-ORG', 'I-MISC', 'O', 'I-ORG', 'B-PER', 'B-LOC', 'B-MISC', 'I-LOC', 'I-PER']

In [None]:
from future.utils import iteritems


tag2idx = {t: i for i, t in enumerate(tags)}

idx2tag = {v: k for k, v in iteritems(tag2idx)}

In [None]:
sentences[0:2]

[[('EU', 'B-ORG'),
  ('rejects', 'O'),
  ('German', 'B-MISC'),
  ('call', 'O'),
  ('to', 'O'),
  ('boycott', 'O'),
  ('British', 'B-MISC'),
  ('lamb', 'O'),
  ('.', 'O')],
 [('Peter', 'B-PER'), ('Blackburn', 'I-PER')]]

In [None]:
tag2idx

{'B-ORG': 0,
 'I-MISC': 1,
 'O': 2,
 'I-ORG': 3,
 'B-PER': 4,
 'B-LOC': 5,
 'B-MISC': 6,
 'I-LOC': 7,
 'I-PER': 8}

In [None]:
#sentance tag seperator
def sentence_tag_seperator(sentence_list, tags_dict):
  tokens = []
  tags = []
  for sentence in sentence_list:
    inner_sentence_words = []
    inner_sentence_tags = []
    for word in sentence:
      inner_sentence_words.append(word[0])
      inner_sentence_tags.append(tag2idx[word[1]])
    tokens.append(inner_sentence_words)
    tags.append(inner_sentence_tags)

  return {"tokens" : tokens,
          "tags" : tags}

In [None]:
test = sentence_tag_seperator(sentences[0:2],tag2idx)

In [None]:
test

{'tokens': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn']],
 'tags': [[0, 2, 6, 2, 2, 2, 6, 2, 2], [4, 8]]}

In [None]:
#Train data
train_data = sentence_tag_seperator(sentences,tag2idx)

In [None]:
#Valid data
valid_data = sentence_tag_seperator(sentences_valid,tag2idx)

In [None]:
#Test data
test_data = sentence_tag_seperator(sentences_test,tag2idx)

In [None]:
test_data.keys()

dict_keys(['tokens', 'tags'])

In [None]:
!pip install transformers datasets tokenizers seqeval -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m45.1 MB/s[0m e

In [None]:
import numpy as np
import datasets 
from transformers import BertTokenizerFast
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers.data.data_collator import DataCollatorForTokenClassification

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
train_data['tokens'][0]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [None]:
def tokenize_and_align_labels(example_data, label_all_tokens = True):
  tokenized_inputs = tokenizer(example_data['tokens'], is_split_into_words=True, truncation=True)
  labels = []

  for i, lable in enumerate(example_data['tags']):
    word_ids = tokenized_inputs.word_ids(batch_index=i)

    previous_word_idx = None

    label_ids = []

    for word_idx in word_ids:
      if word_idx is None:
        label_ids.append(-100)
      elif word_idx != previous_word_idx:
        label_ids.append(lable[word_idx])
      else:
        label_ids.append(lable[word_idx] if label_all_tokens else -100)
      
      previous_word_idx = word_idx
    labels.append(label_ids)
  tokenized_inputs["labels"] = labels
  tokenized_inputs["tokens"] = example_data['tokens']
  tokenized_inputs["tags"] = example_data['tags']
  return tokenized_inputs

In [None]:
#Final tokenized train data
tokenized_dataset_train = tokenize_and_align_labels(train_data)
#convert to apache arrow Datasets to train the model
tokenized_dataset_train = datasets.Dataset.from_dict(tokenized_dataset_train)

In [None]:
tokenized_dataset_train[0]

{'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 0, 2, 6, 2, 2, 2, 6, 2, 2, -100],
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'tags': [0, 2, 6, 2, 2, 2, 6, 2, 2]}

In [None]:
#Final tokenized valid data
tokenized_dataset_valid = tokenize_and_align_labels(valid_data)
tokenized_dataset_valid = datasets.Dataset.from_dict(tokenized_dataset_valid)

In [None]:
#Final tokenized test data
tokenized_dataset_test = tokenize_and_align_labels(test_data)
tokenized_dataset_test = datasets.Dataset.from_dict(tokenized_dataset_test)

In [None]:
tokenized_dataset_test

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'tokens', 'tags'],
    num_rows: 3453
})

In [None]:
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', num_labels = 9)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [None]:
print(model)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
args = TrainingArguments(
    '/content/drive/MyDrive/MasterThesis/Bert/test-ner',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from transformers.data.data_collator import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
metric = datasets.load_metric("seqeval") 

In [None]:
def compute_metrics(eval_preds): 
    pred_logits, labels = eval_preds 
    
    pred_logits = np.argmax(pred_logits, axis=2) 
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax
    
    # We remove all the values where the label is -100
    predictions = [ 
        [tags[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100] 
        for prediction, label in zip(pred_logits, labels) 
    ] 
    
    true_labels = [ 
      [tags[l] for (eval_preds, l) in zip(prediction, label) if l != -100] 
       for prediction, label in zip(pred_logits, labels) 
   ] 
    results = metric.compute(predictions=predictions, references=true_labels) 
    return { 
   "precision": results["overall_precision"], 
   "recall": results["overall_recall"], 
   "f1": results["overall_f1"], 
  "accuracy": results["overall_accuracy"], 
  } 

In [None]:
trainer = Trainer( 
    model, 
    args, 
   train_dataset=tokenized_dataset_train, 
   eval_dataset=tokenized_dataset_valid, 
   data_collator=data_collator, 
   tokenizer=tokenizer, 
   compute_metrics=compute_metrics 
) 

In [None]:
trainer.train() 

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2634
  Number of trainable parameters = 108898569


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0516,0.057518,0.930271,0.940262,0.93524,0.98521
2,0.0346,0.056292,0.935046,0.94854,0.941745,0.986211
3,0.0177,0.057557,0.938985,0.95033,0.944624,0.986941


Saving model checkpoint to /content/drive/MyDrive/Shreya/MasterThesis/Bert/test-ner/checkpoint-500
Configuration saved in /content/drive/MyDrive/Shreya/MasterThesis/Bert/test-ner/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Shreya/MasterThesis/Bert/test-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Shreya/MasterThesis/Bert/test-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Shreya/MasterThesis/Bert/test-ner/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Shreya/MasterThesis/Bert/test-ner/checkpoi

TrainOutput(global_step=2634, training_loss=0.03486428273475432, metrics={'train_runtime': 502.1905, 'train_samples_per_second': 83.879, 'train_steps_per_second': 5.245, 'total_flos': 1021316467278600.0, 'train_loss': 0.03486428273475432, 'epoch': 3.0})

In [None]:
predictions, label_ids, metrics  = trainer.predict(test_dataset = tokenized_dataset_test)

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3453
  Batch size = 16


In [None]:
metrics

{'test_loss': 0.13817374408245087,
 'test_precision': 0.8923745740806016,
 'test_recall': 0.9013766911939236,
 'test_f1': 0.8968530436322844,
 'test_accuracy': 0.9759685104619846,
 'test_runtime': 9.4814,
 'test_samples_per_second': 364.187,
 'test_steps_per_second': 22.781}

In [None]:
i = np.random.randint(0,tokenized_dataset_test.shape[0]) # choose a random number between 0 and len(X_te)
p, l, m = trainer.predict([tokenized_dataset_test[i]])
p = np.argmax(p, axis=-1)

true = tokenized_dataset_test[i]['tags']

print("Sample number {} of {} (Test Set)".format(i, tokenized_dataset_test.shape[0]))
# Visualization
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")

for w, t, pred in zip(tokenized_dataset_test[i]['tokens'], true, p[0][1:(len(p[0])-1)]):
    if w != 0:
        print("{:15}: {:5} {}".format(w, idx2tag[t], idx2tag[pred]))

***** Running Prediction *****
  Num examples = 1
  Batch size = 16
The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.


  _warn_prf(average, modifier, msg_start, len(result))


Sample number 3174 of 3453 (Test Set)
Word           ||True ||Pred
Former         : O     O
Australia      : B-LOC B-LOC
test           : O     O
batsman        : O     O
Dean           : B-PER B-PER
Jones          : I-PER I-PER
hit            : O     O
an             : O     O
unbeaten       : O     O
130            : O     O
to             : O     O
lead           : O     O
Victoria       : B-LOC B-LOC
's             : O     O
fightback      : O     O
in             : O     O
their          : O     O
Sheffield      : B-MISC O
Shield         : I-MISC O
match          : O     B-MISC
against        : O     I-MISC
Tasmania       : B-ORG O
on             : O     O
Saturday       : O     B-LOC
.              : O     O


In [None]:
model.save_pretrained("/content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_model")

Configuration saved in /content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_model/config.json
Model weights saved in /content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_model/pytorch_model.bin


In [None]:
tokenizer.save_pretrained("/content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_tokenizer")

tokenizer config file saved in /content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_tokenizer/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_tokenizer/special_tokens_map.json


('/content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_tokenizer/vocab.txt',
 '/content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_tokenizer/added_tokens.json',
 '/content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_tokenizer/tokenizer.json')

In [None]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_model")

loading configuration file /content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_model/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Shreya/MasterThesis/Bert/conll_ner_model",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "I-ORG",
    "1": "B-LOC",
    "2": "B-MISC",
    "3": "B-PER",
    "4": "O",
    "5": "I-MISC",
    "6": "I-PER",
    "7": "I-LOC",
    "8": "B-ORG"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": "1",
    "B-MISC": "2",
    "B-ORG": "8",
    "B-PER": "3",
    "I-LOC": "7",
    "I-MISC": "5",
    "I-ORG": "0",
    "I-PER": "6",
    "O": "4"
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_

In [None]:
from transformers import pipeline

In [None]:
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)


example = "Bill Gates is the Founder of Microsoft"

ner_results = nlp(example)

# print(ner_results)

print("{:15}{:15}{}".format("Word", "Pred-Tag", "Score"))

for result in ner_results:
  print("{:15} {:15} {}".format(result['word'], result['entity'], result['score']))

Word           Pred-Tag       Score
bill            B-PER           0.9977179765701294
gates           I-PER           0.9968332648277283
microsoft       B-ORG           0.9755874276161194
