<a href="https://colab.research.google.com/github/goelnikhils-lgtm/languagemodels/blob/main/TokenClassificationTask_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#https://www.youtube.com/watch?v=dzyDHMycx_c&t=1s

In [None]:
#code for token classification
!pip install transformers  tokenizers seqeval
!pip install datasets
!pip install evaluate

In [30]:
#Token Classification
#NER - B-PER / I-PER mark the beginning / inside of a person entity, and similarly for location and other entity types. "O" indicates that the token represents no entity
#Part of Speech Tagging
#Chunking

In [31]:
import datasets
import numpy as np
from datasets import load_dataset
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from tokenizers import Tokenizer

In [None]:
# Fetch the CoNLL-2003 NER corpus (all splits) through the directly imported loader.
conll2003 = load_dataset("conll2003")

In [33]:
# Load the pretrained 'bert-base-uncased' fast tokenizer. Later cells rely on
# `.word_ids()` of its output to map sub-word tokens back to their source words.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Inspect a single training row to see how word-level tokens map to word pieces.
example_text = conll2003['train'][0]
# The row is already split into words, hence is_split_into_words=True.
tokenized_input = tokenizer(example_text['tokens'], is_split_into_words=True) #tokens
# Turn the ids back into token strings to see the pieces the tokenizer produced.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
# One entry per produced token; the alignment function below treats None entries
# as special tokens.
word_ids = tokenized_input.word_ids()

In [35]:
def tokenize_and_align_labels(example , label_all_tokens = True):
  """Tokenize a batch of pre-split sentences and align their NER tags to the produced tokens.

  Args:
    example: a batch with a 'tokens' column (one list of words per row) and an
      'ner_tags' column (one list of tag ids per row).
    label_all_tokens: when truthy, every sub-word piece of a word receives that
      word's tag; otherwise only the first piece does and the rest get -100.

  Returns:
    The tokenizer output with an added 'labels' entry holding one aligned list
    of label ids per row. Tokens without a source word (word id None) get -100.
  """
  encoding = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
  aligned = []
  for row, word_tags in enumerate(example['ner_tags']):
    row_labels = []
    last_word = None
    for word in encoding.word_ids(batch_index=row):
      if word is None:
        # no source word (special token): -100 so it is ignored downstream
        tag = -100
      elif word == last_word and not label_all_tokens:
        # continuation piece of the same word, and only first pieces are labelled
        tag = -100
      else:
        tag = word_tags[word]
      row_labels.append(tag)
      last_word = word
    aligned.append(row_labels)
  encoding['labels'] = aligned
  return encoding

In [None]:
# Smoke-test the alignment on a one-row slice: the function iterates over
# example['ner_tags'] as a batch, so a slice rather than a single row is passed.
q = tokenize_and_align_labels(conll2003['train'][4:5])

In [None]:
# Show every token of the sample row next to the label id aligned to it.
sample_ids, sample_labels = q['input_ids'][0], q['labels'][0]
for token , label in zip(tokenizer.convert_ids_to_tokens(sample_ids), sample_labels):
  print((token, label))

In [None]:
# batched=True hands the function batches of rows, which it expects
# (it indexes word_ids by batch_index).
tokenized_dataset = conll2003.map(tokenize_and_align_labels,batched=True) #apply tokenize and align labels across dataset

In [None]:
# Derive the classification head size and the id <-> tag-name maps from the
# dataset itself instead of hard-coding 9, so the model config always matches
# the tag set of the loaded data (PER, LOC etc).
ner_label_names = conll2003['train'].features['ner_tags'].feature.names
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(ner_label_names),
    id2label=dict(enumerate(ner_label_names)),
    label2id={name: index for index, name in enumerate(ner_label_names)},
)


In [38]:
from transformers import TrainingArguments , Trainer
# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch, and the best checkpoint is restored when training finishes.
args = TrainingArguments(
    output_dir = 'bert-finetuned-ner',
    # optimisation
    num_train_epochs = 3,
    learning_rate = 2e-5,
    weight_decay = 0.01,
    # batching
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    # evaluation / checkpoint cadence
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
)

In [None]:
import evaluate # Import the evaluate library
# Collator that batches the tokenized examples for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval") # Use evaluate.load instead
# Tag names in id order; used to turn label ids into the strings seqeval expects.
label_list = conll2003['train'].features['ner_tags'].feature.names
# Sanity check: scoring one example's gold tags against themselves. The row is
# fetched here and used directly, rather than relying on a variable left over
# from an earlier exploratory cell.
example = conll2003['train'][0]
labels = [label_list[i] for i in example["ner_tags"]]
metric.compute(predictions=[labels], references=[labels])

# Function to compute metrics
def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: a ``(logits, labels)`` pair. ``logits`` is indexed as
            (sequence, token, class) and ``labels`` as (sequence, token);
            positions whose label is -100 are excluded from scoring.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the overall_* entries of the metric result.
    """
    pred_logits, labels = eval_preds
    pred_ids = np.argmax(pred_logits, axis=2)

    predictions = []
    true_labels = []
    for pred_row, label_row in zip(pred_ids, labels):
        # Keep only scored positions, and keep predictions and references
        # aligned as one list of tag names per sequence.
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[l] for _, l in kept])

    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Wire the model, training configuration, tokenized splits, collator and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the settings configured above.
trainer.train()
# Save the model and the tokenizer into the same directory; later cells reload
# the model from this path.
model.save_pretrained('bert-finetuned-ner')
tokenizer.save_pretrained('bert-finetuned-ner')

In [None]:
# Label maps that get written into the saved config.json so predictions carry
# tag names. id2label keys are strings because JSON object keys are always
# strings; label2id values are integer class indices (not strings).
id2label = {str(index): name for index, name in enumerate(label_list)}
label2id = {name: index for index, name in enumerate(label_list)}


In [None]:
import json
# Patch the label maps into the saved config. Context managers guarantee the
# file is flushed and closed before the model is reloaded from the directory.
config_path = 'bert-finetuned-ner/config.json'
with open(config_path) as config_file:
    config = json.load(config_file)
config['id2label'] = id2label
config['label2id'] = label2id
with open(config_path, 'w') as config_file:
    json.dump(config, config_file)
# from_pretrained reads the patched config.json from the directory itself. The
# plain dict above is not a config object, so it is not passed as `config=`.
model_fine_tuned = AutoModelForTokenClassification.from_pretrained('bert-finetuned-ner')

In [None]:
from transformers import pipeline
# Wrap the reloaded model and the tokenizer in an NER inference pipeline.
ner = pipeline('ner',model=model_fine_tuned,tokenizer=tokenizer)
# NOTE: rebinds `example`, which an earlier cell used for a dataset row.
example = "Bill Gates is from Microsoft"
ner_results = ner(example)
print(ner_results)