<a href="https://colab.research.google.com/github/gupta24789/hugging-face/blob/main/06_fine_tuning_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

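## Token Classification

Fine-tune the `dslim/distilbert-NER` checkpoint for named-entity recognition on the `sg247/ner` dataset with the Hugging Face `Trainer`, then evaluate it on the test split and run a sample prediction.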

In [None]:
# Start from a clean slate: delete the Trainer output directory (`checkpoints_logs`)
# and any `logs` / `mlruns` directories before training.
!rm -rf checkpoints_logs logs mlruns

In [None]:
import os

# Process-level switches, applied in one go:
#   CUDA_VISIBLE_DEVICES   - expose only the CUDA device with index 1 to this process.
#   TOKENIZERS_PARALLELISM - "0" turns off the tokenizers library's internal parallelism.
# NOTE(review): on a single-GPU host (e.g. the Colab runtime linked at the top of the
# notebook) device index 1 presumably does not exist, which would hide the GPU and
# push training onto the CPU - confirm before running there.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': "1",
    'TOKENIZERS_PARALLELISM': "0",
})

In [None]:
import ast
import itertools
from pprint import pprint

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForTokenClassification, DataCollatorForTokenClassification

## Load Data

In [None]:
# Load the NER dataset by its repository id; the result is a DatasetDict with
# train / validation / test splits, each holding 'sentence' and 'labels' columns.
dataset = load_dataset("sg247/ner")
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 33570
    })
    validation: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 7194
    })
    test: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 7194
    })
})

## Transform Data

In [None]:
def _parse_literal(value):
    """Turn a stringified Python list into a real list; pass through parsed values."""
    # `ast.literal_eval` accepts only Python literals, so text coming from the
    # downloaded dataset can never execute code (which `eval` would allow).
    # The isinstance guard makes the cell safe to re-run after the columns have
    # already been converted to lists.
    return ast.literal_eval(value) if isinstance(value, str) else value


# Both columns are stored as strings such as "['Thousands', 'of', ...]"; convert
# them to lists of words / lists of tag strings.
dataset = dataset.map(
    lambda x: {"sentence": _parse_literal(x['sentence']), "labels": _parse_literal(x['labels'])}
)
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 33570
    })
    validation: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 7194
    })
    test: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 7194
    })
})

In [None]:
# Inspect one parsed training row: parallel lists of words and tag strings.
pprint(dataset['train'][0], compact = True)

{'labels': ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O',
            'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O',
            'O'],
 'sentence': ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through',
              'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and',
              'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from',
              'that', 'country', '.']}


In [None]:
# Collect the tag vocabulary from the training split.
# Sorting makes the label <-> id mapping deterministic: the iteration order of a
# set of strings changes between Python processes (hash randomisation), so an
# unsorted list would assign different ids after every kernel restart and make
# previously saved checkpoints disagree with the current mapping.
unique_labels = sorted(set(itertools.chain.from_iterable(dataset['train']['labels'])))
print(unique_labels)

# Lookup tables in both directions; an id is the tag's position in `unique_labels`.
id2label = dict(enumerate(unique_labels))
label2id = {w: i for i, w in id2label.items()}

['B-gpe', 'B-org', 'B-per', 'I-org', 'I-nat', 'I-geo', 'I-tim', 'B-eve', 'B-art', 'I-art', 'O', 'I-gpe', 'I-eve', 'B-tim', 'B-nat', 'I-per', 'B-geo']


In [None]:
def _encode_labels(row):
    """Replace every tag string of a row with its integer id from `label2id`."""
    return {"labels": [label2id[tag] for tag in row['labels']]}


dataset = dataset.map(_encode_labels)

Map:   0%|          | 0/33570 [00:00<?, ? examples/s]

Map:   0%|          | 0/7194 [00:00<?, ? examples/s]

Map:   0%|          | 0/7194 [00:00<?, ? examples/s]

In [None]:
# Same training row as before, now with integer label ids instead of tag strings.
example = dataset['train'][0]
pprint(example, compact=True)

{'labels': [10, 10, 10, 10, 10, 10, 16, 10, 10, 10, 10, 10, 16, 10, 10, 10, 10,
            10, 0, 10, 10, 10, 10, 10],
 'sentence': ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through',
              'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and',
              'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from',
              'that', 'country', '.']}


In [None]:
# Checkpoint to fine-tune; the tokenizer is loaded from the same repository id so
# that it matches the model loaded later from `model_name`.
model_name = "dslim/distilbert-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# The sentence is already a list of words, hence is_split_into_words=True.
inputs = tokenizer(example['sentence'], is_split_into_words=True)
inputs

{'input_ids': [101, 26159, 1104, 8568, 4487, 5067, 1138, 9639, 1194, 1498, 1106, 5641, 1103, 1594, 1107, 5008, 1105, 4555, 1103, 10602, 1104, 1418, 2830, 1121, 1115, 1583, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Decode the ids back to text as a sanity check (special tokens are included).
tokenizer.decode(inputs['input_ids'])

'[CLS] Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country. [SEP]'

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level labels to tokens.

    Args:
        examples: Batch mapping with "sentence" (one list of words per row) and
            "labels" (one list of integer label ids per row, one id per word).

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per row. The first token of every word carries that word's
        label; special tokens and the remaining tokens of a word get -100, the
        value that `compute_metrics` filters out.
    """
    # truncation=True keeps over-long sentences within the model's maximum input
    # length instead of failing later in the forward pass. word_ids() only covers
    # the tokens that survive truncation, so the labels stay aligned.
    tokenized_inputs = tokenizer(examples["sentence"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have no word (None) and continuation pieces repeat
            # the previous word index; both are masked with -100.
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                # First token of a new word: it carries the word's label.
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Tokenize every split; batched=True hands `tokenize_and_align_labels` a batch of
# rows per call, which is the input shape the function expects.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset

Map:   0%|          | 0/33570 [00:00<?, ? examples/s]

Map:   0%|          | 0/7194 [00:00<?, ? examples/s]

Map:   0%|          | 0/7194 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 33570
    })
    validation: Dataset({
        features: ['sentence', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 7194
    })
    test: Dataset({
        features: ['sentence', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 7194
    })
})

In [None]:
# Metric object used by `compute_metrics` below to score tag sequences.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Score predictions with seqeval.

    Args:
        p: Pair `(predictions, labels)`; predictions are per-token class scores
            (arg-maxed over axis 2), labels are integer ids with -100 marking
            positions to skip.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions, kept_labels = [], []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                # Masked position (special token or word continuation): not scored.
                continue
            kept_predictions.append(unique_labels[predicted_id])
            kept_labels.append(unique_labels[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    metric_names = ("precision", "recall", "f1", "accuracy")
    return {name: results[f"overall_{name}"] for name in metric_names}

In [None]:
# Collator that assembles token-classification batches for the Trainer.
# NOTE(review): presumably pads inputs and labels to the longest row of each batch - confirm.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# The checkpoint's classifier head has a different number of classes than our label
# set (9 vs 17 in the recorded run). ignore_mismatched_sizes=True lets loading proceed
# with a newly initialised head of the right size, which is then trained below.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at dslim/distilbert-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([17]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([17, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: evaluate and save a checkpoint at the end of every epoch;
# checkpoints are written under `checkpoints_logs`.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` and the Trainer's `tokenizer=` to `processing_class=` - confirm
# against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="checkpoints_logs",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Train on the tokenized train split and score the validation split with
# `compute_metrics` at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1747,0.102826,0.804568,0.818566,0.811507,0.968424
2,0.0967,0.09392,0.824543,0.827147,0.825843,0.97069
3,0.0816,0.090497,0.82702,0.831258,0.829133,0.971317
4,0.0725,0.090941,0.826589,0.836144,0.831339,0.971678
5,0.0671,0.091911,0.82892,0.837514,0.833195,0.971798


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=2625, training_loss=0.09693791089739118, metrics={'train_runtime': 402.2692, 'train_samples_per_second': 417.258, 'train_steps_per_second': 6.525, 'total_flos': 2307060033162216.0, 'train_loss': 0.09693791089739118, 'epoch': 5.0})

## Test Set Evaluation

In [None]:
# Run the fine-tuned model over the held-out test split and show the metrics
# (the third field of the returned prediction output).
results = trainer.predict(tokenized_dataset['test'])
results.metrics

{'test_loss': 0.09210649877786636,
 'test_precision': 0.8253578732106339,
 'test_recall': 0.8390354003326206,
 'test_f1': 0.8321404376896115,
 'test_accuracy': 0.9718371161206908,
 'test_runtime': 7.75,
 'test_samples_per_second': 928.263,
 'test_steps_per_second': 29.032}

## Inference

In [None]:
from transformers import pipeline
# Wrap the fine-tuned model and its tokenizer in an NER pipeline that accepts raw text.
classifier = pipeline("ner", model= model , tokenizer = tokenizer)

In [None]:
# Pick one test sentence and translate its gold label ids back into tag strings.
example = dataset['test'][1110]
true_labels = [id2label[label_id] for label_id in example['labels']]
print("True Labels :")
# Display only the words that carry an entity tag.
[(word, tag) for word, tag in zip(example['sentence'], true_labels) if tag != 'O']

True Labels :


[('since', 'B-tim'),
 ('May', 'I-tim'),
 ('Camp', 'B-geo'),
 ('Pendleton', 'I-geo'),
 ('California', 'B-geo')]

In [None]:
# Predictions: the pipeline is called on plain text, so the words are re-joined
# with single spaces first.
print("Pred Labels :")
sentence_text = " ".join(example['sentence'])
preds = classifier(sentence_text)
[(pred['word'], pred['entity']) for pred in preds]

Pred Labels :


[('since', 'B-tim'),
 ('May', 'I-tim'),
 ('Camp', 'B-geo'),
 ('Pendleton', 'I-geo'),
 ('California', 'B-geo')]