<a href="https://colab.research.google.com/github/Chiamakac/IgboNER-Models/blob/main/Fine-Tuning/Fine_tuned_Token_Classification_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Token Classification (PyTorch) - Fine-Tuning

Install the Transformers and Datasets libraries to run this notebook.

In [None]:
!pip install datasets transformers[sentencepiece]
#!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

You will need to set up git; adapt your email and name in the following cell.

In [None]:
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
#We use the load_dataset() method from the Datasets library to download our dataset.
from datasets import load_dataset

raw_datasets = load_dataset('masakhaner', 'ibo')

In [None]:
#shows us the columns present and the split between the training, validation, and test sets
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2235
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 320
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 638
    })
})

In [None]:
# displays the first element of the training set
raw_datasets["train"][0]["tokens"]

['Ike',
 'ịda',
 'jụụ',
 'otụ',
 'nkeji',
 'banyere',
 'oke',
 'ogbugbu',
 'na',
 '-',
 'eme',
 "n'ala",
 'Naijiria',
 'agwụla',
 'Ekweremmadụ']

In [None]:
#displays the features attribute of our dataset
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-DATE', 'I-DATE'], names_file=None, id=None), length=-1, id=None)

In [None]:
#we can access the list of names in the ner_feature by looking at the names attribute of that feature
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-DATE', 'I-DATE']

In [None]:
# Show the first training sentence with its decoded NER tags printed underneath.
# Each word and its tag are padded to a shared column width (the longer of the two,
# plus one separating space) so the two printed lines stay aligned.
first_example = raw_datasets["train"][0]
words = first_example["tokens"]
labels = first_example["ner_tags"]
word_cells = []
tag_cells = []
for word, label in zip(words, labels):
    full_label = label_names[label]
    column_width = max(len(word), len(full_label)) + 1
    word_cells.append(word.ljust(column_width))
    tag_cells.append(full_label.ljust(column_width))
line1 = "".join(word_cells)
line2 = "".join(tag_cells)

print(line1)
print(line2)

In [None]:
#mount the gdrive
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#model path in the gdrive
MODEL_PATH = "/content/gdrive/MyDrive/IBO_BETA/IgboBert" #replace with the model path you want to work with

**Processing the data**

In [None]:
# We will be using an IgboBERT pretrained model.
# Load the tokenizer associated with it from the same path.

# You can replace model_checkpoint with any other model you prefer from the Hub,
# or with a local folder in which you've saved a pretrained model and a tokenizer.
from transformers import AutoTokenizer

model_checkpoint = MODEL_PATH
# add_prefix_space=True: the dataset is already split into words and is tokenized with
# is_split_into_words=True below.
# NOTE(review): byte-level BPE (RoBERTa-style) tokenizers are documented as needing this
# flag for pre-tokenized input - confirm against the tokenizer docs for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True)

In [None]:
#Tokenizing our pre-tokenized input with our tokenizer
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['<s>',
 'ĠIke',
 'Ġá»ĭda',
 'Ġjá»¥á»¥',
 'Ġotá»¥',
 'Ġnkeji',
 'Ġbanyere',
 'Ġoke',
 'Ġogbugbu',
 'Ġna',
 'Ġ-',
 'Ġeme',
 'Ġn',
 "'",
 'ala',
 'ĠNaijiria',
 'Ġagwá»¥la',
 'ĠEkweremmadá»¥',
 '</s>']

In [None]:
# Expanding our label list to match the tokens and assigning a label of -100 to special tokens 
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: label ids indexed by word position.
        word_ids: for each token, the index of the word it came from, or
            None for tokens that do not belong to any word (special tokens).

    Returns:
        A list with one entry per item in ``word_ids``:
        -100 where the word id is None, the word's own label for the first
        token of a word, and for every following token of the same word the
        same label, bumped by one when it is odd (in this notebook's label
        list the odd ids are the B-XXX tags and id + 1 is the matching I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: ignored by the loss.
            aligned.append(-100)
        elif word_id != previous_word:
            # First token of a new word keeps the word's label unchanged.
            aligned.append(labels[word_id])
        else:
            # Continuation token of the current word: turn B-XXX into I-XXX.
            tag = labels[word_id]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word_id

    return aligned

In [None]:
# Try align_labels_with_tokens() on the first sentence: print the word-level labels,
# then the token-level labels, to check that -100 was assigned to the special tokens.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1]
[-100, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1, -100]


In [None]:
#we tokenize all the inputs and apply align_labels_with_tokens() on all the labels
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: a batch with a "tokens" entry (one word list per sentence)
            and a "ner_tags" entry (one label list per sentence).

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding, for each sentence, the result of align_labels_with_tokens()
        applied to that sentence's tags and word ids.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    encoded["labels"] = [
        align_labels_with_tokens(sentence_tags, encoded.word_ids(row))
        for row, sentence_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Apply tokenize_and_align_labels to the whole raw_datasets object, in batches.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    # Drop the original columns (named after the train split's columns) from the result.
    remove_columns=raw_datasets["train"].column_names,
)

**Fine-tuning the model with the Trainer API**

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    7,    8,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    5,    0,    1, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    5,    0,    1,
            2,    0,    0,    0,    0,    0,    0,    0,    0,    7,    8,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    5,
            0,    0,    0,    0, -100]])


To have the Trainer compute a metric every epoch, we will need to define a compute_metrics() function that takes the arrays of predictions and labels, and returns a dictionary with the metric names and values. The metric used here is seqeval, so we first need to install the seqeval library.

In [None]:
!pip install seqeval

In [None]:
# NOTE(review): `datasets.load_metric` is reported as deprecated in favour of the separate
# `evaluate` library in newer `datasets` releases - confirm against the installed version
# (the install cell does not pin one).
from datasets import load_metric

# seqeval metric object; used by compute_metrics() via metric.compute(...).
metric = load_metric("seqeval")

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: a pair ``(logits, labels)``.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from the "overall_*" entries of ``metric.compute``.
    """
    logits, labels = eval_preds
    # Predicted class id = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)

    # Reference tags: skip every position whose label is the ignored index (-100).
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    # Predicted tags: keep a prediction only where the reference label is not -100.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [pred for pred, tag in zip(pred_row, label_row) if tag != -100]
        true_predictions.append([label_names[pred] for pred in kept])

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: all_metrics[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Mappings between class ids and tag names, passed to the model config.
# The ids are kept as integers: building id2label with str(i) keys and then inverting it
# made label2id map every tag name to a *string* ("B-DATE": "7"), which is what ended up
# in the saved config.json instead of integer class ids.
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in enumerate(label_names)}

In [None]:
#defining the model we want to finetune
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at /content/gdrive/MyDrive/IBO_BETA/IgboBert were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /content/gdrive/MyDrive/IBO_BETA/IgboBert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model 

In [None]:
#log in to Hugging Face 
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Define the TrainingArguments for the fine-tuning run.
from transformers import TrainingArguments

args = TrainingArguments(
    # First positional argument: the run's output folder name; the inference cell later
    # loads a checkpoint from /content/IgboBert-finetuned-ner/.
    "IgboBert-finetuned-ner",
    # Evaluate and save a checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=20,
    weight_decay=0.01,
    #push_to_hub=True, (uncomment if you want to upload your results to the Model Hub)
)

In [None]:
# Pass everything to the Trainer and start training.
# NOTE(review): only the train and validation splits are used here; the test split of
# raw_datasets is not evaluated anywhere in the visible notebook.
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): presumably passed so the tokenizer is saved with each checkpoint (the
    # inference cell loads a checkpoint folder without naming a tokenizer) - confirm.
    tokenizer=tokenizer,
)
trainer.train()

**Using the fine-tuned model**

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
# NOTE(review): checkpoint-5600 is presumably the last checkpoint written by the training
# run above - confirm the step number after training.
model_checkpoint = "/content/IgboBert-finetuned-ner/checkpoint-5600"
# aggregation_strategy="simple" makes the pipeline return grouped entities
# ('entity_group' entries) rather than one entry per token.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
# Adjacent string literals are joined with nothing in between, so the first sentence must
# end with a space; otherwise the model receives "ya.Onyeisi" as a single run of text.
token_classifier("Google chetara ụbọchị ọmụmụ Keshi mgbe Obasanjo chịrị Naịjirịa afọ asatọ na ọchịchị onye kwuo uche ya. "
                 "Onyeisi ndị na-emenyu ọkụ na, Legọọsi steeti bụ Rasak Fadipe ekwuola na onweghi onye nwụrụ n'ime ọkụ ahụ gbara n'ehihie ụbọchị 24 Jenuwarị, 2018.")

loading configuration file /content/IgboBert-finetuned-ner/checkpoint-5600/config.json
Model config RobertaConfig {
  "_name_or_path": "/content/IgboBert-finetuned-ner/checkpoint-5600",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-DATE",
    "8": "I-DATE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-DATE": "7",
    "B-LOC": "5",
    "B-ORG": "3",
    "B-PER": "1",
    "I-DATE": "8",
    "I-LOC": "6",
    "I-ORG": "4",
    "I-PER": "2",
    "O": "0"
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "p

[{'end': 6,
  'entity_group': 'ORG',
  'score': 0.99998724,
  'start': 0,
  'word': ' Google'},
 {'end': 21,
  'entity_group': 'DATE',
  'score': 0.99948347,
  'start': 15,
  'word': ' ụbọchị'},
 {'end': 33,
  'entity_group': 'PER',
  'score': 0.99567986,
  'start': 28,
  'word': ' Keshi'},
 {'end': 47,
  'entity_group': 'PER',
  'score': 0.99999,
  'start': 39,
  'word': ' Obasanjo'},
 {'end': 62,
  'entity_group': 'LOC',
  'score': 0.9999815,
  'start': 54,
  'word': ' Naịjirịa'},
 {'end': 72,
  'entity_group': 'DATE',
  'score': 0.9996765,
  'start': 63,
  'word': ' afọ asatọ'},
 {'end': 146,
  'entity_group': 'LOC',
  'score': 0.99975836,
  'start': 132,
  'word': ' Legọọsi steeti'},
 {'end': 162,
  'entity_group': 'PER',
  'score': 0.99998444,
  'start': 150,
  'word': ' Rasak Fadipe'},
 {'end': 241,
  'entity_group': 'DATE',
  'score': 0.98287994,
  'start': 214,
  'word': "'ehihie ụbọchị 24 Jenuwarị,"},
 {'end': 246,
  'entity_group': 'DATE',
  'score': 0.99987113,
  'start': 24

In [None]:
# Move (not copy) the fine-tuned checkpoint folder from the Colab disk into Google Drive.
# The recorded output shows it ends up as a 'checkpoint-5600' subfolder of the destination.
# NOTE(review): the destination folder name starts with a space (' LREC FINAL TRAINING') -
# confirm this matches the real Drive folder and is not a typo.
import shutil
shutil.move('/content/IgboBert-finetuned-ner/checkpoint-5600','/content/gdrive/MyDrive/IBO_BETA/ LREC FINAL TRAINING')

'/content/gdrive/MyDrive/IBO_BETA/ LREC FINAL TRAINING/checkpoint-5600'