# Question-2 : Fine-tuning IndicBERT and IndicNER for telugu language using 10% dataset

## Installing Required Packages

In [None]:
!pip3 install transformers
!pip3 install datasets
!pip3 install sentencepiece
!pip3 install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=6252a60dce84b9b0bf2d010ae1aa297c9c398c30b6466210b918074a0fb619fe
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
!pip install accelerate -U
#NOTE: AFTER RUNNING THIS CELL,MAKESURE TO RESTART KERNEL RUNTIME AND THEN DONOT RERUN ANY CELLS WITH !pip install IN THEM,TO AVOID ERROR OCCURED WHILE EXECUTING args training arguments

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.27.2
    Uninstalling accelerate-0.27.2:
      Successfully uninstalled accelerate-0.27.2
Successfully installed accelerate-0.28.0


# Question-2

# Step-1: Downloading the Namapadam dataset

In [None]:
from datasets import load_dataset
import numpy as np
lang='te'

raw_datasets = load_dataset('ai4bharat/naamapadam', lang)

Downloading builder script:   0%|          | 0.00/2.86k [00:00<?, ?B/s]

Downloading and preparing dataset naamapadam_pr/te to /root/.cache/huggingface/datasets/ai4bharat___naamapadam_pr/te/1.0.0/99b5ec77eabfaa3fbff510d8cf70d7c34519486cb7dbee99ede19474ddff9b20...


Downloading data:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset naamapadam_pr downloaded and prepared to /root/.cache/huggingface/datasets/ai4bharat___naamapadam_pr/te/1.0.0/99b5ec77eabfaa3fbff510d8cf70d7c34519486cb7dbee99ede19474ddff9b20. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

# Step-2: Analyzing the dataset

In [None]:
# print the dataset to see how it is
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 507741
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 847
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2700
    })
})

In [None]:
named_entity_features=raw_datasets["train"].features

In [None]:
column_names = raw_datasets["train"].column_names
print(column_names)

['tokens', 'ner_tags']


In [None]:
# printing 100th row from train dataset in given Namapadam corpus
idx=100
rec=raw_datasets['train'][idx]
for w, t in zip(rec['tokens'],rec['ner_tags']):
  print('Word:{}\tNamed Entity:{}'.format(w,t))

Word:రూ	Named Entity:0
Word:.	Named Entity:0
Word:50,000	Named Entity:0
Word:డిస్కౌంట్	Named Entity:0


The named entities are labelled as integers,so let's try to get the corresponding named entities for those integers

In [None]:
labels_list = named_entity_features['ner_tags'].feature.names

map_label_to_id={} # dictionary to store the mapping between label and its corresponding integer id
for i in range(len(labels_list)):
  map_label_to_id[labels_list[i]]=named_entity_features['ner_tags'].feature.str2int(labels_list[i])
for label_id in map_label_to_id:
  print('label:{}\t Integer Id:{}'.format(label_id,map_label_to_id[label_id]))
print(map_label_to_id)
num_labels=len(labels_list)

label:O	 Integer Id:0
label:B-PER	 Integer Id:1
label:I-PER	 Integer Id:2
label:B-ORG	 Integer Id:3
label:I-ORG	 Integer Id:4
label:B-LOC	 Integer Id:5
label:I-LOC	 Integer Id:6
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


# Step-3: **Fine-tuning IndicBERT** using same functions for IndicNER

In [7]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np

config_bert = AutoConfig.from_pretrained('ai4bharat/indic-bert', num_labels=7, finetuning_task='ner')
tokenizer_bert = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
model_bert = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', config=config_bert)

2024-03-13 04:47:47.078027: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-13 04:47:47.078130: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-13 04:47:47.210632: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
%pip install protobuf


Collecting protobuf

  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)

[K     |████████████████████████████████| 294 kB 261 kB/s eta 0:00:01

[?25hInstalling collected packages: protobuf

Successfully installed protobuf-4.25.3

Note: you may need to restart the kernel to use updated packages.


In [None]:
# moving the model to Colab GPU
model_bert=model_bert.to("cuda")

In [8]:
# tokenize the input text and aligns the labels with the corresponding tokens,tokenizer of IndicNER will only generate tokens specific to model,but this function will also assign NER to the tokens,which allows the model to learn the relationship between the input tokens and their corresponding entity labels during training.
def tokenize_and_align_labels_bert(examples):
    tokenized_inputs = tokenizer_bert(
        examples["tokens"], #input tokens
        padding="max_length", #Pad the tokenized inputs to a maximum length specified with the argument max_length
        truncation=True,#truncate the tokenized inputs if they exceed the maximum length
        max_length=512,#maximum length of tokenized input
        is_split_into_words=True,# We use this argument because the texts in our dataset are lists of words (with a label for each word).So,tokenizer should not further split the tokens into subwords or characters.
    )
    aligned_labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        #print("word_ids ",i," ",word_ids)
        previous_word_idx = None
        label_ids = []
        for curr_word_id in word_ids:#For each word ID, retrieve the corresponding NER tag from the labels
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically ignored in the loss function.
            if curr_word_id is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.(checks if the current token is the first token of its corresponding word,as a single word may be split into multiple tokens while tokenizing text)
            elif curr_word_id != previous_word_idx:
                label_ids.append(label[curr_word_id])
            # For the other tokens in a word(it's part of a word that has already been started by a previous token), we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.If label_all_tokens is True, then the label for the current token is assigned based on the label for that word. If it's False, then the label for non-first tokens is set to -100 to ignore them during training.
            # but by default we set the label to -100 for non-first tokens.
            else:
                label_ids.append(-100)
            previous_word_idx = curr_word_id

        aligned_labels.append(label_ids)
    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

Get training,eval datasets

In [9]:
eval_dataset_bert = raw_datasets["validation"] #tokenizing and aligning the labels for the validation dataset
eval_dataset_bert = eval_dataset_bert.map(
    tokenize_and_align_labels_bert,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on Validation dataset",
)

       

Running tokenizer on Validation dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on Validation dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
test_dataset_bert = raw_datasets["test"]
test_dataset_bert = test_dataset_bert.map(
    tokenize_and_align_labels_bert,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on test dataset",
)

       

Running tokenizer on test dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

Create collator,metrics

In [11]:
#data_collator
data_collator_bert = DataCollatorForTokenClassification(tokenizer_bert)

In [17]:
# Metrics
from datasets import load_metric
metric = load_metric("seqeval")

def compute_metrics_bert(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [labels_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [labels_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    # Unpack nested dictionaries
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                final_results[f"{key}_{n}"] = v
        else:
            final_results[key] = value
    try:
      if results["overall_f1"]:
        final_results["f1"] = results["overall_f1"]
    except KeyError:
        pass  # If "overall_f1" is not present in results, do nothing
    return final_results

In [16]:
pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=77a32826b9642b458e24e88c845d8ac35395f1e0717d929c5040f5e44d85f929
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Note: you may need to restart the kernel to use updated packages.


Training arguments

In [14]:
args = TrainingArguments(
    output_dir='./checkpoints',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=3e-5,  # Set the learning rate
    weight_decay=0.01,    # Set the weight decay
    gradient_accumulation_steps=1,  # Number of updates steps to accumulate before performing a backward/update pass.
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10,   # Log every n updates steps
    save_strategy="epoch",  # Save model after each epoch
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_total_limit=1,  # Limit the total amount of saved models
    label_smoothing_factor=0.1,  # Apply label smoothing for regularization
)


In [15]:
train_dataset_bert = raw_datasets["train"] # given train dataset
# Shuffle the train dataset
train_dataset_bert = train_dataset_bert.shuffle(seed=42)

# Get the first 25% of the examples
num_examples = len(train_dataset_bert)
subset_size = int(num_examples * 0.10)
train_subset_bert = train_dataset_bert.select(range(subset_size))

# Tokenize and align labels for the subset
train_subset_bert = train_subset_bert.map(
    tokenize_and_align_labels_bert,  # Preprocess each example using the tokenize_and_align_labels function
    batched=True,  # Use batch processing instead of individual processing
    num_proc=4,  # Number of processes to use for parallel processing
    load_from_cache_file=True,  # Load from cache file if available
    desc="Running tokenizer on 10% subset of train dataset",  # Description for the progress bar
)

      

Running tokenizer on 10% subset of train dataset #0:   0%|          | 0/13 [00:00<?, ?ba/s]

Running tokenizer on 10% subset of train dataset #1:   0%|          | 0/13 [00:00<?, ?ba/s]

  

Running tokenizer on 10% subset of train dataset #3:   0%|          | 0/13 [00:00<?, ?ba/s]

Running tokenizer on 10% subset of train dataset #2:   0%|          | 0/13 [00:00<?, ?ba/s]

In [18]:
trainer_bert = Trainer(
    model=model_bert,
    train_dataset=train_subset_bert,
    eval_dataset=eval_dataset_bert,
    tokenizer=tokenizer_bert,
    data_collator=data_collator_bert,
    compute_metrics=compute_metrics_bert,
    #callbacks=[SaveCheckpointsCallback()],
    args=args,
)

Training the model

In [19]:
# Train the model
# Train the model
import warnings
warnings.filterwarnings("ignore")
import wandb
# Initialize wandb
wandb.init(project="your_project_name")

# Set your API key
wandb.login(key="009ac958cc59c78ab471adbb4d25b34b06a416ba")
train_result_bert = trainer_bert.train()
metrics_bert = train_result_bert.metrics

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Loc Precision,Loc Recall,Loc F1,Loc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,F1
1,0.6881,0.697318,0.685832,0.639847,0.662042,1044,0.586623,0.515911,0.548999,1037,0.695445,0.65519,0.674717,2004,0.666667,0.615912,0.640285,0.8883,0.640285
2,0.6928,0.674694,0.689655,0.689655,0.689655,1044,0.593,0.571842,0.582229,1037,0.716106,0.701098,0.708522,2004,0.678482,0.665361,0.671858,0.897455,0.671858
3,0.576,0.672778,0.684162,0.711686,0.697653,1044,0.6167,0.591128,0.603644,1037,0.718014,0.7001,0.708944,2004,0.683937,0.675398,0.67964,0.898933,0.67964


In [20]:
#trainer.log_metrics("train", train_result.metrics)
print(metrics_bert)

{'train_runtime': 7211.2487, 'train_samples_per_second': 21.123, 'train_steps_per_second': 1.32, 'total_flos': 3365648264374272.0, 'train_loss': 0.6960523076429008, 'epoch': 3.0}


Evaluating the model using test dataset

In [21]:
metrics_bert = trainer_bert.evaluate()
trainer_bert.log_metrics("eval", metrics_bert)

***** eval metrics *****
  epoch                   =        3.0
  eval_LOC_f1             =     0.6977
  eval_LOC_number         =       1044
  eval_LOC_precision      =     0.6842
  eval_LOC_recall         =     0.7117
  eval_ORG_f1             =     0.6036
  eval_ORG_number         =       1037
  eval_ORG_precision      =     0.6167
  eval_ORG_recall         =     0.5911
  eval_PER_f1             =     0.7089
  eval_PER_number         =       2004
  eval_PER_precision      =      0.718
  eval_PER_recall         =     0.7001
  eval_f1                 =     0.6796
  eval_loss               =     0.6728
  eval_overall_accuracy   =     0.8989
  eval_overall_f1         =     0.6796
  eval_overall_precision  =     0.6839
  eval_overall_recall     =     0.6754
  eval_runtime            = 0:00:51.92
  eval_samples_per_second =       52.0
  eval_steps_per_second   =      3.255


In [22]:
print(metrics_bert['eval_overall_f1'])

0.6796403497967729


In [23]:
# Evaluate the model on the test dataset
predictions_bert, labels_bert, metrics_bert = trainer_bert.predict(test_dataset_bert)
print(metrics_bert['test_overall_f1'])

0.7439724454649828


In [24]:
trainer_bert.log_metrics("test", metrics_bert)

***** test metrics *****
  test_LOC_f1             =     0.7514
  test_LOC_number         =        483
  test_LOC_precision      =      0.814
  test_LOC_recall         =     0.6977
  test_ORG_f1             =     0.6133
  test_ORG_number         =        263
  test_ORG_precision      =     0.6305
  test_ORG_recall         =      0.597
  test_PER_f1             =      0.794
  test_PER_number         =        609
  test_PER_precision      =     0.8034
  test_PER_recall         =     0.7849
  test_f1                 =      0.744
  test_loss               =     0.6185
  test_overall_accuracy   =     0.9209
  test_overall_f1         =      0.744
  test_overall_precision  =     0.7727
  test_overall_recall     =     0.7173
  test_runtime            = 0:00:15.78
  test_samples_per_second =     53.663
  test_steps_per_second   =      3.358


In [25]:
num_examples = 5
for i in range(num_examples):
    print(f"Example {i+1}:")
    print("Predictions:", predictions_bert[i])
    print("Labels:", labels_bert[i])
    print()

Example 1:
Predictions: [[ 0.02747323  0.09127338  0.03217009 ... -0.06853969  0.06333628
  -0.05514202]
 [ 3.4322991  -0.5697573  -0.46920225 ... -0.7663381  -0.69995725
  -0.9932934 ]
 [ 3.277536   -0.56893706 -0.72395533 ... -0.70280385 -0.7919754
  -1.1447008 ]
 ...
 [ 3.0009708  -0.19203492 -1.028371   ... -0.9024608  -0.39280313
  -1.427685  ]
 [ 2.9981792  -0.18798897 -1.0308129  ... -0.904096   -0.3925257
  -1.4307091 ]
 [ 3.0005999  -0.18785278 -1.0282853  ... -0.90428054 -0.39362112
  -1.4268129 ]]
Labels: [-100    0    0 -100 -100 -100    0 -100 -100 -100    1 -100 -100 -100
    2 -100 -100    2 -100    0    0 -100 -100 -100    0    0 -100 -100
 -100    0 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -

Saving the model

In [27]:
trainer_bert.save_model('CustomModel_BERT')