# Part 1 - Transformers for Sequence Classification  
  The code here follows the tutorial completely.

In [1]:
from datasets import load_dataset
wnut = load_dataset("wnut_17")

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3" 

In [3]:
import torch
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

1
0
NVIDIA GeForce GTX 1080 Ti


In [4]:
wnut["train"][0]

{'id': '0',
 'tokens': ['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [5]:
label_list = wnut["train"].features[f"ner_tags"].feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [7]:
example = wnut["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokenized_input

{'input_ids': [101, 1030, 2703, 17122, 2009, 1005, 1055, 1996, 3193, 2013, 2073, 1045, 1005, 1049, 2542, 2005, 2048, 3134, 1012, 3400, 2110, 2311, 1027, 9686, 2497, 1012, 3492, 2919, 4040, 2182, 2197, 3944, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i) 
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [11]:
import evaluate

seqeval = evaluate.load("seqeval")

In [12]:
id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
}
label2id = {
    "O": 0,
    "B-corporation": 1,
    "I-corporation": 2,
    "B-creative-work": 3,
    "I-creative-work": 4,
    "B-group": 5,
    "I-group": 6,
    "B-location": 7,
    "I-location": 8,
    "B-person": 9,
    "I-person": 10,
    "B-product": 11,
    "I-product": 12,
}

In [13]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [17]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.279431,0.597015,0.2595,0.361757,0.93831
2,No log,0.27539,0.585895,0.300278,0.397059,0.940917


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=426, training_loss=0.21040113207320094, metrics={'train_runtime': 115.1283, 'train_samples_per_second': 58.96, 'train_steps_per_second': 3.7, 'total_flos': 91781128898820.0, 'train_loss': 0.21040113207320094, 'epoch': 2.0})

In [18]:
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."

In [19]:
from transformers import pipeline

classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model")
classifier(text)

  return self.fget.__get__(instance, owner)()
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'B-location',
  'score': 0.42658594,
  'index': 2,
  'word': 'golden',
  'start': 4,
  'end': 10},
 {'entity': 'I-location',
  'score': 0.35856345,
  'index': 3,
  'word': 'state',
  'start': 11,
  'end': 16},
 {'entity': 'B-group',
  'score': 0.30640018,
  'index': 4,
  'word': 'warriors',
  'start': 17,
  'end': 25},
 {'entity': 'B-location',
  'score': 0.6552351,
  'index': 13,
  'word': 'san',
  'start': 80,
  'end': 83},
 {'entity': 'B-location',
  'score': 0.4668664,
  'index': 14,
  'word': 'francisco',
  'start': 84,
  'end': 93}]

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model")
inputs = tokenizer(text, return_tensors="pt")

In [21]:
from transformers import AutoModelForTokenClassification
import torch

model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model")
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
predicted_token_class

['O',
 'O',
 'B-location',
 'I-location',
 'B-group',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-location',
 'B-location',
 'O',
 'O']

# Part2‚Äì Using BERT with Our Data

In [1]:
def get_hindi_bio(inputfilename):
    with open(inputfilename, "r", encoding="utf-8") as inputfile:
        sentences = []
        collection_words = []
        collection_labels = []

        chunk_tokens = []
        chunk_ids = []

        for line in inputfile:
            if line.startswith("#"):
                continue

            columns = line.split()
            if not columns:
                if collection_words:
                    labels = []
                    i = 0
                    while i < len(chunk_ids):
                        cid = chunk_ids[i]
                        j = i + 1
                        while j < len(chunk_ids) and chunk_ids[j] == cid:
                            j += 1
                        length = j - i
                        if length == 1:
                            labels.append(0)
                        else:
                            labels.append(1)
                            labels.extend([2] * (length - 1))
                        i = j
                    collection_labels = labels

                    sentences.append((collection_words, collection_labels))
                    collection_words = []
                    collection_labels = []
                    chunk_tokens = []
                    chunk_ids = []
                continue

            word = columns[1]
            chunk_id = None
            for part in columns[-1].split("|"):
                if part.startswith("ChunkId="):
                    chunk_id = part.split("=")[1]
                    break

            collection_words.append(word)
            chunk_ids.append(chunk_id)

        if collection_words:
            labels = []
            i = 0
            while i < len(chunk_ids):
                cid = chunk_ids[i]
                j = i + 1
                while j < len(chunk_ids) and chunk_ids[j] == cid:
                    j += 1
                length = j - i
                if length == 1:
                    labels.append(0)
                else:
                    labels.append(1)
                    labels.extend([2] * (length - 1))
                i = j
            collection_labels = labels
            sentences.append((collection_words, collection_labels))

    return sentences


In [2]:
train_data_hindi_bio = get_hindi_bio("dataset/hi_hdtb-ud-train.conllu")
test_data_hindi_bio = get_hindi_bio("dataset/hi_hdtb-ud-test.conllu")
develop_data_hindi_bio = get_hindi_bio("dataset/hi_hdtb-ud-dev.conllu")

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer_hindi_bio = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [4]:
example_bio = train_data_hindi_bio[0]
tokenized_input_hindi_bio = tokenizer_hindi_bio(example_bio[0], is_split_into_words=True, truncation=True,padding=True)
tokens_bio = tokenizer_hindi_bio.convert_ids_to_tokens(tokenized_input_hindi_bio["input_ids"])
tokenized_input_hindi_bio

{'input_ids': [0, 4239, 151677, 471, 13353, 33753, 230432, 1302, 421, 646, 967, 460, 207, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
def tokenize_and_align_labels_hindi_bio(example):
    tokenized_input = tokenizer_hindi_bio(
        example["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    word_ids = tokenized_input.word_ids(batch_index=0)  
    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)  
        elif word_idx != previous_word_idx:
            label_ids.append(example["labels"][word_idx])  
        else:
            label_ids.append(example["labels"][word_idx])
        previous_word_idx = word_idx

    tokenized_input["labels"] = label_ids
    return tokenized_input


In [6]:
from datasets import Dataset

train_dataset_hindi_bio = Dataset.from_list([{"tokens": tokens, "labels": labels} for tokens, labels in train_data_hindi_bio])
test_dataset_hindi_bio = Dataset.from_list([{"tokens": tokens, "labels": labels} for tokens, labels in test_data_hindi_bio])
develop_dataset_hindi_bio = Dataset.from_list([{"tokens": tokens, "labels": labels} for tokens, labels in develop_data_hindi_bio])

In [7]:
train_dataset_hindi_bio[0]

{'tokens': ['‡§Ø‡§π',
  '‡§è‡§∂‡§ø‡§Ø‡§æ',
  '‡§ï‡•Ä',
  '‡§∏‡§¨‡§∏‡•á',
  '‡§¨‡§°‡§º‡•Ä',
  '‡§Æ‡§∏‡•ç‡§ú‡§ø‡§¶‡•ã‡§Ç',
  '‡§Æ‡•á‡§Ç',
  '‡§∏‡•á',
  '‡§è‡§ï',
  '‡§π‡•à',
  '‡•§'],
 'labels': [1, 2, 2, 1, 2, 2, 2, 2, 0, 0, 0]}

In [8]:
tokenized_train_dataset_hindi_bio = train_dataset_hindi_bio.map(tokenize_and_align_labels_hindi_bio, batched=False)
tokenized_test_dataset_hindi_bio = test_dataset_hindi_bio.map(tokenize_and_align_labels_hindi_bio, batched=False)
tokenized_develop_dataset_hindi_bio = develop_dataset_hindi_bio.map(tokenize_and_align_labels_hindi_bio, batched=False)

Map:   0%|          | 0/13306 [00:00<?, ? examples/s]

Map:   0%|          | 0/1684 [00:00<?, ? examples/s]

Map:   0%|          | 0/1659 [00:00<?, ? examples/s]

In [9]:
tokenized_train_dataset_hindi_bio[0]

{'tokens': ['‡§Ø‡§π',
  '‡§è‡§∂‡§ø‡§Ø‡§æ',
  '‡§ï‡•Ä',
  '‡§∏‡§¨‡§∏‡•á',
  '‡§¨‡§°‡§º‡•Ä',
  '‡§Æ‡§∏‡•ç‡§ú‡§ø‡§¶‡•ã‡§Ç',
  '‡§Æ‡•á‡§Ç',
  '‡§∏‡•á',
  '‡§è‡§ï',
  '‡§π‡•à',
  '‡•§'],
 'labels': [-100, 1, 2, 2, 1, 2, 2, 2, 2, 2, 0, 0, 0, -100],
 'input_ids': [0,
  4239,
  151677,
  471,
  13353,
  33753,
  230432,
  1302,
  421,
  646,
  967,
  460,
  207,
  2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
tokenized_train_dataset_hindi_bio = tokenized_train_dataset_hindi_bio.remove_columns(["tokens"])
tokenized_test_dataset_hindi_bio  = tokenized_test_dataset_hindi_bio.remove_columns(["tokens"])
tokenized_dev_dataset_hindi_bio   = tokenized_develop_dataset_hindi_bio.remove_columns(["tokens"])

In [11]:
from transformers import DataCollatorForTokenClassification

data_collator_hindi_bio = DataCollatorForTokenClassification(tokenizer_hindi_bio)

In [12]:
label2id_bio = {"O": 0, "B": 1 ,"I":2}
id2label_bio = {0: "O", 1: "B" , 2:"I"}

In [13]:
model_hindi_bio = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels= 3,
    id2label=id2label_bio,
    label2id=label2id_bio
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args_hindi_bio = TrainingArguments(
    output_dir="./model_hindi_bio",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer_hindi_bio = Trainer(
    model=model_hindi_bio,
    args=training_args_hindi_bio,
    train_dataset=tokenized_train_dataset_hindi_bio,
    eval_dataset=tokenized_test_dataset_hindi_bio,
    tokenizer=tokenizer_hindi_bio,
    data_collator=data_collator_hindi_bio,
    compute_metrics=compute_metrics,
)

trainer_hindi_bio.train()
trainer_hindi_bio.save_model("./hindi_model_bio")
tokenizer_hindi_bio.save_pretrained("./hindi_model_bio")



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.119,0.08931,0.967982,0.966086,0.967033,0.981239
2,0.0602,0.074778,0.974907,0.976422,0.975664,0.986344
3,0.0267,0.076523,0.978709,0.978246,0.978478,0.987325


('./hindi_mode_bio/tokenizer_config.json',
 './hindi_mode_bio/special_tokens_map.json',
 './hindi_mode_bio/tokenizer.json')

The structure of the code here follows Part 1, except for the data reading section.

# Part3‚Äì Performance Analysis

&nbsp;&nbsp;&nbsp;&nbsp;The English chunk model achieved moderate precision, low recall and F1, but extremely high accuracy, indicating that while it correctly predicts many non-chunk tokens, it struggles with identifying actual chunks. In contrast, the Hindi chunk model performed impressively, achieving precision, recall, F1, and accuracy of 0.9787, 0.9782, 0.9785, and 0.9873 respectively, reflecting both strong overall performance and balanced token-level predictions.

# Bonus‚Äì Using BERT with Our Data

In [2]:
def read_hindi_data(inputfilename):
    with open(inputfilename, "r", encoding="utf-8") as inputfile:
        sentences = []
        collection_words = []
        collection_labels = []

        for line in inputfile:
            if line.startswith("#"):
                continue

            columns = line.split()
            if not columns:
                if collection_words:
                    sentences.append((collection_words, collection_labels))
                    collection_words = []
                    collection_labels = []
                continue

            word = columns[1]
            chunk_type = None
            for part in columns[-1].split("|"):
                if part.startswith("ChunkType="):
                    chunk_type = part.split("=")[1]
                    break

            collection_words.append(word)
            if chunk_type == "head":
                collection_labels.append(1)
            else:
                collection_labels.append(0)

        if collection_words:
            sentences.append((collection_words, collection_labels))

    return sentences


In [3]:
train_data_for_hindi = read_hindi_data("dataset/hi_hdtb-ud-train.conllu")
test_data_for_hindi = read_hindi_data("dataset/hi_hdtb-ud-test.conllu")
develop_data_for_hindi = read_hindi_data("dataset/hi_hdtb-ud-dev.conllu")

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer_for_hindi = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [2]:
example_hindi = train_data_for_hindi[0]
tokenized_input_hindi = tokenizer_for_hindi(example_hindi[0], is_split_into_words=True, truncation=True,padding=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input_hindi["input_ids"])
tokenized_input_hindi

NameError: name 'train_data_for_hindi' is not defined

In [None]:
def tokenize_and_align_labels_for_hindi(example):
    tokenized_input = tokenizer_for_hindi(
        example["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    word_ids = tokenized_input.word_ids()
    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(example["labels"][word_idx])
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx

    tokenized_input["labels"] = label_ids
    return tokenized_input


In [None]:
from datasets import Dataset

train_dataset_for_hindi = Dataset.from_list([{"tokens": tokens, "labels": labels} for tokens, labels in train_data_for_hindi])
test_dataset_for_hindi = Dataset.from_list([{"tokens": tokens, "labels": labels} for tokens, labels in test_data_for_hindi])
develop_dataset_for_hindi = Dataset.from_list([{"tokens": tokens, "labels": labels} for tokens, labels in develop_data_for_hindi])

In [None]:
train_dataset_for_hindi[0]

{'tokens': ['‡§Ø‡§π',
  '‡§è‡§∂‡§ø‡§Ø‡§æ',
  '‡§ï‡•Ä',
  '‡§∏‡§¨‡§∏‡•á',
  '‡§¨‡§°‡§º‡•Ä',
  '‡§Æ‡§∏‡•ç‡§ú‡§ø‡§¶‡•ã‡§Ç',
  '‡§Æ‡•á‡§Ç',
  '‡§∏‡•á',
  '‡§è‡§ï',
  '‡§π‡•à',
  '‡•§'],
 'labels': [0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1]}

In [None]:
tokenized_train_dataset_for_hindi = train_dataset_for_hindi.map(tokenize_and_align_labels_for_hindi, batched=False)
tokenized_test_dataset_for_hindi  = test_dataset_for_hindi.map(tokenize_and_align_labels_for_hindi, batched=False)
tokenized_dev_dataset_for_hindi   = develop_dataset_for_hindi.map(tokenize_and_align_labels_for_hindi, batched=False)

Map:   0%|          | 0/13306 [00:00<?, ? examples/s]

Map:   0%|          | 0/1684 [00:00<?, ? examples/s]

Map:   0%|          | 0/1659 [00:00<?, ? examples/s]

In [None]:
tokenized_train_dataset_for_hindi[0]

{'tokens': ['‡§Ø‡§π',
  '‡§è‡§∂‡§ø‡§Ø‡§æ',
  '‡§ï‡•Ä',
  '‡§∏‡§¨‡§∏‡•á',
  '‡§¨‡§°‡§º‡•Ä',
  '‡§Æ‡§∏‡•ç‡§ú‡§ø‡§¶‡•ã‡§Ç',
  '‡§Æ‡•á‡§Ç',
  '‡§∏‡•á',
  '‡§è‡§ï',
  '‡§π‡•à',
  '‡•§'],
 'labels': [-100, 0, 1, 0, 0, 0, 1, -100, 0, 0, 1, 1, 1, -100],
 'input_ids': [0,
  4239,
  151677,
  471,
  13353,
  33753,
  230432,
  1302,
  421,
  646,
  967,
  460,
  207,
  2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenized_train_dataset_for_hindi = tokenized_train_dataset_for_hindi.remove_columns(["tokens"])
tokenized_test_dataset_for_hindi  = tokenized_test_dataset_for_hindi.remove_columns(["tokens"])
tokenized_dev_dataset_for_hindi   = tokenized_dev_dataset_for_hindi.remove_columns(["tokens"])

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator_for_hindi = DataCollatorForTokenClassification(tokenizer_for_hindi)

In [None]:
label2id = {"Child": 0, "Head": 1}
id2label = {0: "Child", 1: "Head"}

In [None]:
model_for_hindi = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels= 2,
    id2label=id2label,
    label2id=label2id
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
training_args_for_hindi = TrainingArguments(
    output_dir="./model_hindi",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer_for_hindi = Trainer(
    model=model_for_hindi,
    args=training_args_for_hindi,
    train_dataset=tokenized_train_dataset_for_hindi,
    eval_dataset=tokenized_test_dataset_for_hindi,
    tokenizer=tokenizer_for_hindi,
    data_collator=data_collator_for_hindi,
    compute_metrics=compute_metrics,
)

trainer_for_hindi.train()
trainer_for_hindi.save_model("./hindi_model_head_child")
tokenizer_for_hindi.save_pretrained("./hindi_model_head_child")



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.057,0.041887,0.992285,0.989951,0.991117,0.990629
2,0.0364,0.033155,0.995969,0.990379,0.993166,0.992803
3,0.014,0.035918,0.994861,0.993319,0.994089,0.993762


('./hindi_model_head_child/tokenizer_config.json',
 './hindi_model_head_child/special_tokens_map.json',
 './hindi_model_head_child/tokenizer.json')

&nbsp;&nbsp;&nbsp;&nbsp;Since this is a simple binary classification task, I chose to train a separate classifier. The results show that it achieves excellent performance (Precision: 0.994861, Recall: 0.993319, F1: 0.994089, Accuracy: 0.993762). The code structure follows Part 1 and Part 2, with only minor changes for reading the data and defining the label mappings: