In [1]:
from datasets_hutils import generate_mixed_hindi_nepali_dataset
import evaluate
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import torch
from tokenization_utils import tokenize_and_match_word_ids_with_labels, muril_tokenizer
import numpy as np

metric = evaluate.load("seqeval")


In [2]:
mixed_nepali_hindi_dataset = generate_mixed_hindi_nepali_dataset()
mixed_nepali_hindi_dataset

Labelling नेपाली/हिंदी sentences: 100%|##########| 46432/46432 [00:00<?, ? examples/s]

Mixing labelled sentences: 100%|##########| 46432/46432 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['मिश्रित_वाक्य', 'चिन्ह'],
        num_rows: 46432
    })
})

In [None]:
mixed_dataset_tokenized = mixed_nepali_hindi_dataset.map(tokenize_and_match_word_ids_with_labels, batched=True)
# Finally now, maybe we can do some training. The data handling and tokenization steps are complete.

final_dataset = mixed_dataset_tokenized.remove_columns(["मिश्रित_वाक्य", "चिन्ह"])

#max_train_examples = 200  # adjust as needed
#train_base = final_dataset["train"].shuffle(seed=42)
#train_base = train_base.select(range(min(max_train_examples, len(train_base))))
#final_dataset = train_base.train_test_split(test_size=0.1, seed=42)

final_dataset = final_dataset["train"].train_test_split(test_size=0.1)

final_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 180
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20
    })
})

In [4]:
#padded_batch = classification_collator([mixed_dataset_tokenized["train"][i] for i in range(2)])
classification_collator = DataCollatorForTokenClassification(tokenizer=muril_tokenizer)


In [5]:
   label_names = ["B-NEP", "I-NEP", "B-HI", "I-HI"]
   id2label = {i: name for i, name in enumerate(label_names)}
   label2id = {name: i for i, name in id2label.items()}

In [6]:
def compute_metrics(p):
    logits, labels = p
    # predictions and labels are lists of lists, i.e batched. They are batched for each sentence.
    predictions = np.argmax(logits, axis=-1) # convert the logits to predcited labels
    # convert the labels to the label names
    labels = [[label_names[each_label] for each_label in each_sentence_labels if each_label != -100] for each_sentence_labels in labels]
    # labels is a list of lists, each list is a sentence with labels as string instead of ids.

    # then convert the predicted labels to the label names
    predictions = [[label_names[each_prediction] for each_prediction, each_label in zip(each_sentence_predictions, each_sentence_labels) if each_label != -100] for each_sentence_predictions, each_sentence_labels in zip(predictions, labels)]

    metrics_result = metric.compute(predictions=predictions, references=labels)
    
    return {
        "precision": metrics_result["overall_precision"],
        "recall": metrics_result["overall_recall"],
        "f1": metrics_result["overall_f1"],
        "accuracy": metrics_result["overall_accuracy"],
    }
    

In [7]:
# Now create the training arguments

model = AutoModelForTokenClassification.from_pretrained(
    "google/muril-base-cased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    "hindi-nepali-token-classification",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["test"],
    data_collator=classification_collator,
    compute_metrics=compute_metrics,
    processing_class=muril_tokenizer
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:

trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.3245,0.407407,0.511628,0.453608,0.481481
2,No log,1.30387,0.407407,0.511628,0.453608,0.481481
3,No log,1.296399,0.407407,0.511628,0.453608,0.481481


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=69, training_loss=1.3328271395918252, metrics={'train_runtime': 38.4235, 'train_samples_per_second': 14.054, 'train_steps_per_second': 1.796, 'total_flos': 19824208488864.0, 'train_loss': 1.3328271395918252, 'epoch': 3.0})

In [9]:
trainer.push_to_hub(commit_message="Training complete")


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/Emalper/hindi-nepali-token-classification/commit/3e20364fdfe41d42a55efaac9186a583ddf01c0b', commit_message='Training complete', commit_description='', oid='3e20364fdfe41d42a55efaac9186a583ddf01c0b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Emalper/hindi-nepali-token-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='Emalper/hindi-nepali-token-classification'), pr_revision=None, pr_num=None)