In [None]:
import numpy as np
import evaluate

from datasets import load_dataset, Dataset

from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer


# Part1 Transformers for Sequence Classification

In [None]:
# Saving the model checkpoint name and the dataset name as constants
MODEL_ENGLISH="distilbert-base-uncased"
DATASET_ENGLISH="wnut_17"


In [None]:
# Load the dataset named by DATASET_ENGLISH
english_dataset=load_dataset(DATASET_ENGLISH)

In [None]:
# Load the tokenizer that belongs to the MODEL_ENGLISH checkpoint
tokenizer=AutoTokenizer.from_pretrained(MODEL_ENGLISH)

In [None]:
# Code taken from the tutorial
def preprocessing(examples):
    """Tokenize a batch of pre-split sentences and align their NER tags to the sub-word tokens.

    Args:
        examples: batch mapping with a "tokens" entry (one word list per sentence)
            and a "ner_tags" entry (one tag-id list per sentence).

    Returns:
        The output of the module-level ``tokenizer`` with an extra "labels" entry
        holding one aligned label list per sentence.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(tags, word_ids):
        # Only the first sub-word piece of each word carries the word's tag; special
        # tokens (word id None) and continuation pieces get -100, the value that
        # compute_metrics filters out.
        aligned = []
        last_word = None
        for word in word_ids:
            starts_new_word = word is not None and word != last_word
            aligned.append(tags[word] if starts_new_word else -100)
            last_word = word
        return aligned

    tokenized_inputs["labels"] = [
        align(tags, tokenized_inputs.word_ids(batch_index=row))
        for row, tags in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs


# batched=True: preprocessing receives a batch of examples per call rather than a single one
tokenized_ds = english_dataset.map(preprocessing, batched=True)

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

In [None]:
# Collator that is handed to the Trainer as its data_collator
data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer) # Code taken from the tutorial

In [None]:
# Load the "seqeval" metric; the compute_metrics functions call seqeval.compute on decoded tag sequences
seqeval = evaluate.load("seqeval")

In [None]:

# Tag names of the "ner_tags" feature; a tag's position in this list is its integer id.
label_list = english_dataset["train"].features["ner_tags"].feature.names

# Two-way mappings between integer ids and tag names, both derived from one enumeration.
id2label = dict(enumerate(label_list))
label2id = {label: i for i, label in id2label.items()}


print(id2label)

{0: 'O', 1: 'B-corporation', 2: 'I-corporation', 3: 'B-creative-work', 4: 'I-creative-work', 5: 'B-group', 6: 'I-group', 7: 'B-location', 8: 'I-location', 9: 'B-person', 10: 'I-person', 11: 'B-product', 12: 'I-product'}


In [None]:

# Build the token-classification model from the same checkpoint constant as the tokenizer.
# The size of the classification head is derived from label_list so it always matches
# the id2label / label2id maps.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_ENGLISH, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

In [None]:
# Code chunk taken from the tutorial


def compute_metrics(p):
    """Turn raw model outputs into seqeval precision / recall / F1 / accuracy.

    Args:
        p: pair ``(predictions, labels)``; the predicted class is taken with an
            argmax over axis 2 of ``predictions``, and ``labels`` holds gold ids
            with -100 at positions to ignore.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", read from
        the "overall_*" entries of the seqeval result.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(predicted, gold):
        # Keep only positions whose gold label is a real tag (not -100).
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    metric_names = ("precision", "recall", "f1", "accuracy")
    return {name: results[f"overall_{name}"] for name in metric_names}

In [None]:
# Code chunk taken from tutorial
# Hyper-parameters for the English run. Evaluation and checkpointing both happen once
# per epoch, and load_best_model_at_end reloads the best checkpoint afterwards.
training_args = TrainingArguments(
    output_dir="English_Model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Per-epoch evaluation drives the best-checkpoint selection above, so it runs on the
# validation split; the test split stays untouched for a final, unbiased evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.294205,0.411439,0.206673,0.275139,0.935787
2,No log,0.27317,0.590909,0.301205,0.399018,0.94096
3,No log,0.270342,0.558214,0.324374,0.410317,0.943141
4,No log,0.272618,0.523055,0.336423,0.409475,0.94344
5,No log,0.266524,0.508108,0.348471,0.413414,0.944167


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=270, training_loss=0.11503364421703198, metrics={'train_runtime': 194.7263, 'train_samples_per_second': 87.148, 'train_steps_per_second': 1.387, 'total_flos': 259035045125820.0, 'train_loss': 0.11503364421703198, 'epoch': 5.0})

# Part 2 Using BERT with Our Data

In [None]:
import pathlib
# Directory that holds the assignment data
ROOT = pathlib.Path("/srv/data/lt2326-h25/a2")
# Last expression of the cell: displays whether that directory exists
ROOT.exists()


In [None]:
from pathlib import Path

# Peek at the beginning of the training file to see how the CoNLL-U rows look.
train_file = ROOT / "hi_hdtb-ud-train.conllu"

with train_file.open(encoding="utf-8") as f:
    # zip with range caps the preview at 30 lines and stops early on a shorter file
    # instead of printing empty lines past the end.
    for _, line in zip(range(30), f):
        print(line.rstrip())


In [None]:
#Define path to the splits
# All three splits live in ROOT, so the directory is spelled out in one place only.
TRAIN_PATH = ROOT / "hi_hdtb-ud-train.conllu"
DEV_PATH   = ROOT / "hi_hdtb-ud-dev.conllu"
TEST_PATH  = ROOT / "hi_hdtb-ud-test.conllu"


Since the data comes in the CoNLL-U format and Transformer models require the data to be in a certain format, after defining the paths to the dataset I created a function that goes through each sentence in the dataset (a blank line or a comment line is treated as the start of a new sentence), collects the tokens of each sentence and adds them to a list (sentences), and reads the MISC field, which contains the ChunkId indicating the semantic group the token belongs to. Additionally, since the dataset does not explicitly have IOB labels, I had to generate them from the ChunkId as follows: O if the ChunkId is “O”; I if the ChunkId matches the previous one, so the token is part of the same entity; and B if the ChunkId does not match the previous one, so the token starts a different entity.




In [None]:
def _chunk_id_from_misc(misc):
    """Return the ``ChunkId`` value stored in a CoNLL-U MISC column, or ``"O"`` if there is none."""
    # maxsplit=1 keeps a value that itself contains "=" from breaking the key/value pairing.
    fields = dict(item.split("=", 1) for item in misc.split("|") if "=" in item)
    return fields.get("ChunkId", "O")


def read_conllu_iob(path):
    """Read a CoNLL-U file and derive token-level IOB labels from the ``ChunkId`` MISC entry.

    A blank line or a comment line ("#...") closes the sentence collected so far.
    Within a sentence each token is labelled:

    * ``"O"`` when its ChunkId is missing or equal to "O",
    * ``"B-<ChunkId>"`` when its ChunkId differs from the previous token's,
    * ``"I-<ChunkId>"`` when its ChunkId equals the previous token's.

    Args:
        path: location of the ``.conllu`` file (anything ``open`` accepts).

    Returns:
        ``(sentences, iob_labels)``: two parallel lists holding, per sentence,
        the list of token forms and the list of IOB label strings.
    """
    sentences, iob_labels = [], []
    tokens, labels = [], []
    prev_chunk_id = None

    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank line or comment: sentence boundary.
            if not line or line.startswith("#"):
                if tokens:
                    sentences.append(tokens)
                    iob_labels.append(labels)
                    tokens, labels, prev_chunk_id = [], [], None
                continue

            cols = line.split("\t")
            if len(cols) < 10:
                continue

            # Multiword-token ranges ("3-4") and empty nodes ("3.1") are not words of
            # the sentence; keeping them would add spurious tokens and break the
            # chunk continuity of the real words around them.
            if "-" in cols[0] or "." in cols[0]:
                continue

            curr_chunk_id = _chunk_id_from_misc(cols[9])

            # Reconstruct IOB tags from transitions between consecutive ChunkIds.
            if curr_chunk_id == "O":
                iob_label = "O"
            elif curr_chunk_id != prev_chunk_id:
                iob_label = f"B-{curr_chunk_id}"
            else:
                iob_label = f"I-{curr_chunk_id}"

            tokens.append(cols[1])
            labels.append(iob_label)
            prev_chunk_id = curr_chunk_id

    # The file may end without a trailing blank line: flush the last sentence.
    if tokens:
        sentences.append(tokens)
        iob_labels.append(labels)

    return sentences, iob_labels


In [None]:
#transofrm all splits
# Each call returns (sentences, IOB label sequences) for one split
train_sentences, train_labels = read_conllu_iob(TRAIN_PATH)
dev_sentences, dev_labels     = read_conllu_iob(DEV_PATH)
test_sentences, test_labels   = read_conllu_iob(TEST_PATH)




Subsequently, I transformed the labels from strings to integers and aligned the tokens and labels, similar to how it is done in the tutorial. I took the labels from all splits because, when I was trying to load the models, there were labels that were found in the dev set but not in the train set. Additionally, I transformed the data in order to be able to use the Dataset class from Hugging Face.


In [None]:
#Transform labels from string to int
# Label sequences of every split, so the vocabulary also covers tags missing from train.
all_labels = train_labels + dev_labels + test_labels

# Sorted unique tag strings; a tag's position in this list is its integer id.
# NOTE: this rebinds label_list / label2id / id2label from the English part above.
label_list = sorted({tag for sent in all_labels for tag in sent})
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}



In [None]:


def make_dataset(sentences, labels, tokenizer):
    """Tokenize pre-split sentences and build a ``Dataset`` with aligned label ids.

    Args:
        sentences: list of sentences, each a list of word strings.
        labels: list of label-string sequences, parallel to ``sentences``.
        tokenizer: callable tokenizer whose result exposes ``word_ids(batch_index=...)``.

    Returns:
        A ``datasets.Dataset`` built from the tokenizer output plus a "labels"
        column. The first sub-word piece of every word carries the word's label id
        (looked up in the module-level ``label2id``); special tokens and
        continuation pieces carry -100.
    """
    encodings = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        max_length=256,
    )

    aligned_labels = []
    for i, label_seq in enumerate(labels):
        word_ids = encodings.word_ids(batch_index=i)
        prev = None
        label_ids = []

        for w in word_ids:
            if w is None:
                label_ids.append(-100)
            elif w != prev:
                label_ids.append(label2id[label_seq[w]])
            else:
                label_ids.append(-100)
            prev = w

        # One aligned label list per sentence, appended once the whole sentence is processed.
        aligned_labels.append(label_ids)

    encodings["labels"] = aligned_labels
    # dict(...) hands Dataset.from_dict a plain column-name -> values mapping.
    return Dataset.from_dict(dict(encodings))


Prediction metrics for Hindi

In [None]:
def compute_metrics_hindi(p):
    """Score predictions for the Hindi chunking task with seqeval.

    Args:
        p: pair ``(predictions, labels)``; the predicted class id is the argmax
            over axis 2 of ``predictions``, and ``labels`` holds gold ids with
            -100 at positions to skip.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", read from
        the "overall_*" entries of the seqeval result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred, lab in zip(predicted_ids, gold_ids):
        pred_tags, gold_tags = [], []
        for pred_id, gold_id in zip(pred, lab):
            # -100 marks positions without a real label; they are left out of both sequences.
            if gold_id == -100:
                continue
            pred_tags.append(id2label[pred_id])
            gold_tags.append(id2label[gold_id])
        true_predictions.append(pred_tags)
        true_labels.append(gold_tags)

    results = seqeval.compute(
        predictions=true_predictions,
        references=true_labels
    )

    report = {}
    for metric in ("precision", "recall", "f1", "accuracy"):
        report[metric] = results["overall_" + metric]
    return report


# 1st Model

When it came to training the model, I encountered a big problem: running out of GPU memory, since I wanted to work with RoBERTa Hindi. Hence I decided to go with relatively smaller models, both DistilBERT variants. The first one is [md-nishat-008 Mixed-Distil-BERT](https://huggingface.co/md-nishat-008/Mixed-Distil-BERT), which I decided to use since it is trained on a good amount of data (560k); however, its data is code-mixed English-Hindi-Bengali.


In [None]:
MODEL_NAME = "md-nishat-008/Mixed-Distil-BERT"

# NOTE: this rebinds the global `tokenizer` that the English `preprocessing` function reads.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize and label-align the train and dev splits with this model's tokenizer
train_dataset = make_dataset(train_sentences, train_labels, tokenizer)
dev_dataset   = make_dataset(dev_sentences, dev_labels, tokenizer)

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list), # Classification head
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at md-nishat-008/Mixed-Distil-BERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collator built on the first Hindi model's tokenizer; rebinds the English data_collator
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Training
# Decided to go with only 2 epochs due to the GPU issues.
# Per-device batch size is 1, with gradients accumulated over 4 steps.
training_args = TrainingArguments(
    output_dir="./mixed-distilbert-hindi",
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-5,
    logging_steps=50,

)


# Trainer for the first Hindi model: trains on the train split, evaluates on the dev split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_hindi,
)


trainer.train()

# 2nd Model
For the second model I went with [distilbert-base-multilingual-cased](https://huggingface.co/distilbert/distilbert-base-multilingual-cased), as it is a widely used model, trained on multiple languages and on a bigger amount of data. I believe it is a good model for comparison.

In [None]:
MODEL_NAME_2 = "distilbert-base-multilingual-cased"

# Separate tokenizer for the second model, kept under its own name (tokenizer_2)
tokenizer_2 = AutoTokenizer.from_pretrained(MODEL_NAME_2)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



In [None]:
# Same sentences and labels as for the first model, tokenized with the second model's tokenizer
train_dataset_2 = make_dataset(train_sentences, train_labels, tokenizer_2)
dev_dataset_2   = make_dataset(dev_sentences, dev_labels, tokenizer_2)


In [None]:
# Second model: same label maps and number of labels as the first Hindi model
model_2 = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME_2,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Same hyper-parameters as training_args for the first Hindi model; only output_dir differs
training_args_2 = TrainingArguments(
    output_dir="./distilbert-multilingual-hindi",
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-5,
    logging_steps=50,

)


In [None]:
# Trainer for the second model; the collator is created inline from tokenizer_2
trainer_2 = Trainer(
    model=model_2,
    args=training_args_2,
    train_dataset=train_dataset_2,
    eval_dataset=dev_dataset_2,
    tokenizer=tokenizer_2,
    data_collator=DataCollatorForTokenClassification(tokenizer_2),
    compute_metrics=compute_metrics_hindi,

)


In [None]:
# Run training for the second model with the settings in training_args_2
trainer_2.train()


# Part 3
Looking at both models from Part 2, we can see that the second model performed better than the first model in all metrics. However, that does not come as a surprise: as mentioned, the first model is trained on code-mixed language, and we do not know exactly what percentage of it was actually Hindi, whereas the second model is trained on far more data, namely Wikipedia data, where Hindi is quite well represented.

In [None]:
# The test split is tokenized once per model, each time with that model's own tokenizer
test_dataset = make_dataset(test_sentences, test_labels, tokenizer)
test_dataset_2 = make_dataset(test_sentences, test_labels, tokenizer_2)

In [None]:
# Evaluate each trained model on its own tokenization of the test split
test_results_model_1 = trainer.evaluate(test_dataset)
test_results_model_2 = trainer_2.evaluate(test_dataset_2)

print("Model1:", test_results_model_1)
print("Model2:", test_results_model_2)