In [1]:
import re
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import evaluate
import numpy as np
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import torch

In [2]:
device = torch.device('cuda:2')

In [3]:
source_path = "../../../../srv/data/lt2326-h25/a2"

In [4]:
def chunk_regex(example):
    function = "ChunkId=[A-z]*"
    x = re.findall(function, example)
    return x[0][8:]

In [5]:
def file_to_DSformat(file_name):
    list_datasetlike = []
    with open(source_path + file_name) as file:
        
        reader = file.read()
        lines = reader.split("\n")
        counter = 0
        sentence_dict = {'id':0}
    
        for line in lines:
            line_split = line.split()
            
            if line == "":
                list_datasetlike.append(sentence_dict)
                sentence_dict = {}
                counter += 1
                sentence_dict['id'] = counter
            
            elif line_split[0] == "#":
                continue
                
            else:
                
                if 'tokens' in sentence_dict.keys():
                    sentence_dict['tokens'].append(line_split[1])
                    
                else:
                    sentence_dict['tokens'] = [line_split[1]]

                if 'chunk_tags' in sentence_dict.keys():
                    sentence_dict['chunk_tags'].append(chunk_regex(line_split[9]))
                    
                else:
                    sentence_dict['chunk_tags'] = [chunk_regex(line_split[9])]
                    
    # removing last empty dictionary from a list
    list_datasetlikeshorter = list_datasetlike[:-1]
    return list_datasetlikeshorter

test_dataset_untkn = file_to_DSformat("/hi_hdtb-ud-test.conllu")[:100]
dev_dataset_untkn = file_to_DSformat("/hi_hdtb-ud-dev.conllu")[:100]
train_dataset_untkn = file_to_DSformat("/hi_hdtb-ud-train.conllu")[:1000]
list_data_untkn = [test_dataset_untkn, dev_dataset_untkn, train_dataset_untkn]

In [6]:
print(len(test_dataset_untkn))
print(len(dev_dataset_untkn))
print(len(train_dataset_untkn))

100
100
1000


In [7]:
def create_token_dict(sets_list):
    word_2_indx = {}
    indx_2_word = {}
    word_counter = 0

    tag_2_indx = {}
    indx_2_tag = {}
    tag_counter = 0

    for ds in sets_list:
        for i in range(len(ds)):

            for word in ds[i]['tokens']:
                if word not in word_2_indx:
                    word_2_indx[word] = word_counter
                    indx_2_word[word_counter] = word
                    word_counter += 1
            for tag in ds[i]['chunk_tags']:
                if tag not in tag_2_indx:
                    tag_2_indx[tag] = tag_counter
                    indx_2_tag[tag_counter] = tag
                    tag_counter += 1

    return word_2_indx, indx_2_word, tag_2_indx, indx_2_tag
word_2_indx, indx_2_word, tag_2_indx, indx_2_tag = create_token_dict(list_data_untkn)

In [8]:
def tokenize(ds):
    big_list = []
    for i in range(len(ds)):
        new_dict = {}
        index = ds[i]['id']
        tokenised_words = []
        tokenised_tags = []
        for word in ds[i]['tokens']:
            tokenised_words.append(word_2_indx[word])
        for tag in ds[i]['chunk_tags']:
            tokenised_tags.append(tag_2_indx[tag])
        new_dict['id'] = index
        new_dict['input_ids'] = tokenised_words
        new_dict['labels'] = tokenised_tags
        big_list.append(new_dict)

    return big_list

In [10]:
test_dataset = tokenize(test_dataset_untkn)

In [11]:
dev_dataset = tokenize(dev_dataset_untkn)

In [12]:
train_dataset = tokenize(train_dataset_untkn)

In [13]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")

In [14]:
# tokenize_and_align_labels(test_dataset)

In [15]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [16]:
seqeval = evaluate.load("seqeval")

In [17]:
label_list = list(tag_2_indx.keys())

In [18]:
len(label_list)

9

In [19]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [20]:
# change number of labels and staff, change model
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(label_list), id2label=indx_2_tag, label2id=tag_2_indx
).to(device)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# change relevant part, change datasets, make sure right tokeniser and matrix

training_args = TrainingArguments(
    output_dir="hindi_model3",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.676351,0.525597,0.301961,0.383562,0.820302
2,No log,0.619149,0.493421,0.294118,0.36855,0.829904


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=250, training_loss=0.9067395629882813, metrics={'train_runtime': 87.1848, 'train_samples_per_second': 22.94, 'train_steps_per_second': 2.867, 'total_flos': 15815109455712.0, 'train_loss': 0.9067395629882813, 'epoch': 2.0})