In [None]:
from datasets import load_dataset, Dataset, DatasetDict, Features, Value
import os, csv
import pandas as pd
import numpy as np

# Root folder holding the TRAIN/ and TEST/ sub-folders of per-language TSV files.
data_dir = "../data"
# Language codes; each one has a {lang}_TRAIN.tsv and a {lang}_TEST.tsv file.
langs = ["NR","SS","XH","ZU"]
# lang -> {"TRAIN": DataFrame, "TEST": DataFrame}
datasets = {}

# The files are read without a header row; these names are assigned to the four columns.
column_names = ["word", "parsed", "morpheme", "tag"]
for lang in langs:
    # One read_csv call per split, with the path built from `data_dir` so the
    # location only has to be changed in one place.
    lang_set = {
        split: pd.read_csv(
            f"{data_dir}/{split}/{lang}_{split}.tsv",
            delimiter="\t",
            quoting=csv.QUOTE_NONE,  # treat quote characters as ordinary text
            names=column_names,
        )
        for split in ("TRAIN", "TEST")
    }

    datasets[lang] = lang_set

In [None]:
# Progress marker printed once the loading cell has finished.
print("loaded the datasets")

In [None]:
# tag string -> integer id, filled lazily by extract_tag in first-seen order.
mappings = {}
# integer id -> tag string (inverse of `mappings`).
mappings_r = {}
# Next unused integer id.
count = 0
def extract_tag(seq: str) -> list:
    """Convert an underscore-joined tag string into a list of integer tag ids.

    Every tag not seen before is registered in the module-level ``mappings``
    (tag -> id) and ``mappings_r`` (id -> tag) with the next free id, and
    ``count`` is advanced.

    Args:
        seq: Tags joined by ``"_"``, e.g. ``"A_B_A"``.

    Returns:
        A list of ints, one per ``"_"``-separated piece of ``seq``.
    """
    # Only `count` is rebound; the two dicts are mutated in place.
    global count
    tag_ids = []
    for tag in seq.split("_"):
        if tag not in mappings:
            mappings[tag] = count
            mappings_r[count] = tag
            count += 1
        tag_ids.append(mappings[tag])
    return tag_ids

In [None]:
# Turn the underscore-joined "morpheme" and "tag" strings of every frame into lists,
# writing the result back into the same DataFrame.
# The visiting order (languages in `langs`, TEST before TRAIN) is kept as-is because
# extract_tag assigns tag ids in first-seen order.
for lang in langs:
    for item in ("TEST", "TRAIN"):
        df = datasets[lang][item]
        df["morpheme"] = df["morpheme"].apply(lambda joined: joined.split("_"))
        df["tag"] = df["tag"].apply(extract_tag)

datasets

In [None]:
# Progress marker printed once the morpheme/tag columns have been converted.
print("mapped the input")

In [None]:
# Replace each language's {"TRAIN", "TEST"} DataFrame pair with a DatasetDict
# keyed by "train" / "test".
for lang in langs:
    lang_set = {
        target: Dataset.from_pandas(datasets[lang][source])
        for target, source in (("train", "TRAIN"), ("test", "TEST"))
    }

    datasets[lang] = DatasetDict(lang_set)

In [None]:
# Progress marker, then display the per-language `datasets` mapping as the cell output.
print("datasets created")
datasets

In [None]:
from transformers import XLMRobertaTokenizerFast
# Checkpoint name used for the tokenizer here and reused when the model is loaded.
checkpoint = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizerFast.from_pretrained(checkpoint)

In [None]:
# Sanity check on the first ten NR training rows: for each row print the number of
# tags, the number of token ids and the length of word_ids().
# NOTE: `example_text` stays bound to the last row (index 9) after the loop and is
# read again by the later cell that builds `labels`.
for i in range(10):
    example_text = datasets["NR"]["train"][i]
    t_input = tokenizer(example_text["morpheme"], is_split_into_words=True)
    print(len(example_text["tag"]), len(t_input["input_ids"]), len(t_input.word_ids()))

In [None]:
def tokenize_and_align(example, label_all_tokens = True):
    """Tokenize a batch of pre-split morpheme lists and build per-token labels.

    Args:
        example: Mapping with a "morpheme" entry (list of morpheme lists) and a
            "tag" entry (list of tag-id lists), one inner list per row.
        label_all_tokens: When true, every token of a morpheme receives that
            morpheme's tag; otherwise only the first token does and the
            remaining ones get -100.

    Returns:
        The tokenizer output with an extra "labels" entry holding one label
        list per row. Tokens whose word id is None are labelled -100
        (presumably the ignore index of the training loss -- confirm).
    """
    encoded = tokenizer(example["morpheme"], truncation=True, is_split_into_words=True)
    aligned_rows = []

    for row_index, row_tags in enumerate(example["tag"]):
        token_word_ids = encoded.word_ids(batch_index=row_index)

        row_labels = []
        last_word = None
        for current_word in token_word_ids:
            if current_word is None:
                # Token not attached to any input word.
                row_labels.append(-100)
            elif current_word == last_word and not label_all_tokens:
                # Continuation token of the same word, masked on request.
                row_labels.append(-100)
            else:
                row_labels.append(row_tags[current_word])
            last_word = current_word

        aligned_rows.append(row_labels)

    encoded["labels"] = aligned_rows
    return encoded

In [None]:
# Tokenize and label-align the NR splits only; with batched=True the function
# receives a batch of rows per call, which is why it indexes word_ids by batch_index.
tokenized_dataset = datasets["NR"].map(tokenize_and_align, batched=True)

In [None]:
from transformers import AutoModelForTokenClassification

# One output class per distinct tag collected in `mappings` (filled from all languages in `langs`).
model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=len(mappings))

In [None]:
from transformers import TrainingArguments

# Training configuration handed to the Trainer below.
args = TrainingArguments(
    output_dir="test-parse",  # directory for the Trainer's outputs
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # single pass over the training split
    weight_decay=0.01
)

In [None]:
from transformers import DataCollatorForTokenClassification
# Batch collator built from the tokenizer (presumably pads inputs and labels per batch -- confirm).
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Map the tag ids of `example_text` (left over from the sanity-check loop) back to
# tag strings, each prefixed with "_-".
# NOTE(review): the prefix presumably gives seqeval a "<prefix>-<type>" shaped label -- confirm.
labels = ["_-"+mappings_r[i] for i in example_text["tag"]]
# Display the prefixed strings next to the raw ids.
labels, example_text["tag"]

In [None]:
import seqeval
from datasets import load_metric
import seqeval.metrics

# Metric object used by compute_metrics during evaluation.
metric = load_metric("seqeval")

# Smoke test: score the preview `labels` against themselves.
metric.compute(predictions=[labels], references=[labels])
# seqeval.metrics.precision_score([labels],[labels])

In [None]:
import numpy as np
def compute_metrics(eval_preds):
    """Score predictions with the module-level ``metric``.

    Args:
        eval_preds: Pair ``(logits, label_ids)``. The arg-max over axis 2 of
            ``logits`` is taken as the predicted id for each position.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", read from the
        metric's "overall_*" entries.
    """
    logits, gold_ids = eval_preds
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Positions whose gold label is -100 are left out of both sequences.
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        predictions.append(["_-" + mappings_r[p] for p, _ in kept])
        true_labels.append(["_-" + mappings_r[g] for _, g in kept])

    scores = metric.compute(predictions=predictions, references=true_labels)

    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [None]:
from transformers import Trainer

# Wire the model, training arguments, tokenized NR splits, collator and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],  # the TEST split doubles as the evaluation set
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run training with the configuration in `args`.
trainer.train()

In [None]:
# Save the model and the tokenizer into two separate directories.
model.save_pretrained("parse_model")
tokenizer.save_pretrained("tokenizer")

In [None]:
import json

# Write the tag vocabulary into the saved model's config.json.
# `mappings_r` is id -> tag, so it belongs under "id2label"; `mappings` is
# tag -> id, so it belongs under "label2id". json writes the integer ids of
# id2label as string keys.
config_path = "parse_model/config.json"
with open(config_path) as config_file:
    config = json.load(config_file)

config["id2label"] = mappings_r
config["label2id"] = mappings

# The context manager flushes and closes the file once the dump is complete.
with open(config_path, "w") as config_file:
    json.dump(config, config_file)