In [None]:
from datasets import load_dataset
# Load the 'papluca/language-identification' dataset (all of its splits).
rawsets = load_dataset('papluca/language-identification')

# Distinct label values of the training split in sorted order; a label's position
# in this list serves as its integer class id in preprocess_func.
id2label = list(set(rawsets['train']['labels']))
id2label.sort()

In [None]:
from transformers import AutoTokenizer
# Tokenizer for the same checkpoint that is loaded as the classification model.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-multilingual-cased"
)

In [None]:
def preprocess_func(examples):
    """Tokenize a batch of examples and attach integer class ids.

    Args:
        examples: A batch mapping; its 'text' entry is passed to the
            module-level ``tokenizer`` (with truncation enabled) and its
            'labels' entry is looked up in the module-level ``id2label`` list.

    Returns:
        The tokenizer output with an added 'labels' entry holding, for each
        example, the position of its label in ``id2label``.

    Raises:
        ValueError: If a label does not occur in ``id2label``.
    """
    encoded_batch = tokenizer(examples['text'], truncation=True)
    # Replace each raw label by its index in the sorted label list.
    encoded_batch['labels'] = list(map(id2label.index, examples['labels']))
    return encoded_batch

In [None]:
# Apply preprocess_func to the loaded dataset, passing whole batches of examples
# per call (preprocess_func expects lists under 'text' and 'labels').
tokenized_sets = rawsets.map(
    preprocess_func,
    batched=True,
)

In [None]:
from transformers import DataCollatorWithPadding
# preprocess_func truncates but does not pad, so padding is left to the collator,
# which is built around the same tokenizer.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the pretrained checkpoint with a classification head sized to the label set.
# The id<->label mappings are stored in the model config so that the fine-tuned
# model (and anything saved from it) reports the dataset's label names instead of
# generic LABEL_<n> placeholders. Ids follow the positions in the `id2label` list,
# matching the ids assigned in preprocess_func.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-multilingual-cased",
    num_labels=len(id2label),
    id2label=dict(enumerate(id2label)),
    label2id={label: idx for idx, label in enumerate(id2label)},
)

In [None]:
from kheiron import Trainer, TrainingOptions

# Training configuration handed to the kheiron Trainer.
# NOTE(review): metric_for_best_model / greater_is_better presumably make kheiron
# keep the checkpoint with the highest macro-F1 — confirm against the kheiron docs.
options = TrainingOptions(
    task='text-classification',
    train_batch_size=8,
    eval_batch_size=8,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    track_metrics=True,
)

# Build the trainer from the model, options, tokenized splits and padding collator.
# Evaluation uses the 'validation' split rather than 'test': the options set
# metric_for_best_model, so evaluating on the test split would let the test data
# influence model selection. The test split stays held out for a final, unbiased
# measurement.
trainer = Trainer(
    model=model,
    opts=options,
    train_set=tokenized_sets['train'],
    eval_set=tokenized_sets['validation'],
    collate_fn=data_collator,
)

In [None]:
# Start training with the kheiron Trainer built in the previous cell. As the last
# expression of the cell, whatever train() returns is shown as the cell output.
trainer.train()