In [3]:
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Binary tagging scheme: "O" marks tokens outside the answer, "ANS" tokens inside it.
label_list = ["O", "ANS"]
# Forward (name -> index) and reverse (index -> name) lookups over label_list.
label_to_id = {name: index for index, name in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))

# Passed to the tokenizer in create_token_labels as `max_length` and `stride`.
max_length = 384
doc_stride = 128


def create_token_labels(examples):
    """Tokenize question/context pairs and attach per-token answer labels.

    Args:
        examples: Batch mapping with the columns ``"question"``,
            ``"context"``, ``"answer_start"`` and ``"answer"``. An
            ``answer_start`` of ``-1`` is treated as "no answer". The
            ``"question"`` column is overwritten in place with its
            left-stripped version.

    Returns:
        The tokenizer output (with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed) plus a ``"labels"`` entry holding one list
        per produced feature: ``-100`` for every token that is not part of the
        context sequence, otherwise the id of ``"ANS"`` when the token's
        character span overlaps the answer span and the id of ``"O"`` when it
        does not.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # Which column goes first depends on the module-level `pad_on_right` flag.
    if pad_on_right:
        first_column, second_column = "question", "context"
    else:
        first_column, second_column = "context", "question"

    tokenized = tokenizer(
        examples[first_column],
        examples[second_column],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One input example may yield several features; this maps each feature
    # back to the index of the example it was produced from.
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offsets_mapping = tokenized.pop("offset_mapping")

    # Invariant across features, so resolved once up front.
    context_id = 1 if pad_on_right else 0
    outside_id = label_to_id["O"]
    answer_id = label_to_id["ANS"]

    labels = []
    for feature_idx, offsets in enumerate(offsets_mapping):
        sequence_ids = tokenized.sequence_ids(feature_idx)
        sample_idx = sample_mapping[feature_idx]
        answer_start = examples["answer_start"][sample_idx]
        answer_text = examples["answer"][sample_idx]
        # Character span of the answer is [answer_start, answer_end).
        if answer_start == -1:
            answer_end = -1
        else:
            answer_end = answer_start + len(answer_text)

        example_labels = []
        for sequence_id, offset in zip(sequence_ids, offsets):
            # Anything that is not a context token (a `None` sequence id or
            # the other sequence) is masked with -100.
            if sequence_id != context_id:
                example_labels.append(-100)
                continue
            # Unanswerable example (or missing offset): no answer tokens.
            if answer_start == -1 or offset is None:
                example_labels.append(outside_id)
                continue
            token_start, token_end = offset
            overlaps_answer = token_start < answer_end and token_end > answer_start
            example_labels.append(answer_id if overlaps_answer else outside_id)
        labels.append(example_labels)

    tokenized["labels"] = labels
    return tokenized


def compute_token_metrics(eval_preds):
    """Compute token-level metrics for the ``"ANS"`` class.

    Args:
        eval_preds: Pair ``(logits, labels)``. ``logits`` is arg-maxed over
            its last axis to obtain predicted label ids; ``labels`` holds the
            reference label ids, with ``-100`` marking positions to skip.

    Returns:
        Dict with ``"precision"``, ``"recall"`` and ``"f1"`` for the
        ``"ANS"`` class, and ``"accuracy"`` over all evaluated tokens.
    """
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Only evaluate on labelled tokens (labels != -100). A boolean mask
    # selects them in one step instead of a Python loop over every token.
    evaluated = labels != -100
    true_labels = labels[evaluated]
    pred_labels = preds[evaluated]

    # With average="binary" the positive class is chosen by `pos_label`;
    # the `labels` argument only applies to the other averaging modes.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels,
        pred_labels,
        pos_label=label_to_id["ANS"],
        average="binary",
    )
    acc = accuracy_score(true_labels, pred_labels)
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": acc}


# Train and evaluate one token classifier per language, keeping each
# language's evaluation metrics in results_by_lang.
results_by_lang = {}

# The collator depends only on the tokenizer, so it is built once and reused.
data_collator = DataCollatorForTokenClassification(tokenizer)

for lang in languages:
    print(f"Training token-classifier for language: {lang}")
    # Restrict both splits to the current language.
    lang_train = train_dataset.filter(lambda ex: ex["lang"] == lang)
    lang_val = val_dataset.filter(lambda ex: ex["lang"] == lang)

    # Replace the raw columns with tokenized features and per-token labels.
    tokenized_lang_train = lang_train.map(
        create_token_labels,
        batched=True,
        remove_columns=lang_train.column_names,
    )
    tokenized_lang_val = lang_val.map(
        create_token_labels,
        batched=True,
        remove_columns=lang_val.column_names,
    )

    # A fresh model is loaded from the checkpoint for every language.
    model_tc = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(label_list),
        id2label=id_to_label,
        label2id=label_to_id,
    )
    args_tc = TrainingArguments(
        output_dir=f"seq-lab-{lang}",
        eval_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        logging_steps=50,
        save_strategy="no",
        report_to=[],
    )

    trainer_tc = Trainer(
        model=model_tc,
        args=args_tc,
        train_dataset=tokenized_lang_train,
        eval_dataset=tokenized_lang_val,
        # `tokenizer=` is deprecated on Trainer in favour of `processing_class`.
        # NOTE(review): needs a transformers release that accepts
        # `processing_class` - confirm against the pinned version.
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_token_metrics,
    )

    trainer_tc.train()
    metrics = trainer_tc.evaluate()
    results_by_lang[lang] = metrics
    print(f"Results for {lang}: {metrics}")

results_by_lang


Training token-classifier for language: ar


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_tc = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1311,0.101309,0.598485,0.045559,0.084673,0.972713


Results for ar: {'eval_loss': 0.10130859166383743, 'eval_precision': 0.5984848484848485, 'eval_recall': 0.04555940023068051, 'eval_f1': 0.08467309753483387, 'eval_accuracy': 0.972712603645775, 'eval_runtime': 2.9675, 'eval_samples_per_second': 146.926, 'eval_steps_per_second': 18.534, 'epoch': 1.0}
Training token-classifier for language: ko


Filter:   0%|          | 0/6335 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/2422 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_tc = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0901,0.113132,0.630662,0.111453,0.18943,0.967272


Results for ko: {'eval_loss': 0.11313216388225555, 'eval_precision': 0.6306620209059234, 'eval_recall': 0.11145320197044335, 'eval_f1': 0.18942961800104657, 'eval_accuracy': 0.9672723431227551, 'eval_runtime': 2.3927, 'eval_samples_per_second': 151.295, 'eval_steps_per_second': 19.225, 'epoch': 1.0}
Training token-classifier for language: te


Filter:   0%|          | 0/6335 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/1355 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_tc = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1181,0.078299,0.514286,0.015831,0.030717,0.980213


Results for te: {'eval_loss': 0.07829934358596802, 'eval_precision': 0.5142857142857142, 'eval_recall': 0.0158311345646438, 'eval_f1': 0.030716723549488054, 'eval_accuracy': 0.9802125065319631, 'eval_runtime': 2.5508, 'eval_samples_per_second': 152.108, 'eval_steps_per_second': 19.209, 'epoch': 1.0}


{'ar': {'eval_loss': 0.10130859166383743,
  'eval_precision': 0.5984848484848485,
  'eval_recall': 0.04555940023068051,
  'eval_f1': 0.08467309753483387,
  'eval_accuracy': 0.972712603645775,
  'eval_runtime': 2.9675,
  'eval_samples_per_second': 146.926,
  'eval_steps_per_second': 18.534,
  'epoch': 1.0},
 'ko': {'eval_loss': 0.11313216388225555,
  'eval_precision': 0.6306620209059234,
  'eval_recall': 0.11145320197044335,
  'eval_f1': 0.18942961800104657,
  'eval_accuracy': 0.9672723431227551,
  'eval_runtime': 2.3927,
  'eval_samples_per_second': 151.295,
  'eval_steps_per_second': 19.225,
  'epoch': 1.0},
 'te': {'eval_loss': 0.07829934358596802,
  'eval_precision': 0.5142857142857142,
  'eval_recall': 0.0158311345646438,
  'eval_f1': 0.030716723549488054,
  'eval_accuracy': 0.9802125065319631,
  'eval_runtime': 2.5508,
  'eval_samples_per_second': 152.108,
  'eval_steps_per_second': 19.209,
  'epoch': 1.0}}