In [5]:
from transformers import AutoModelForTokenClassification
import time

In [3]:

from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from notebooks.CoNLL_model import label2id, label_list, id2label

# Checkpoint identifiers passed to `from_pretrained`, in the order they are fine-tuned.
model_names = ["xlm-roberta-base", "bert-base-multilingual-cased", "Davlan/afroxlmr-base"]

# One metrics dict per fine-tuned model is appended here by the training loop.
results: list = []

# Fine-tune every candidate checkpoint on the same dataset and record its
# evaluation metrics plus the wall time spent in `trainer.train()`.
for model_name in model_names:
    print(f"\nFine-tuning model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
    )

    def encode(example):
        """Tokenize one pre-split example and align its NER tags to the tokens.

        Args:
            example: A mapping with a "tokens" list (words) and a parallel
                "ner_tags" list whose entries are keys of `label2id`.

        Returns:
            The tokenizer output with an added "labels" list, one entry per
            produced token: -100 where the token maps to no input word
            (word id is None), otherwise the label id of the word it
            belongs to.
        """
        tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
        ner_tags = example["ner_tags"]
        # NOTE(review): the original had separate "first sub-token" and
        # "continuation sub-token" branches that were byte-identical, so every
        # sub-token already received its word's label. That behaviour is kept
        # here. If continuation sub-tokens were meant to be ignored by the
        # loss (label -100), that has to be reintroduced deliberately.
        tokenized_inputs["labels"] = [
            -100 if word_id is None else label2id[ner_tags[word_id]]
            for word_id in tokenized_inputs.word_ids()
        ]
        return tokenized_inputs

    # `encode` closes over this iteration's tokenizer; it is consumed right
    # away by `map`, so the late-binding closure is not an issue.
    encoded_dataset = dataset.map(encode, batched=False)

    training_args = TrainingArguments(
        output_dir=f"./models/{model_name.replace('/', '_')}",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_dir="./logs",
        save_total_limit=1,
    )

    # Examples are tokenized without padding, so sequences in a batch differ
    # in length. The token-classification collator pads "labels" together
    # with the inputs (using -100); the Trainer's default padding collator
    # only pads the tokenizer's own fields and cannot batch ragged labels.
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments during a long training run.
    start_time = time.perf_counter()
    trainer.train()
    elapsed_time = time.perf_counter() - start_time

    metrics = trainer.evaluate()
    metrics["model_name"] = model_name
    metrics["train_time_sec"] = elapsed_time

    results.append(metrics)

# Rank the runs from lowest to highest evaluation loss, then print one
# summary line per model.
results = sorted(results, key=lambda entry: entry["eval_loss"])

print("\n\n📊 Model Comparison Results:")
for entry in results:
    print(
        f"{entry['model_name']} => "
        f"Loss: {entry['eval_loss']:.4f}, "
        f"Time: {entry['train_time_sec']:.2f} sec"
    )


Fine-tuning model: xlm-roberta-base


NameError: name 'AutoTokenizer' is not defined