In [2]:
!pip install -q seqeval


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
# loading the libraries
import logging
import os
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import Dataset, Features, Value, ClassLabel
from datasets import load_dataset,concatenate_datasets
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

In [None]:
# Mount Google Drive at /content/drive. This needs the google.colab package,
# so the cell only works inside a Colab runtime. The data and model paths
# used by this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [5]:
# Directory on the mounted drive that holds the CoNLL files. Changing this
# single constant points the notebook at a different data location.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Kifiya Challenge/week4/data'

# CoNLL files to load, as plain string paths in a fixed order.
file_path = [
    f"{DATA_DIR}/{file_name}"
    for file_name in (
        'labelled_data.conll',
        'amharic_ner.conll',
        'ner_auto_labels.conll',
    )
]

In [None]:
# Paths of the two model directories on the mounted drive. Nothing is loaded
# in this cell; the paths are passed to `from_pretrained` later on.
MODEL_NAME_1 = "/content/drive/MyDrive/week4/model/xlm-roberta-model"
MODEL_NAME_2 = "/content/drive/MyDrive/week4/model/bert-tiny-amh-model"

In [None]:
# Tokenizer stored with the first checkpoint (MODEL_NAME_1).
# tokenize_and_align_labels looks up this module-level name when it runs.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_1)

In [None]:
def parse_conll_file(filepath):
    """Read a two-column CoNLL file into a DataFrame of sentences.

    Every non-blank line that splits on whitespace into exactly two fields
    is read as ``token label``. Lines with any other number of fields are
    skipped. A blank (or whitespace-only) line closes the current sentence;
    a sentence still open at end of file is kept as well.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A pandas DataFrame with one row per sentence and two list-valued
        columns, ``tokens`` and ``ner_tags``, of equal length per row.
    """
    all_tokens, all_tags = [], []
    cur_tokens, cur_tags = [], []

    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()

            if not fields:
                # Sentence boundary: store what was collected, if anything.
                if cur_tokens:
                    all_tokens.append(cur_tokens)
                    all_tags.append(cur_tags)
                    cur_tokens, cur_tags = [], []
                continue

            if len(fields) != 2:
                # Not a "token label" pair; ignore the line.
                continue

            word, tag = fields
            cur_tokens.append(word)
            cur_tags.append(tag)

    # The file may end without a trailing blank line.
    if cur_tokens:
        all_tokens.append(cur_tokens)
        all_tags.append(cur_tags)

    return pd.DataFrame({'tokens': all_tokens, 'ner_tags': all_tags})

In [None]:
# Load each file into its own Dataset. Loading is best-effort per file: a
# file that fails to parse or convert is reported and skipped so the
# remaining files can still be used.
dataset_list = []
for path in file_path:
    try:
        df = parse_conll_file(path)
        if not df.empty:
            dataset = Dataset.from_pandas(df)
            dataset_list.append(dataset)
            print(f" Loaded {len(df)} sentences from: {path}")
        else:
            print(f"File is empty: {path}")
    except Exception as e:
        print(f" Error reading {path}: {e}")

# Combine datasets. Stop here when nothing was loaded: otherwise `c_dataset`
# would be left undefined (or, in a reused kernel, would silently keep the
# value from an earlier run) and the following cells would fail or evaluate
# stale data.
if not dataset_list:
    raise RuntimeError("No datasets were successfully loaded.")

c_dataset = concatenate_datasets(dataset_list)
print(" Combined dataset with total samples:", len(c_dataset))

In [None]:
# Collect every tag string that occurs in the combined dataset.
all_tags = [tag for sentence_tags in c_dataset['ner_tags'] for tag in sentence_tags]

# Distinct tags in sorted order; a tag's position in this list is its id.
unique_tags = sorted(set(all_tags))  # -> ENTITY_LABELS
id2label = dict(enumerate(unique_tags))
label2id = {tag: tag_id for tag_id, tag in id2label.items()}
print("Label2ID mapping:", label2id)

#  Encode ner_tags to IDs
def encode_labels(example):
    """Replace one example's tag strings with their integer ids.

    Args:
        example: Mapping with a ``"ner_tags"`` entry holding tag strings.

    Returns:
        A dict with a single ``"ner_tags"`` key holding the ids looked up in
        the module-level ``label2id``. A tag missing from ``label2id``
        raises ``KeyError``.
    """
    tag_ids = []
    for tag in example["ner_tags"]:
        tag_ids.append(label2id[tag])
    return {"ner_tags": tag_ids}

# Apply label encoding to dataset: `encode_labels` runs once per example and
# replaces the string tags in "ner_tags" with their integer ids.
eval_dataset = c_dataset.map(encode_labels)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word labels to sub-tokens.

    Uses the module-level ``tokenizer``. For each sentence, the first
    sub-token of every word receives that word's label; positions whose
    word id is ``None`` and the remaining sub-tokens of a word receive
    ``-100``.

    Args:
        examples: Batch mapping with ``'tokens'`` (list of word lists) and
            ``'ner_tags'`` (list of per-word label lists).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per sentence.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples['ner_tags']):
        word_ids = encoded.word_ids(batch_index=batch_idx)
        # Pair every position with the word id of the position before it
        # (None for the very first position).
        preceding_ids = [None, *word_ids[:-1]]
        aligned_labels.append([
            word_labels[current] if current is not None and current != before else -100
            for current, before in zip(word_ids, preceding_ids)
        ])

    encoded["labels"] = aligned_labels
    return encoded

In [None]:
# Tokenize the dataset in batches; each batch gets the tokenizer's output
# plus a "labels" column aligned to the sub-tokens. The name `eval_dataset`
# is rebound to the tokenized result.
eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
def compute_metrics(p):
    """Turn model scores and label ids into seqeval metrics.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2 to one id per position; ``labels`` holds
            the reference ids, with ``-100`` at positions to be ignored.

    Returns:
        Dict with ``"f1"``, ``"precision"``, ``"recall"`` and ``"accuracy"``
        computed on tag strings (ids mapped through the module-level
        ``id2label``) after dropping every position labelled ``-100``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Reference tags, keeping only positions that carry a real label.
    gold_tags = []
    for gold_row in gold_ids:
        gold_tags.append([id2label[gold] for gold in gold_row if gold != -100])

    # Predicted tags at exactly those same positions.
    pred_tags = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        pred_tags.append(
            [id2label[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        )

    scorers = {
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
        "accuracy": accuracy_score,
    }
    return {name: scorer(gold_tags, pred_tags) for name, scorer in scorers.items()}


In [None]:
# Evaluate each checkpoint on the labelled data and print its metrics.
#
# NOTE(review): id2label/label2id are derived from the tags found in the
# evaluation files (sorted tag strings). Confirm this is the same mapping
# each checkpoint was trained with; if it is not, the scores below do not
# describe the models.
for model_path in [MODEL_NAME_1, MODEL_NAME_2]:
    print(f"\nEvaluating model: {model_path}")

    # Every checkpoint must be fed ids produced by its own tokenizer.
    # Rebinding the module-level `tokenizer` is intentional:
    # tokenize_and_align_labels reads that name when it is called.
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Re-tokenize from the word-level columns only, so no column produced by
    # a previous tokenizer survives into this model's inputs.
    word_level_columns = ("tokens", "ner_tags")
    stale_columns = [
        column for column in eval_dataset.column_names
        if column not in word_level_columns
    ]
    model_eval_dataset = eval_dataset.remove_columns(stale_columns).map(
        tokenize_and_align_labels, batched=True
    )

    # The number of labels is taken from the mapping that is passed along
    # with it, so the two can never disagree.
    model = AutoModelForTokenClassification.from_pretrained(
        model_path, num_labels=len(label2id), id2label=id2label, label2id=label2id
    )
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        # Sentences differ in length; this collator pads the "labels" lists
        # together with the inputs so they can be batched.
        data_collator=DataCollatorForTokenClassification(tokenizer),
        compute_metrics=compute_metrics,
    )
    results = trainer.evaluate(model_eval_dataset)
    print(f"Results for {model_path}: {results}")