# In [None]:
# Notebook 4: Model Comparison & Selection for Amharic NER
# ========================================================

# Install required packages
!pip install transformers datasets seqeval torch accelerate

import pandas as pd
from datasets import Dataset
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# ==============================
# 1. Define Models to Compare
# ==============================
# Checkpoint identifiers to fine-tune and compare, one training run each.
model_names = [
    # Large multilingual model
    "xlm-roberta-base",
    # Multilingual BERT
    "bert-base-multilingual-cased",
    # Small Amharic-specific model
    "afroxmlr/bert-tiny-amharic",
]

# BIO tag set. The position of a tag in this list is its integer class id:
# "O" comes first, followed by a B-/I- pair for each entity type.
label_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in ("Product", "LOC", "PRICE")
    for prefix in ("B", "I")
]

# ==============================
# 2. Load Preprocessed Dataset
# ==============================
# Load the labelled sentences written by Notebook 3.
# `Dataset.from_file` expects an Arrow table file, so the CoNLL text file is
# parsed here and handed to `Dataset.from_dict` instead.
def _read_conll(path):
    """Parse a CoNLL-style file into parallel token / label columns.

    Assumes one whitespace-separated ``token ... label`` pair per line (first
    field is the token, last field is the tag) and a blank line between
    sentences -- TODO confirm against the file produced by Notebook 3.

    Args:
        path: Location of the UTF-8 encoded CoNLL file.

    Returns:
        A dict ``{"tokens": [[str, ...], ...], "labels": [[str, ...], ...]}``
        with one inner list per sentence.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    columns = {"tokens": [], "labels": []}
    words, tags = [], []

    def flush():
        # Emit the sentence collected so far, skipping empty ones.
        if words:
            columns["tokens"].append(list(words))
            columns["labels"].append(list(tags))
            words.clear()
            tags.clear()

    with open(path, encoding="utf-8") as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            fields = raw_line.split()
            if not fields or fields[0] == "-DOCSTART-":
                flush()
                continue
            if len(fields) < 2:
                raise ValueError(
                    f"{path}:{line_no}: expected 'token label', got {raw_line!r}"
                )
            words.append(fields[0])
            tags.append(fields[-1])
    flush()  # the file may not end with a blank line
    return columns


dataset = Dataset.from_dict(_read_conll("data/labeled/ner_train.conll"))  # Adjust if necessary

# Function to tokenize and align labels (reuse from Notebook 3)
def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Only the first sub-word piece of each word receives the word's class id;
    special tokens and the remaining pieces get -100, which the metric code
    in this file filters out. Labelling every piece would repeat a ``B-`` tag
    inside a single word and turn one entity into several.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"labels"`` (one list of tag strings per sentence,
            each tag taken from the module-level ``label_list``).
        tokenizer: Callable tokenizer accepting ``truncation`` and
            ``is_split_into_words``; its result must provide
            ``word_ids(batch_index=...)`` and support item assignment.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of class ids per sentence, the same length as that sentence's
        ``word_ids``.

    Raises:
        ValueError: If a tag is not present in ``label_list``.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # Build the tag -> id lookup once instead of scanning label_list per token.
    label_to_id = {name: idx for idx, name in enumerate(label_list)}

    labels = []
    for i, word_labels in enumerate(examples["labels"]):
        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the previous word.
                label_ids.append(-100)
            else:
                tag = word_labels[word_idx]
                if tag not in label_to_id:
                    raise ValueError(f"{tag!r} is not in label_list")
                label_ids.append(label_to_id[tag])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# ==============================
# 3. Compare Models
# ==============================
results_summary = []

# Class-id <-> tag maps handed to each model config.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# Hold out 20% of the sentences for evaluation. Scoring on the training data
# would reward memorisation and make the comparison meaningless. The fixed
# seed gives every model the same train/test split.
dataset_splits = dataset.train_test_split(test_size=0.2, seed=42)


def compute_metrics(p):
    """Compute seqeval accuracy / precision / recall / F1 for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)`` as passed by ``Trainer``.
            ``predictions`` are per-token class scores reduced with
            ``argmax(axis=-1)``; ``labels`` are class ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    # Drop ignored positions (-100) and map class ids back to tag strings.
    true_labels = [
        [label_list[gold] for gold in gold_row if gold != -100]
        for gold_row in labels
    ]
    true_predictions = [
        [label_list[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(predictions, labels)
    ]
    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }


for model_name in model_names:
    print(f"\nTraining and evaluating model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Each checkpoint has its own vocabulary, so both splits are re-tokenized
    # per model.
    tokenized_dataset = dataset_splits.map(
        tokenize_and_align_labels,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
    )

    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
    )

    run_name = model_name.replace("/", "_")  # "/" would nest the output dirs
    training_args = TrainingArguments(
        output_dir=f"./results/{run_name}",
        # NOTE(review): newer transformers releases name this `eval_strategy`;
        # confirm against the installed version.
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_dir=f"./logs/{run_name}",
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        # Pads `labels` together with the inputs so that sentences of
        # different lengths can share a batch.
        data_collator=DataCollatorForTokenClassification(tokenizer),
        compute_metrics=compute_metrics,
    )

    trainer.train()
    metrics = trainer.evaluate()
    results_summary.append({"model": model_name, **metrics})

# ==============================
# 4. Compare Results
# ==============================
# Gather the per-model metric dicts into a single table, one row per model.
df_results = pd.DataFrame(data=results_summary)
print("\nModel Comparison Summary:", df_results, sep="\n")

# Optionally persist the comparison table (row index omitted).
df_results.to_csv(path_or_buf="data/ner_model_comparison.csv", index=False)
