In [None]:
# Notebook 3: Fine-Tuning NER Model (Amharic)
# ============================================

# Install required packages
!pip install transformers datasets seqeval torch accelerate

import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

# ==============================
# 1. Load Labeled CoNLL Data
# ==============================
def read_conll(file_path):
    """
    Read a CoNLL formatted file into a list of dicts.

    Each non-blank line is split on whitespace; the first field is taken as
    the token and the second as its label. Blank (or whitespace-only) lines
    separate sentences.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file.

    Returns:
        A list with one ``{"tokens": [...], "labels": [...]}`` dict per
        sentence, in file order. An empty file yields an empty list.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    data = []
    tokens, labels = [], []
    with open(file_path, "r", encoding="utf-8") as f:
        # Stream line by line instead of loading the whole file into memory.
        for line in f:
            fields = line.split()
            if not fields:
                # Sentence boundary; consecutive blank lines add nothing.
                if tokens:
                    data.append({"tokens": tokens, "labels": labels})
                    tokens, labels = [], []
                continue
            tokens.append(fields[0])
            labels.append(fields[1])

    # Flush the final sentence: a file that does not end with a blank line
    # would otherwise lose its last record.
    if tokens:
        data.append({"tokens": tokens, "labels": labels})
    return data

# Parse the labeled training file into one {"tokens", "labels"} record per sentence.
train_data = read_conll("data/labeled/ner_train.conll")

# Convert the list of sentence records to a Hugging Face Dataset
dataset = Dataset.from_list(train_data)

# ==============================
# 2. Load Tokenizer & Model
# ==============================
# Checkpoint name shared by the tokenizer and the model loader.
model_name = "xlm-roberta-base"  # multilingual model, works with Amharic
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tag inventory: a tag's position in this list is the integer id used as its training label.
label_list = ["O", "B-Product", "I-Product", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE"]

def tokenize_and_align_labels(examples):
    """
    Tokenize pre-split sentences and align word-level tags to subword tokens.

    Args:
        examples: A batch with a "tokens" entry (list of word lists) and a
            "labels" entry (list of tag-string lists, one tag per word).

    Returns:
        The tokenizer output with an added "labels" entry: one list of
        integer label ids per example, one id per produced token. Tokens with
        no source word (special tokens) get -100. The first subword of a word
        gets the word's tag; further subwords of a "B-X" word get "I-X" when
        that tag exists in ``label_list``, so a word split into several
        pieces still forms a single entity span.

    Raises:
        ValueError: If a tag is not present in ``label_list``.
    """
    # Build the tag -> id table once instead of scanning label_list per token.
    label_to_id = {}
    for idx, name in enumerate(label_list):
        label_to_id.setdefault(name, idx)

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, word_labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Token does not come from any input word.
                label_ids.append(-100)
            else:
                tag = word_labels[word_idx]
                if word_idx == previous_word_idx and tag.startswith("B-"):
                    # Continuation piece of the same word: repeating "B-"
                    # would start a new entity on every subword.
                    inside_tag = "I-" + tag[2:]
                    if inside_tag in label_to_id:
                        tag = inside_tag
                if tag not in label_to_id:
                    raise ValueError(f"{tag!r} is not in label_list")
                label_ids.append(label_to_id[tag])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize every example and attach the aligned label ids; batched=True hands
# the function batches of examples rather than one example at a time.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# ==============================
# 3. Training Setup
# ==============================
# Size the classification head from label_list and store the id <-> tag
# mappings in the model config, so predictions and saved checkpoints carry the
# real tag names instead of generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)
data_collator = DataCollatorForTokenClassificati_
