### Fine-tuning a transformer for Amharic NER on CoNLL-formatted data

In [None]:
# 1. Setup Environment (in Google Colab)
# Quietly install transformers, datasets, seqeval and accelerate from the package index.
!pip install transformers datasets seqeval accelerate -q
# Install transformers a second time, directly from the GitHub repository's default branch.
# NOTE(review): this presumably replaces the release installed on the previous line with an
# unpinned development build — confirm that is intended, or pin explicit versions instead.
!pip install -q git+https://github.com/huggingface/transformers.git


In [None]:
# 2. Load CoNLL formatted data and preprocess
from datasets import load_dataset, DatasetDict
import pandas as pd
from transformers import AutoTokenizer


In [None]:
# Pretrained checkpoint to fine-tune and the tokenizer that matches it.
# The Hub repository id is hyphenated ("afro-xlmr"); the un-hyphenated
# "Davlan/afroxlmr-base" does not resolve to a model repository.
# NOTE(review): confirm both ids on the Hugging Face Hub before a long run.
model_checkpoint = "Davlan/afro-xlmr-base"  # or 'rasyosef/bert-tiny-amharic'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load your own CoNLL file manually
from datasets import Dataset

def read_conll(path):
    """Read a whitespace-separated CoNLL file into a DataFrame of sentences.

    Every non-blank line contributes one token (first column) and one tag
    (second column). One or more blank lines end the current sentence.

    Args:
        path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A pandas DataFrame with columns "tokens" and "ner_tags"; each row is
        one sentence and each cell is a list of strings.

    Raises:
        IndexError: If a non-blank line has fewer than two columns.
    """
    sentences, labels = [], []
    tokens, tags = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            splits = line.split()
            if not splits:
                # Blank line: close the sentence collected so far, if any.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            tokens.append(splits[0])
            tags.append(splits[1])
    # Flush the last sentence: files that do not end with a blank line would
    # otherwise lose it, because sentences are only closed on blank lines.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)
    return pd.DataFrame({"tokens": sentences, "ner_tags": labels})

# Parse the annotated corpus and hold out 20% of the sentences for evaluation.
train_df = read_conll("labeled_conll.txt")
dataset = Dataset.from_pandas(train_df)
# Fixed seed so that restarting the kernel reproduces the same train/test partition.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Collect the distinct tags and sort them: iterating a set of strings has no
# stable order across interpreter sessions, which would make the tag <-> id
# mapping differ from one run to the next.
label_list = sorted({tag for row in train_df["ner_tags"] for tag in row})
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

def encode(example):
    """Tokenize one pre-split sentence and align its word-level tags to tokens.

    The sentence's words are passed to the module-level ``tokenizer`` with
    ``is_split_into_words=True``. A "labels" list is then built with one entry
    per produced token:

    * tokens with no source word (``word_id is None``) get -100;
    * the first token of each word gets the id of that word's tag, looked up
      in the module-level ``label2id``;
    * further tokens of the same word get -100, so each word is labelled
      exactly once and a ``B-`` tag is not repeated on its continuation pieces.

    Args:
        example: Mapping with "tokens" (list of words) and "ner_tags"
            (list of tag strings, parallel to "tokens").

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_tags = example["ner_tags"]
    labels = []
    prev_word = None
    for word_id in tokenized_inputs.word_ids():
        if word_id is None or word_id == prev_word:
            # Special token, or a continuation piece of the word just labelled.
            labels.append(-100)
        else:
            labels.append(label2id[word_tags[word_id]])
        prev_word = word_id
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply `encode` to the dataset one example per call (no batching).
encoded_dataset = dataset.map(function=encode, batched=False)



In [None]:
# 3. Fine-tune the model
from transformers import (
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Token-classification model with one output per tag; the tag <-> id maps are
# passed along so they are stored with the model configuration.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir="./ner_model",
    # `evaluation_strategy` was renamed to `eval_strategy`; recent transformers
    # releases reject the old keyword.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)

# Sentences have different lengths, so each batch must pad "labels" together
# with the inputs. The default padding collator leaves "labels" untouched,
# which makes batching fail; this collator pads them with -100.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
)

trainer.train()
# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be loaded later as a self-contained checkpoint.
model.save_pretrained("./ner_model")
tokenizer.save_pretrained("./ner_model")

In [None]:
# Model Interpretability (SHAP + LIME for Transformers)
!pip install shap lime -q

import shap
from transformers import pipeline

# Load the fine-tuned checkpoint as an NER pipeline that merges sub-word pieces
# into whole entities. `aggregation_strategy="simple"` is the replacement for
# the deprecated `grouped_entities=True` flag and groups tokens the same way.
ner_pipeline = pipeline("ner", model="./ner_model", tokenizer="./ner_model", aggregation_strategy="simple")
# NOTE(review): shap's automatic pipeline wrapping appears to target
# text-classification style outputs — confirm it accepts a token-classification
# pipeline, or wrap the model in a custom prediction function.
explainer = shap.Explainer(ner_pipeline)
shap_values = explainer(["ዋጋ 1000 ብር ቦሌ ሞል ላይ LCD Tablet አለ"])
shap.plots.text(shap_values[0])