# Fine-Tune Amharic NER Model
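This notebook fine-tunes pretrained transformer checkpoints (`Davlan/bert-base-amharic` and `xlm-roberta-base`) for named-entity recognition (NER) on CoNLL-formatted Amharic Telegram product data.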

In [1]:
# Mount Google Drive; later cells read the labeled CoNLL file from a path under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install transformers datasets seqeval accelerate pandas evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.6/43.6 kB 2.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0

In [None]:
# 📦 Install all dependencies (run only once)
!pip install transformers datasets seqeval evaluate -q



# 🧼 Parse CoNLL .txt file
lines = open("/content/drive/MyDrive/labeled_telegram_product_price_location.txt", "r", encoding="utf-8").read().strip().split("\n")
sentences, tokens, labels = [], [], []
for line in lines:
    if line.strip() == "":
        if tokens:
            sentences.append((tokens, labels))
            tokens, labels = [], []
    else:
        token, label = line.split()
        tokens.append(token)
        labels.append(label)
if tokens:
    sentences.append((tokens, labels))

# 🔄 Convert to Hugging Face Dataset
from datasets import Dataset
label_list = sorted(list({l for _, labs in sentences for l in labs}))
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}
hf_data = [{"tokens": t, "ner_tags": [label_to_id[l] for l in labs]} for t, labs in sentences]
dataset = Dataset.from_list(hf_data).train_test_split(test_size=0.1)

# 🔠 Tokenization + Label Alignment
from transformers import AutoTokenizer
checkpoint = "Davlan/bert-base-amharic"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_and_align(example, *, tok=None):
    """Tokenize one pre-split sentence and align its word-level tags to sub-word tokens.

    Only the first sub-word piece of each word keeps the word's tag. Positions with
    no word (``word_ids()`` gives ``None``) and continuation pieces get ``-100``;
    ``compute_metrics`` in this notebook drops ``-100`` positions, so each word is
    then scored exactly once instead of once per piece.

    Args:
        example: Mapping with "tokens" (list of words) and "ner_tags"
            (one integer tag id per word).
        tok: Optional tokenizer-like callable whose result exposes ``word_ids()``.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        The tokenizer output with an added "labels" list, one entry per sub-word token.
    """
    if tok is None:
        tok = tokenizer
    tokenized = tok(example["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    prev_word = None
    for word_id in tokenized.word_ids():
        if word_id is None or word_id == prev_word:
            # No source word, or a continuation piece of the previous word: mask it out.
            labels.append(-100)
        else:
            # First piece of a new word carries that word's tag.
            labels.append(example["ner_tags"][word_id])
        prev_word = word_id
    tokenized["labels"] = labels
    return tokenized

# Apply tokenization and label alignment to every example of both splits.
tokenized_dataset = dataset.map(tokenize_and_align)

# 🧠 Training Setup
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate
import os

# Disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

# Passing id2label/label2id stores the real tag names in the model config, so the
# saved model reports tags by name rather than by generic LABEL_n placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model scores into label strings and score them with the seqeval metric.

    Args:
        p: A pair ``(predictions, labels)``; presumably the evaluation output that
            ``Trainer`` passes in (verify against the caller). ``predictions`` must
            support ``argmax(axis=-1)``.

    Returns:
        Whatever ``metric.compute`` returns for the filtered predictions and references.
    """
    scores, gold_ids = p
    predicted_ids = scores.argmax(axis=-1)

    true_preds, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold label is not the -100 ignore marker.
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        true_preds.append([id_to_label[pred_id] for pred_id, _ in kept])
        true_labels.append([id_to_label[gold_id] for _, gold_id in kept])

    return metric.compute(predictions=true_preds, references=true_labels)

# 🔧 TrainingArguments
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Pick whichever keyword the installed version accepts so this cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

args = TrainingArguments(
    output_dir="amharic-ner-model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    **{_eval_strategy_key: "epoch"},  # evaluate on the held-out split after every epoch
)

# 🚀 Trainer
# Newer transformers releases take the tokenizer as `processing_class`; older ones as `tokenizer`.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

# 💾 Save final model locally
trainer.save_model("amharic-ner-bert")
tokenizer.save_pretrained("amharic-ner-bert")


In [None]:
# 📦 STEP 1: Install Dependencies
!pip install --upgrade transformers datasets seqeval evaluate -q

# 📂 STEP 2: Upload Your CoNLL File
from google.colab import files
uploaded = files.upload()  # Upload labeled_telegram_product_price_location.txt

# 🧼 STEP 3: Parse CoNLL Format
import pandas as pd

lines = open("/content/drive/MyDrive/labeled_telegram_product_price_location.txt", "r", encoding="utf-8").read().strip().split("\n")
sentences = []
tokens, labels = [], []

for line in lines:
    line = line.strip()
    if not line:
        if tokens:
            sentences.append((tokens, labels))
            tokens, labels = [], []
    else:
        parts = line.split()
        if len(parts) == 2:
            token, label = parts
            tokens.append(token)
            labels.append(label)
if tokens:
    sentences.append((tokens, labels))

# 📊 STEP 4: Prepare HF Dataset
from datasets import Dataset
from collections import defaultdict

label_list = sorted(list({l for _, labs in sentences for l in labs}))
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}

hf_data = [{"tokens": t, "ner_tags": [label_to_id[l] for l in labs]} for t, labs in sentences]
dataset = Dataset.from_list(hf_data).train_test_split(test_size=0.1)

# 🔤 STEP 5: Tokenize and Align Labels
from transformers import AutoTokenizer
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(example, *, tok=None):
    """Tokenize one pre-split sentence and project its word-level tags onto sub-word tokens.

    The first sub-word piece of each word receives the word's tag. Positions with
    no word (``word_ids()`` gives ``None``) and continuation pieces receive ``-100``;
    ``compute_metrics`` in this notebook filters out ``-100`` positions, so a word
    split into several pieces is scored once rather than once per piece.

    Args:
        example: Mapping with "tokens" (list of words) and "ner_tags"
            (one integer tag id per word).
        tok: Optional tokenizer-like callable whose result exposes ``word_ids()``.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        The tokenizer output with an added "labels" list, one entry per sub-word token.
    """
    active_tokenizer = tokenizer if tok is None else tok
    tokenized_inputs = active_tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    previous_word_idx = None
    labels = []
    for word_idx in tokenized_inputs.word_ids():
        starts_new_word = word_idx is not None and word_idx != previous_word_idx
        # Only the first piece of a word is labeled; everything else is masked with -100.
        labels.append(example["ner_tags"][word_idx] if starts_new_word else -100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization and label alignment to every example of both splits.
tokenized_dataset = dataset.map(tokenize_and_align_labels)

# 🧠 STEP 6: Model Setup
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate

# Passing id2label/label2id stores the real tag names in the model config, so the
# saved model reports tags by name rather than by generic LABEL_n placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Convert model scores to label strings and evaluate them with the seqeval metric.

    Args:
        p: A pair ``(predictions, labels)``; presumably the evaluation output that
            ``Trainer`` passes in (verify against the caller). ``predictions`` must
            support ``argmax(axis=-1)``.

    Returns:
        Whatever ``metric.compute`` returns for the filtered predictions and references.
    """
    raw_scores, reference_ids = p
    best_ids = raw_scores.argmax(axis=-1)

    true_preds = []
    true_labels = []
    for best_row, reference_row in zip(best_ids, reference_ids):
        pred_names = []
        gold_names = []
        for best_id, reference_id in zip(best_row, reference_row):
            if reference_id == -100:
                # -100 marks positions that must not be scored.
                continue
            pred_names.append(id_to_label[best_id])
            gold_names.append(id_to_label[reference_id])
        true_preds.append(pred_names)
        true_labels.append(gold_names)

    return metric.compute(predictions=true_preds, references=true_labels)

# ❌ Disable Weights & Biases logging
import inspect
import os
os.environ["WANDB_DISABLED"] = "true"

# 🎯 STEP 7: Training Arguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`. This cell
# upgrades transformers, so pick whichever keyword the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="xlmr-amharic-ner",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    **{_eval_strategy_key: "epoch"},  # evaluate on the held-out split after every epoch
)

# 🚀 STEP 8: Train the Model
# Newer transformers releases take the tokenizer as `processing_class`; older ones as `tokenizer`.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

# 💾 STEP 9: Save Model
trainer.save_model("amharic-ner-xlmr")
tokenizer.save_pretrained("amharic-ner-xlmr")
