In [1]:
## 1. Setup Environment
!pip install -q transformers datasets seqeval accelerate
!pip install -q evaluate seqeval transformers datasets
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# loading the libraries
import os
import pandas as pd
from datasets import Dataset, Features, Value, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer,DataCollatorForTokenClassification
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score
import numpy as np
import torch
import logging
from itertools import chain
from pathlib import Path
from datasets import load_dataset,concatenate_datasets

In [4]:
file_path = [
    '/content/drive/MyDrive/week4/data/labelled_data.conll',
    '/content/drive/MyDrive/week4/data/amharic_ner.conll',
    '/content/drive/MyDrive/week4/data/ner_auto_labels.conll',
]

In [5]:
#file path
model_output='/content/drive/MyDrive/week4/model/xlm-roberta-model'
logging_dir='/content/drive/MyDrive/week4/log'

In [6]:
# define entity types
#ENTITY_LABELS = ['O',
#                 'B-PRODUCT', 'I-PRODUCT',
#                 'B-PRICE', 'I-PRICE',
#                 'B-LOC', 'I-LOC' ]
# Map labels to IDs and vice-versa
#id2label = {i: label for i, label in enumerate(ENTITY_LABELS)}
#label2id = {label: i for i, label in enumerate(ENTITY_LABELS)}


In [7]:
# Load pre-trained XLM-Roberta or bert-tiny-amharic or afroxmlr model
MODEL_NAME = 'xlm-roberta-base'

In [8]:
def parse_conll_file(filepath):
    tokens, tags = [], []
    sentence, label_seq = [], []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    tokens.append(sentence)
                    tags.append(label_seq)
                    sentence, label_seq = [], []
            else:
                if len(line.split()) == 2:
                    token, label = line.split()
                    sentence.append(token)
                    label_seq.append(label)
    # Add last sentence
    if sentence:
        tokens.append(sentence)
        tags.append(label_seq)

    # ✅ Return DataFrame
    return pd.DataFrame({'tokens': tokens, 'ner_tags': tags})

In [9]:
# Load each file
dataset_list = []
for path in file_path:
    try:
        df = parse_conll_file(path)
        if not df.empty:
            dataset = Dataset.from_pandas(df)
            dataset_list.append(dataset)
            print(f" Loaded {len(df)} sentences from: {path}")
        else:
            print(f"File is empty: {path}")
    except Exception as e:
        print(f" Error reading {path}: {e}")

# Combine datasets
if dataset_list:
    c_dataset = concatenate_datasets(dataset_list)
    print(" Combined dataset with total samples:", len(c_dataset))
else:
    print("No datasets were successfully loaded.")

 Loaded 100 sentences from: /content/drive/MyDrive/week4/data/labelled_data.conll
 Loaded 5 sentences from: /content/drive/MyDrive/week4/data/amharic_ner.conll
 Loaded 99 sentences from: /content/drive/MyDrive/week4/data/ner_auto_labels.conll
 Combined dataset with total samples: 204


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [10]:
print(c_dataset)

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 204
})


In [11]:
# test the first sentence
c_dataset[0]

{'tokens': ['ታላቅ',
  'ቅናሽ',
  'Scarlett',
  'England',
  'Hand',
  'Mixer',
  'የእንቁላል',
  'እና',
  'ሊጥ',
  'መምቻ',
  'Telegram',
  'tmeqnashcom'],
 'ner_tags': ['O',
  'B-PRICE',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O']}

In [12]:
# Flatten all tags into a single list
all_tags = list(chain.from_iterable(c_dataset['ner_tags']))
unique_tags = sorted(set(all_tags))  # -> ENTITY_LABELS
label2id = {label: idx for idx, label in enumerate(unique_tags)}
id2label = {idx: label for label, idx in label2id.items()}
print("Label2ID mapping:", label2id)

Label2ID mapping: {'B-LOC': 0, 'B-PRICE': 1, 'B-PRODUCT': 2, 'I-LOC': 3, 'I-PRICE': 4, 'I-PRODUCT': 5, 'O': 6}


In [13]:
# ✅ Encode ner_tags to IDs
def encode_labels(example):
    return {"ner_tags": [label2id[tag] for tag in example["ner_tags"]]}

# Apply label encoding to dataset
c_dataset = c_dataset.map(encode_labels)


Map:   0%|          | 0/204 [00:00<?, ? examples/s]

In [14]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [15]:
# Tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True
        )
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [16]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove -100 labels (used to ignore subword tokens)
    true_labels = [
        [id2label[label] for label in example if label != -100]
        for example in labels
    ]

    true_predictions = [
        [id2label[pred] for pred, label in zip(pred_example, label_example) if label != -100]
        for pred_example, label_example in zip(predictions, labels)
    ]

    return {
        "f1": f1_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }


In [20]:
def fine_tune_model():
    print("🚀 Starting fine-tuning process...")

    # Convert to Hugging Face Dataset if needed
    tokens_list = [item['tokens'] for item in c_dataset]
    ner_tags_list = [item['ner_tags'] for item in c_dataset]

    df_temp = pd.DataFrame({
        'tokens': tokens_list,
        'ner_tags': ner_tags_list
    })
    dataset = Dataset.from_pandas(df_temp)

    # Split dataset
    train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']
    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Eval dataset size: {len(eval_dataset)}")

    # Tokenize and align labels
    tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
    tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

    # Load model
    model = AutoModelForTokenClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    training_args = TrainingArguments(
        output_dir='/content/model_output/xlm-roberta-model',
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_dir=logging_dir,
        logging_steps=100,
        report_to="none",
        fp16=torch.cuda.is_available(),  # ✅ Only enable fp16 if GPU available
    )

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train
    print("📈 Training the model...")
    trainer.train()
    print("✅ Training complete.")

    # Evaluate
    print("📊 Evaluating on validation set...")
    results = trainer.evaluate()
    print(f"✅ Evaluation results:\n{results}")

    # Save final model and tokenizer
    trainer.save_model('/content/model_output/xlm-roberta-model')
    tokenizer.save_pretrained('/content/model_output/xlm-roberta-model')
    print(f"The model saved to: {model_output}")

        # Save the model
    model.save_pretrained(os.path.join(model_output, "final"))
    tokenizer.save_pretrained(os.path.join(model_output, "final"))
    print(f"The final model saved to: {model_output}")

In [21]:
if __name__ == "__main__":
    fine_tune_model()

🚀 Starting fine-tuning process...
Train dataset size: 163
Eval dataset size: 41


Map:   0%|          | 0/163 [00:00<?, ? examples/s]

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


📈 Training the model...


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.817926,0.0,0.0,0.0,0.807897
2,No log,0.714273,0.0,0.0,0.0,0.807897
3,No log,0.624706,0.0,0.0,0.0,0.807897
4,No log,0.555836,0.035971,0.076923,0.023474,0.822981
5,0.836000,0.541711,0.067114,0.117647,0.046948,0.827418


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


✅ Training complete.
📊 Evaluating on validation set...


✅ Evaluation results:
{'eval_loss': 0.5417112708091736, 'eval_f1': 0.06711409395973154, 'eval_precision': 0.11764705882352941, 'eval_recall': 0.046948356807511735, 'eval_accuracy': 0.8274179236912156, 'eval_runtime': 0.2899, 'eval_samples_per_second': 141.447, 'eval_steps_per_second': 20.7, 'epoch': 5.0}
The model saved to: /content/drive/MyDrive/week4/model/xlm-roberta-model
The final model saved to: /content/drive/MyDrive/week4/model/xlm-roberta-model
