## Installing Required Libraries

We are installing essential libraries for our project:

- **datasets**: For loading and preprocessing datasets.
- **evaluate**: For evaluating machine learning models.
- **accelerate**: For speeding up training and inference.
- **transformers[torch]**: For using pre-trained NLP models with PyTorch.

### Installation Command

In [1]:
! pip install --quiet datasets evaluate accelerate transformers[torch]

## Loading and Preprocessing the Amharic News Text Classification Dataset

In [1]:
from datasets import load_dataset

# Read the whole CSV file with the generic 'csv' loader.
all_dataset = load_dataset(
    path='csv',
    data_files='data/Amharic News Dataset.csv',
)
all_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link'],
        num_rows: 51483
    })
})

In [2]:
import re
# Letters that are written interchangeably in Amharic (e.g. ጸሀይ and ፀሐይ),
# grouped by the single canonical letter each group is folded into.
_HOMOPHONE_GROUPS = (
    ('ሃኀኃሐሓኻ', 'ሀ'),
    ('ሑኁዅ', 'ሁ'),
    ('ኂሒኺ', 'ሂ'),
    ('ኌሔዄ', 'ሄ'),
    ('ሕኅ', 'ህ'),
    ('ኆሖኾ', 'ሆ'),
    ('ሠ', 'ሰ'),
    ('ሡ', 'ሱ'),
    ('ሢ', 'ሲ'),
    ('ሣ', 'ሳ'),
    ('ሤ', 'ሴ'),
    ('ሥ', 'ስ'),
    ('ሦ', 'ሶ'),
    ('ዓኣዐ', 'አ'),
    ('ዑ', 'ኡ'),
    ('ዒ', 'ኢ'),
    ('ዔ', 'ኤ'),
    ('ዕ', 'እ'),
    ('ዖ', 'ኦ'),
    ('ጸ', 'ፀ'),
    ('ጹ', 'ፁ'),
    ('ጺ', 'ፂ'),
    ('ጻ', 'ፃ'),
    ('ጼ', 'ፄ'),
    ('ጽ', 'ፅ'),
    ('ጾ', 'ፆ'),
)
_HOMOPHONE_TABLE = str.maketrans(
    {variant: canonical
     for variants, canonical in _HOMOPHONE_GROUPS
     for variant in variants}
)

# A "u"-order letter followed by ዋ or አ collapses into its labialized form,
# e.g. በልቱዋል or በልቱአል -> በልቷል.
# ፁ is used (not ጹ) because the homophone step above has already turned ጹ into ፁ.
_LABIALIZED = {
    'ሉ': 'ሏ', 'ሙ': 'ሟ', 'ቱ': 'ቷ', 'ሩ': 'ሯ', 'ሱ': 'ሷ',
    'ሹ': 'ሿ', 'ቁ': 'ቋ', 'ቡ': 'ቧ', 'ቹ': 'ቿ', 'ሁ': 'ኋ',
    'ኑ': 'ኗ', 'ኙ': 'ኟ', 'ኩ': 'ኳ', 'ዙ': 'ዟ', 'ጉ': 'ጓ',
    'ዱ': 'ዷ', 'ጡ': 'ጧ', 'ጩ': 'ጯ', 'ፁ': 'ጿ', 'ፉ': 'ፏ',
}
_LABIALIZED_RE = re.compile('([' + ''.join(_LABIALIZED) + '])[ዋአ]')

# Applied last, after labialization: ቁ can also be written ቊ, and ኩ as ኵ.
_LATE_VARIANT_TABLE = str.maketrans('ቊኵ', 'ቁኩ')


def normalize_char_level_missmatch(input_token: str) -> str:
    """Fold Amharic spelling variants in *input_token* into one canonical form.

    The text goes through three passes, in this order:

    1. every homophone letter is replaced by its canonical letter;
    2. a "u"-order letter followed by ዋ or አ is replaced by the single
       labialized letter;
    3. ቊ and ኵ are replaced by ቁ and ኩ.

    Args:
        input_token: The text to normalize.

    Returns:
        The normalized text.
    """
    folded = input_token.translate(_HOMOPHONE_TABLE)
    labialized = _LABIALIZED_RE.sub(
        lambda match: _LABIALIZED[match.group(1)], folded)
    return labialized.translate(_LATE_VARIANT_TABLE)

def _has_category(row):
    """Tell whether the row carries a category value."""
    return row['category'] is not None


def _with_normalized_category(row):
    """Return the row's category with its spelling variants normalized."""
    return {'category': normalize_char_level_missmatch(row['category'])}


# Split 80/20 with a fixed seed, then drop rows without a category and
# normalize the category text of the remaining rows.
dataset = (
    all_dataset['train']
    .train_test_split(test_size=0.2, seed=42)
    .filter(_has_category)
    .map(_with_normalized_category)
)
dataset

Map: 100%|██████████| 41185/41185 [00:07<00:00, 5692.72 examples/s]
Map: 100%|██████████| 10297/10297 [00:01<00:00, 6513.54 examples/s]


DatasetDict({
    train: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link'],
        num_rows: 41185
    })
    test: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link'],
        num_rows: 10297
    })
})

In [4]:
# Distinct categories of the training split, in sorted order; these are the
# labels the model will predict.
categories = sorted(set(dataset['train']['category']))
categories

['ሀገር አቀፍ ዜና', 'መዝናኛ', 'ስፖርት', 'ቢዝነስ', 'አለም አቀፍ ዜና', 'ፖለቲካ']

In [5]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Pretrained checkpoint used for both the tokenizer and the classifier.
model_checkpoint = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(item):
    """Tokenize the 'article' field, truncating to at most 512 tokens."""
    articles = item['article']
    return tokenizer(articles, truncation=True, max_length=512)


# Tokenize the dataset in batches
tokenized_dataset = dataset.map(tokenize_function, batched=True)

def add_labels(example):
    """Add a 'labels' list holding each category's position in `categories`."""
    label_ids = []
    for category in example['category']:
        label_ids.append(categories.index(category))
    example['labels'] = label_ids
    return example


# Attach the numerical labels to the tokenized dataset, batch by batch
tokenized_dataset = tokenized_dataset.map(add_labels, batched=True)



# Compare the raw and the tokenized datasets: the tokenizer contributed the
# input_ids and attention_mask fields.
print(dataset)
print(tokenized_dataset)

# The collator pads each batch dynamically and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

DatasetDict({
    train: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link'],
        num_rows: 41185
    })
    test: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link'],
        num_rows: 10297
    })
})
DatasetDict({
    train: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 41185
    })
    test: Dataset({
        features: ['headline', 'category', 'date', 'views', 'article', 'link', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10297
    })
})


In [6]:
# for item in tokenized_dataset['train']:
#     if item['category'] != categories[item['labels']]:
#         print('Mismatch:', item['category'], categories[item['labels']])

In [7]:

# Look at the first training example only: the raw article, its token ids,
# and the text obtained by decoding those ids again.
y = next(iter(tokenized_dataset['train']), None)
if y is not None:
    original_article, tokenized_input_ids = y['article'], y['input_ids']
    decoded_article = tokenizer.decode(
        tokenized_input_ids,
        skip_special_tokens=True,
    )

    for heading, content in (
        ("Original Article:", original_article),
        ("Tokenized Input IDs:", tokenized_input_ids),
        ("Decoded Article", decoded_article),
    ):
        print(heading)
        print(content)

    print("Length of Orginal Article: ", len(original_article))
    print("Length of Decoded Article: ", len(decoded_article))
    print("Length of Input Ids: ", len(tokenized_input_ids))

Original Article:
በአዲሱ የትምህርት ዘመን በመቶ ሺዎች የሚቆጠሩ የዩኒቨርሲቲ ተማሪዎች ወደ ተመደቡባቸው ተቋማት ለመጓጓዝ ችግር ተፈጠረባቸው፡፡ ወደተለያዩ ሥፍራዎች በዓላትን ለማክበር በሚጓዙ ሰዎች ምክንያት የትራንስፖርት እጥረትና መጉላላት ሊፈጠር እንደሚችል ሥጋት መኖሩ ተገለጸ፡፡አብዛኛዎቹ የከፍተኛ ትምህርት ተቋማት በመቶ ሺዎች የሚቆጠሩ ተማሪዎቻቸውን በተመሳሳይ ጊዜ ለምዝገባ መጥራታቸው፣ ለመስቀል በዓል ከሚጓጓዙና የአረፋን በዓል በየክልሎች አክብረው ከሚመለሱ ዜጎች ጋር ተደምሮ ችግሩ ሊባባስ እንደሚችል፣ የፌዴራል ትራንስፖርት ባለሥልጣን ሥጋቱን ገልጿል፡፡ሪፖርተር ያነጋገራቸው የባለሥልጣኑ አገር አቋራጭ የሕዝብ ትራንስፖርት ስምሪት ክትትል ቡድን መሪ አቶ እንደሻው ጎሹ ግን፣ እስከ መስከረም 14 ቀን 2008 ዓ.ም. ከቀትር በኋላ ድረስ ምንም እጥረት እንዳልተፈጠረ፣ ቀደም ብሎ በነበረው ዝግጅት መሠረት አገልግሎቱ ተቀናጅቶ እየተሰጠ እንደነበር ገልጸዋል፡፡‹‹ነገር ግን በቀጣዮቹ ቀናት ሊፈጠር ከሚችለው እጥረት አኳያ ከፍተኛ ሥጋት አለብን፤›› ብለዋል፡፡ ለዚህ ሥጋት መነሻ ነው ያሉት ከመስከረም 16 ቀን 2008 ዓ.ም. ጀምሮ ባሉት ቀናት ከአሥራ ዘጠኝ በላይ ዩኒቨርሲቲዎች አዲስ ተማሪዎቻቸውን በተቀራራቢ ቀናት በየተቋማቱ እንዲገኙ መጥራታቸው ነው፡፡ በዚህም ምክንያት ከፍተኛ ጫና መፈጠሩ እንደማይቀር አስረድተዋል፡፡ ችግሩ የሚባባስ ከሆነ ለተማሪዎቹ ቅድሚያ በመስጠት በመደበኛ የትራንስፖርት ተጠቃሚዎች ላይ የተወሰነ ጫና ሊፈጠር እንደሚችልም አቶ እንዳሸው ገልጸዋል፡፡ የመስቀል በዓልን ተከትሎም ሆነ ከአረፋ በዓል በኋላ በርካታ ሰዎች ከወትሮው በተለየ ሁኔታ የሚንቀሳቀሱ ቢሆንም፣ በመናኸሪያዎች አካባቢ የተቀላጠፈ አገልግሎት ለመስጠት እየተሞከረ እንደሚገኝ ተ

In [8]:
# Load the model

import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments

# Pick the device first so that loading, placement and mixed precision all
# agree: GPU when one is available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# We are using XLM-RoBERTa, configured for the text classification task with
# one output class per category.
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(categories),
    id2label={i: lbl for i, lbl in enumerate(categories)},
    label2id={lbl: i for i, lbl in enumerate(categories)},
)

# Move the model to the selected device (no hard-coded "cuda", so the CPU
# fallback actually works).
fine_tuned_model.to(device)

# Print the device to confirm
print(f"Model loaded on: {device}")

# Setting up the fine-tuning parameters
batch_size = 16
gradient_accumulation_steps = 4
epochs = 5

training_args = TrainingArguments(
    output_dir=model_checkpoint+"-finetuned",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=epochs,
    weight_decay=0.1,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # fp16 mixed precision is only requested when training on a GPU.
    fp16=device.type == "cuda",
    seed=42,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on: cuda




In [9]:
import evaluate
import numpy as np

# Defining our compute metrics

# Metric modules already loaded, keyed by name, so that `evaluate.load` runs
# once per metric instead of once per evaluation pass.
_LOADED_METRICS = {}


def _load_metric(name):
    """Return the `evaluate` metric called *name*, loading it on first use only."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_preds):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of its logits over the last axis.

    Returns:
        A dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": _load_metric("accuracy").compute(
            predictions=predictions, references=labels)["accuracy"],
    }
    # The remaining metrics share the same call shape and use macro averaging.
    for name in ("precision", "recall", "f1"):
        scores[name] = _load_metric(name).compute(
            predictions=predictions, references=labels, average='macro')[name]
    return scores


# Quick sanity check on a two-example toy batch
compute_metrics(([[1, 0], [0,1]], [0,1]))

{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}

In [12]:
import os

from transformers import Trainer
from transformers.trainer_utils import get_last_checkpoint

trainer = Trainer(
    model=fine_tuned_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Resume only when the output directory already holds a checkpoint; on a first
# run there is nothing to resume from, so training starts from scratch.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

  trainer = Trainer(
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)
 80%|████████  | 2572/3215 [36:47<35:43,  3.33s/it]  

{'loss': 0.2085, 'grad_norm': 26.816082000732422, 'learning_rate': 1.0186625194401244e-05, 'epoch': 4.0}


                                                   
 80%|████████  | 2572/3215 [38:10<35:43,  3.33s/it]

{'eval_loss': 0.3304812014102936, 'eval_accuracy': 0.8866660192289016, 'eval_precision': 0.8669120499120396, 'eval_recall': 0.8404847295200484, 'eval_f1': 0.8528491180034422, 'eval_runtime': 83.3282, 'eval_samples_per_second': 123.572, 'eval_steps_per_second': 7.728, 'epoch': 4.0}


100%|██████████| 3215/3215 [1:14:12<00:00,  3.36s/it]

{'loss': 0.157, 'grad_norm': 17.003381729125977, 'learning_rate': 1.8662519440124417e-07, 'epoch': 5.0}


Using the latest cached version of the module from C:\Users\fikre\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Fri Jan 17 14:38:44 2025) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fikre\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--precision\155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 17 14:38:46 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fikre\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--recall\11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 17 14:38:48 2025) since it couldn't be found locally at

{'eval_loss': 0.33792978525161743, 'eval_accuracy': 0.8919102651257648, 'eval_precision': 0.8667374707045448, 'eval_recall': 0.855662905348919, 'eval_f1': 0.8600085298380371, 'eval_runtime': 79.9576, 'eval_samples_per_second': 128.781, 'eval_steps_per_second': 8.054, 'epoch': 5.0}


100%|██████████| 3215/3215 [1:15:39<00:00,  1.41s/it]

{'train_runtime': 4539.3781, 'train_samples_per_second': 45.364, 'train_steps_per_second': 0.708, 'train_loss': 0.07309300969955895, 'epoch': 5.0}





TrainOutput(global_step=3215, training_loss=0.07309300969955895, metrics={'train_runtime': 4539.3781, 'train_samples_per_second': 45.364, 'train_steps_per_second': 0.708, 'total_flos': 5.415114548373984e+16, 'train_loss': 0.07309300969955895, 'epoch': 4.9984466019417475})

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Checkpoint directory saved at training step 3215
fine_tuned_checkpoint = "FacebookAI/xlm-roberta-base-finetuned/checkpoint-3215"

# Tokenizer and classifier are both restored from that same directory.
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_checkpoint)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(
    fine_tuned_checkpoint,
)

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
fine_tuned_model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [2]:
input_paragraph = """
የማንቸስተር ዩናይትዱ አሰልጣኝ ሩበን አሞሪም ቡድናቸው በክለቡ 147 ዓመታት ታሪክ "በጣም ደካማው" ሳይሆን እንደማይቀር ገለጸ።

ቡድኑ በብራይተን 3 ለ 1 የተሸነፈበት ጨዋታ በሜዳው ካደረጋቸው ያለፉት አምስት የፕሪሚር ሊግ መርሐ ግብሮች በአራቱ የተሸነፈበት ሆኗል።

አሞሪም ኤሪክ ቴን ሃግን ተከቶ ቡድኑን ከተረከበ በኋላ ፖርቹጋላዊው ካለፉት 11 ጨዋታዎች 11 ነጥብ ብቻ ሰብስቧል።

ዩናይትድ ከመውረድ ቀጠና በ10 ነጥብ ርቆ 13ኛ ደረጃ ላይ ተቀምጧል።

"ካለፉት 10 የሊጉ ጨዋታዎች ሁለቱን ነው ያሸነፍነው። ይህ ለደጋፊዎቹም ሆነ ለእኔ ምን ማለት እንደሆነ አስቡት" ብሏል።

"አዲሱ አሰልጣኝ ካለፈው አሰልጣኝ በበለጠ እየተሸነፈ ነው። ይህንን አውቃለሁ።"

"በማንቸስተር ዩናይትድ ታሪክ ደካማው ቡድን ሳንሆን አንቀርም። እናንተ [መገናኛ ብዙሃን] ርዕስ እንደምትፈልጉ ባውቅም እውነቱን ተናግሬ መቀየር ይኖርብኛል። ይኸው ርዕስ ሰጠዋችሁ" ሲል ተናግሯል።

በብራይተን ጎል ካስተናገደ በኋላ ዩናይትድ አቻ ለመሆን በቅቶ ነበር።

የብሩኖ ፈርናንዴስ ፍጹም ቅጣት ምት ቡድኑን ያነቃቃዋል ቢባልም ይበልጥ ተበልጦ ታይቷል።

ካኦሩ ሚቶማ ቡድኑን ቀዳሚ ሲያደርግ ግብ ጠባቂው አንድሬ ኦናና የፈጸመውን ስህተት ተጠቅሞ ጂዮርጂኖ ረተር ለቡድኑ ሦስተኛዋን ጎል ከመረብ አሳርፏል።
"""
# Encode the paragraph as PyTorch tensors, truncated to at most 512 tokens
inputs = tokenizer(
    input_paragraph,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Move every input tensor to the device the model was moved to
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Inference only: evaluation mode, no gradient tracking
fine_tuned_model.eval()
with torch.no_grad():
    outputs = fine_tuned_model(**inputs)

# The predicted class is the index of the largest logit
logits = outputs.logits
predictions = logits.argmax(dim=-1)

# Translate the class index back to its label via the model config
id2label = fine_tuned_model.config.id2label
predicted_label = id2label[predictions.item()]

# Show the input next to the predicted label
print(f"Input Paragraph: {input_paragraph}")
print(f"Predicted Label: {predicted_label}")

Input Paragraph: 
የማንቸስተር ዩናይትዱ አሰልጣኝ ሩበን አሞሪም ቡድናቸው በክለቡ 147 ዓመታት ታሪክ "በጣም ደካማው" ሳይሆን እንደማይቀር ገለጸ።

ቡድኑ በብራይተን 3 ለ 1 የተሸነፈበት ጨዋታ በሜዳው ካደረጋቸው ያለፉት አምስት የፕሪሚር ሊግ መርሐ ግብሮች በአራቱ የተሸነፈበት ሆኗል።

አሞሪም ኤሪክ ቴን ሃግን ተከቶ ቡድኑን ከተረከበ በኋላ ፖርቹጋላዊው ካለፉት 11 ጨዋታዎች 11 ነጥብ ብቻ ሰብስቧል።

ዩናይትድ ከመውረድ ቀጠና በ10 ነጥብ ርቆ 13ኛ ደረጃ ላይ ተቀምጧል።

"ካለፉት 10 የሊጉ ጨዋታዎች ሁለቱን ነው ያሸነፍነው። ይህ ለደጋፊዎቹም ሆነ ለእኔ ምን ማለት እንደሆነ አስቡት" ብሏል።

"አዲሱ አሰልጣኝ ካለፈው አሰልጣኝ በበለጠ እየተሸነፈ ነው። ይህንን አውቃለሁ።"

"በማንቸስተር ዩናይትድ ታሪክ ደካማው ቡድን ሳንሆን አንቀርም። እናንተ [መገናኛ ብዙሃን] ርዕስ እንደምትፈልጉ ባውቅም እውነቱን ተናግሬ መቀየር ይኖርብኛል። ይኸው ርዕስ ሰጠዋችሁ" ሲል ተናግሯል።

በብራይተን ጎል ካስተናገደ በኋላ ዩናይትድ አቻ ለመሆን በቅቶ ነበር።

የብሩኖ ፈርናንዴስ ፍጹም ቅጣት ምት ቡድኑን ያነቃቃዋል ቢባልም ይበልጥ ተበልጦ ታይቷል።

ካኦሩ ሚቶማ ቡድኑን ቀዳሚ ሲያደርግ ግብ ጠባቂው አንድሬ ኦናና የፈጸመውን ስህተት ተጠቅሞ ጂዮርጂኖ ረተር ለቡድኑ ሦስተኛዋን ጎል ከመረብ አሳርፏል።

Predicted Label: ስፖርት
