# Fine-tuning

In [2]:
!pip install transformers datasets torch




# Load the dataset.

In [3]:
import pandas as pd

# Path to the dataset in the current directory
file_path = '/content/cleaned_labeled_ner_data.conll'
# Function to load the data
def load_conll_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read().strip().split('\n\n')

    sentences = []
    labels = []
    for sentence in data:
        lines = sentence.split('\n')
        sentence_tokens = []
        sentence_labels = []
        for line in lines:
            # Ensure each line has a token and a label separated by a tab
            parts = line.split('\t')
            if len(parts) == 2:
                token, label = parts
                sentence_tokens.append(token)
                sentence_labels.append(label)
            else:
                print(f"Skipping line: {line}")  # Print problematic lines for debugging
        sentences.append(sentence_tokens)
        labels.append(sentence_labels)

    return sentences, labels

sentences, labels = load_conll_data(file_path)




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Skipping line: ውጪ O
Skipping line: በሌሎች O
Skipping line: ዲዛይንም O
Skipping line: አለው O
Skipping line: የቱርክ O
Skipping line: ስሪት O
Skipping line: 2000ብር O
Skipping line: 0905707448 O
Skipping line: 0909003864 O
Skipping line: ማስታወሻ O
Skipping line: ጃኬቱ O
Skipping line: እራሱ O
Skipping line: ሆኖ O
Skipping line: በህፃናት O
Skipping line: ሳይዝ B-Product
Skipping line: ነው O
Skipping line: ቪዲዮውን O
Skipping line: የተጠቀምነው O
Skipping line: ጃኬቱ O
Skipping line: በሁለቱም O
Skipping line: በኩል B-LOC
Skipping line: እንዴት O
Skipping line: እንደሚለበስ O
Skipping line: ለማሳየት O
Skipping line: ነው O
Skipping line: 0905707448 O
Skipping line: 0909003864 O
Skipping line: ቪዲዮውን O
Skipping line: ከስር O
Skipping line: ይመልከቱ O
Skipping line: 0905707448 O
Skipping line: 0909003864 O
Skipping line: ከ32-37ቁጥር O
Skipping line: 3000ብር O
Skipping line: 0905707448 O
Skipping line: 0909003864 O
Skipping line: ከ O
Skipping line: 33-38 O
Skipping line: ቁጥር B-LOC
Skipping 

# Tokenization
## We will use a pre-trained model’s tokenizer to convert text into input tokens (IDs) that the model can process.

In [4]:
from transformers import AutoTokenizer

# Choose a pre-trained model (XLM-Roberta or AfroXLMR)
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [6]:
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = tokenizer(sentences, padding=True, truncation=True, is_split_into_words=True, return_tensors="pt")

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to words
        aligned_label = []
        for word_id in word_ids:
            if word_id is None:
                aligned_label.append(-100)  # Ignore tokens without a corresponding word
            else:
                aligned_label.append(label[word_id])
        aligned_labels.append(aligned_label)

    tokenized_inputs["labels"] = torch.tensor(aligned_labels)
    return tokenized_inputs

# Tokenize and align
tokenized_data = tokenize_and_align_labels(sentences, labels)

# Set Up Training Arguments



In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluation strategy to adopt during training
    save_strategy="epoch",           # save checkpoint every epoch
    load_best_model_at_end=True,     # load the best model when finished training
    metric_for_best_model="accuracy",  # metric to evaluate the best model
)




In [8]:
print("First 5 Input IDs:", tokenized_data['input_ids'][:5])
print("First 5 Labels:", tokenized_data['labels'][:5])


First 5 Input IDs: tensor([[0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2]])
First 5 Labels: tensor([[-100, -100],
        [-100, -100],
        [-100, -100],
        [-100, -100],
        [-100, -100]])


Fine-Tune the Model

In [9]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
from datasets import Dataset

# Load a pre-trained model
num_labels = len(set([label for sublist in labels for label in sublist])) # Make sure this gives the correct number
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# Create Dataset object
dataset = Dataset.from_dict(tokenized_data) # Create Dataset

# Define the compute_metrics function to evaluate the model
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)
    true_labels = labels
    accuracy = accuracy_score(true_labels, predictions)
    return {"accuracy": accuracy}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,  # You can use a separate validation dataset here
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

# Train & Evaluate

In [11]:
# Evaluate the model
trainer.evaluate()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.38,0.435235,0.808
2,0.3465,0.414658,0.822


Evaluation Results: {'eval_loss': 0.4146576523780823, 'eval_accuracy': 0.822, 'eval_runtime': 190.2968, 'eval_samples_per_second': 5.255, 'eval_steps_per_second': 0.657, 'epoch': 2.0}
