# **LoRAfrica: Scaling LLM Fine Tuning for African History**

## **Fine-tuning with LoRA Adaption**

In [None]:
import os
import torch
import numpy as np
import wandb
from datasets import load_dataset, Dataset
from transformers import pipeline
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, PeftModel
from bert_score import score as bert_score
from huggingface_hub import notebook_login

In [None]:
# Load the African History QA Dataset
full_dataset = load_dataset("DannyAI/African-History-QA-Dataset")
full_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 2114
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 200
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 100
    })
})

### **Account Sign in & setup**

In [None]:
# Log in to Hugging Face Hub
notebook_login()

In [None]:
# Log in to Weights & Biases
wandb.login()

True

In [None]:
# Initialize a new W&B run
wandb.init(project="phi4_african_history", name="phi4_african_history_lora")

### **Model Setup & Dataset**

In [None]:
# Load train, validation and test data
train_data = full_dataset["train"]
val_data = full_dataset["validation"]
test_data = full_dataset["test"]

In [None]:
# Define model_id and output_dir
model_id = "microsoft/Phi-4-mini-instruct"
output_dir = "./phi4_african_history_lora"

In [None]:
tokeniser = AutoTokenizer.from_pretrained(model_id)
tokeniser.pad_token = tokeniser.eos_token

# load model
model  = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map = "auto",
    torch_dtype = torch.bfloat16,
    trust_remote_code = False
)

# lora settings
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout = 0.05, # dataset is small, hence a low dropout value
    bias = "none",
    task_type="CAUSAL_LM"
)

# Lora model
model = get_peft_model(model,lora_config)

In [13]:
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 3,837,594,624 || trainable%: 0.0410


### **Masked tokenisation and Collator**

In [None]:
def tokenisation(example)->dict:
    """
    Tokenizes the input example by applying the chat template.
    """
    messages = [
        {
            "role": "system",
            "content":"You are a helpful AI assistant specialised in African history which gives concise answers to questions asked"
        },
        {
            "role":"user",
            "content":example["question"]
        },
        {
            "role":"assistant",
            "content":example["answer"]
        }
    ]
    full_text = tokeniser.apply_chat_template(messages,tokenize=False)
    return tokeniser(full_text, truncation=True, 
                     max_length=2048, 
                     add_special_tokens=False
                    )

In [None]:
# Tokenize datasets
train_dataset = train_data.map(tokenisation)
val_dataset = val_data.map(tokenisation)

In [17]:
train_dataset

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 2114
})

In [None]:
# Custom data collator to mask inputs before assistant response
class AssistantMaskingCollator():
    def __init__(self,tokeniser):
        self.tokeniser = tokeniser
        self.assistant_header = tokeniser.encode("<|assistant|>\n", add_special_tokens=False)

    def __call__(self,features):
        input_ids = [torch.tensor(f["input_ids"]) for f in features]
        input_ids = torch.nn.utils.rnn.pad_sequence(input_ids,batch_first=True,padding_value=self.tokeniser.pad_token_id)

        labels = input_ids.clone()

        for i in range(len(labels)):
            # Find where assistant response starts
            found = False
            for j in range(len(input_ids[i]) - len(self.assistant_header) + 1):
                if input_ids[i][j : j + len(self.assistant_header)].tolist() == self.assistant_header:
                    # Mask everything before and including the header
                    labels[i, : j + len(self.assistant_header)] = -100
                    found = True
                    break
            
            # If for some reason header isn't found, mask everything to be safe
            if not found:
                labels[i, :] = -100
            
            # Mask actual padding tokens
            labels[i][input_ids[i] == self.tokeniser.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": input_ids.ne(self.tokeniser.pad_token_id).long(),
            "labels": labels
        }

In [19]:
data_collator = AssistantMaskingCollator(tokeniser)

### **Model Training**

In [None]:
# Training arguments
# STF can also be used here
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps = 4,
    learning_rate=2e-5,
    num_train_epochs=10,
    bf16=True,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=200,
    logging_steps=10,
    report_to="wandb",
    remove_unused_columns=False
)

In [None]:
# Initialize Trainer with training arguments, datasets, model, and data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

In [None]:
# Start training
trainer.train()

Step,Training Loss,Validation Loss
100,1.6439,1.65012
200,1.5483,1.577856
300,1.581,1.551598
400,1.5789,1.538108
500,1.4988,1.528269
600,1.4013,1.518312
700,1.52,1.513678
800,1.4364,1.506603
900,1.5456,1.504393
1000,1.4398,1.502365


TrainOutput(global_step=2650, training_loss=1.4904400303678693, metrics={'train_runtime': 1365.2341, 'train_samples_per_second': 15.485, 'train_steps_per_second': 1.941, 'total_flos': 3.851040433792205e+16, 'train_loss': 1.4904400303678693, 'epoch': 10.0})

In [None]:
# Save the fine-tuned model
trainer.save_model(output_dir)

In [None]:
# Print a message indicating the model has been saved
print(f"Model saved to {output_dir}")

Model saved to ./phi4_african_history_lora


### **Pushing to Huggingface Hub**

In [None]:
HF_TOKEN="token_here"  # replace with your Hugging Face token

In [None]:
# Load the fine-tuned model for inference
lora_model = PeftModel.from_pretrained(
    model,output_dir
)

# Push to Huggingface Hub
username = "DannyAI" # replace with your Hugging Face username
hf_project_name= "phi4_african_history_lora" # replace with your desired repo name
repo_id = f"{username}/{hf_project_name}"
lora_model.push_to_hub(repo_id,exist_ok=True,token=HF_TOKEN)
tokeniser = AutoTokenizer.from_pretrained(model_id)
tokeniser.push_to_hub(repo_id,exist_ok=True,token=HF_TOKEN)

### **Evaluation**

In [None]:
# Clear memory for inference
del model
torch.cuda.empty_cache()

In [None]:
lora_model.eval()

In [None]:
# The pipeline handles chat templates and decoding automatically
generator = pipeline(
    "text-generation",
    model=lora_model,
    tokenizer=tokeniser,
)

def generate_answer(question)->str:
    """Generates an answer for the given question using the fine-tuned LoRA model.
    """
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant specialised in African history which gives concise answers to questions asked."},
        {"role": "user", "content": question}
    ]
    
    # pipeline() returns a list of dicts; return_full_text=False gives only the assistant's reply
    output = generator(
        messages, 
        max_new_tokens=2048, 
        temperature=0.1, 
        do_sample=False,
        return_full_text=False
    )
    return output[0]['generated_text'].strip()

# Generate predictions on the test set
print("--- Generating Predictions on Test Set ---")
test_predictions = []
# Assuming test_data is a list of dicts with "question" and "answer" keys
test_references = [item["answer"] for item in test_data]

for i, item in enumerate(test_data):
    pred = generate_answer(item["question"])
    test_predictions.append(pred)
    
    if i < 2: # Sample output for verification
        print(f"\nSample Q: {item['question']}")
        print(f"Sample A (Lora Model): {pred}")
        print(f"Sample A (Ref): {item['answer']}\n")

# Metrics Calculation using BERTScore
print("--- Calculating BERTScore ---")
# P = Precision, R = Recall, F1 = F1 Score
P, R, F1 = bert_score(test_predictions, test_references, lang="en", verbose=True)

avg_f1 = F1.mean().item()
print(f"\nFinal Evaluation Results:")
print(f"Average BERTScore F1: {avg_f1:.4f}")

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


--- Generating Predictions on Test Set ---

Sample Q: How did European traders impact the textile industry in the Kingdom of Kongo?
Sample A (Lora Model): European traders introduced new textile materials and techniques, which were adopted and adapted by local weavers.
Sample A (Ref): European traders, particularly the Portuguese, played a significant role in the textile industry in the Kingdom of Kongo, with the Portuguese trading raffia cloth and other textiles with the kingdom and re-exporting them to other regions.


Sample Q: What is the significance of African feminist scholarly activism in contemporary resistance movements?
Sample A (Lora Model): African feminist scholarly activism is significant in contemporary resistance movements as it provides a critical framework for understanding and addressing the specific challenges faced by African women in the context of global capitalism, neocolonialism, and patriarchal structures.
Sample A (Ref): Contemporary African feminist scholar

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


--- Calculating BERTScore ---


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 0.54 seconds, 183.68 sentences/sec

Final Evaluation Results:
Average BERTScore F1: 0.9075


In [33]:
test_data['question'][8]

'What role do international investments play in African e-commerce development?'

In [34]:
test_data['answer'][8]

"International investments, such as Amazon's entry into South Africa and various venture capital investments in platforms like Jumia, are crucial for the development of African e-commerce, bringing technology, expertise, and capital to the market."

In [35]:
test_predictions[8]

'International investments are crucial for African e-commerce development, providing necessary capital, technology, and expertise to scale operations and expand market reach.'

In [48]:
# Finalize W&B
wandb.log({"Final_Test_BERTScore": avg_f1})
wandb.finish()

0,1
Final_Test_BERTScore,▁
eval/loss,█▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▃▅▅▄▂▁▆▆▃▃▄▃▃▆▇█▃▄▃▄▄▃▃▅▅
eval/samples_per_second,█▆▄▄▅▇█▃▃▆▆▅▆▆▃▂▁▆▄▆▅▅▆▆▄▄
eval/steps_per_second,█▆▄▄▄▇█▃▃▆▆▅▆▆▃▂▁▆▄▆▅▅▆▆▄▄
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇██
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇███
train/grad_norm,▁▁▂▃▄▃▄▄▄▄▆▅▇▇▆▅▆▅▆▆▅▅▆█▆▇▆█▇▆▆▆▆▆▇▆▆█▆█
train/learning_rate,██▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁
train/loss,█▆▆▅▄▄▄▄▃▄▄▅▄▃▂▃▃▃▃▃▄▃▂▂▂▂▄▃▂▂▃▂▂▁▂▃▂▃▃▂

0,1
Final_Test_BERTScore,0.90687
eval/loss,1.48851
eval/runtime,2.9442
eval/samples_per_second,67.931
eval/steps_per_second,8.491
total_flos,3.851040433792205e+16
train/epoch,10.0
train/global_step,2650.0
train/grad_norm,8.32305
train/learning_rate,0.0
