# **LoRAfrica: Scaling LLM Fine Tuning for African History**

## **Baseline Benchmark Results**

In [None]:
# Colab users: uncomment the following lines to install the required packages.
# !pip install lm_eval  langdetect -q
# !pip install git+https://github.com/felipemaiapolo/tinyBenchmarks

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from lm_eval import evaluator
from joblib import dump
import wandb
import numpy as np

In [None]:
# Log in to Weights & Biases before a run is created.
wandb.login()

In [None]:
# Start a W&B run in the "phi4_african_history" project; the baseline
# benchmark metrics are logged to this run later in the notebook.
wandb.init(project="phi4_african_history", name="phi4_baseline_bench_mark")

In [None]:
# Load the pretrained baseline model and its tokenizer by model id.
model_id = "microsoft/Phi-4-mini-instruct"
# Seed torch's random number generator before the model is loaded.
torch.random.manual_seed(0)
model = AutoModelForCausalLM.from_pretrained(model_id)
tokeniser = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): `model` and `tokeniser` are not referenced by any other cell
# visible in this notebook -- the evaluation cell passes the model id string to
# lm_eval rather than these objects. Confirm whether this load is still needed.

In [None]:
# Uncomment the line below to list all tasks available in lm_eval
# ! lm_eval --tasks list

In [None]:
## Benchmarking
# Baseline scores are measured on the "tiny" variants of two benchmarks:
# TruthfulQA and MMLU.

# Evaluation settings
config = {
    "model": "microsoft/Phi-4-mini-instruct",
    "tasks": ["tinyTruthfulQA", "tinyMMLU"],
    "batch_size": 2,
}

# Choose the compute device: CUDA if available, otherwise MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Evaluate the model on the configured tasks via lm_eval's "hf" model type.
results = evaluator.simple_evaluate(
    model="hf",
    model_args=f"pretrained={config['model']},parallelize=True,trust_remote_code=False",
    tasks=config["tasks"],
    batch_size=config["batch_size"],
    device=device,
)

# Persist the full results object to disk with joblib.
dump(results, "evaluation_results.joblib")

In [None]:
# Display the per-task metrics (shown as the cell's last-expression output).
results['results']

{'tinyMMLU': {'alias': 'tinyMMLU',
  'acc_norm,none': np.float64(0.6836969197750487),
  'acc_norm_stderr,none': 'N/A'},
 'tinyTruthfulQA': {'alias': 'tinyTruthfulQA',
  'acc,none': np.float64(0.49745439417659365),
  'acc_stderr,none': 'N/A'}}

In [None]:
# Flatten the nested {task: {metric: value}} results into a single
# {"task/metric": float} dictionary and log it to the active W&B run.
summary_metrics = {}

for task, metrics in results['results'].items():
    for metric_name, value in metrics.items():
        # Keep real numeric values only; this skips strings such as the
        # 'alias' entry and 'N/A' stderr placeholders.
        # bool is a subclass of int, so it is excluded explicitly to avoid
        # logging flags as 0.0/1.0. np.integer / np.floating cover every
        # NumPy scalar width (np.float64 alone would drop e.g. np.float32).
        if isinstance(value, bool):
            continue
        if not isinstance(value, (int, float, np.integer, np.floating)):
            continue
        # Create a clean name, e.g., "tinyMMLU/acc_norm"
        clean_name = f"{task}/{metric_name.replace(',none', '')}"
        summary_metrics[clean_name] = float(value)

# Log onto the active W&B run
wandb.log(summary_metrics)

# Finish the run so the data is synced immediately
wandb.finish()
print("Results successfully synced to W&B!")

0,1
tinyMMLU/acc_norm,▁
tinyTruthfulQA/acc,▁

0,1
tinyMMLU/acc_norm,0.6837
tinyTruthfulQA/acc,0.49745


Results successfully synced to W&B!
