In [None]:
# Colab users: uncomment the following lines to install the required packages.
# !pip install lm_eval  langdetect -q
# !pip install git+https://github.com/felipemaiapolo/tinyBenchmarks

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from lm_eval import evaluator
from joblib import dump
import wandb
import numpy as np

In [None]:
# Log in to Weights & Biases (W&B).
wandb.login()

In [None]:
# Start the W&B run (project "phi4_african_history") that the benchmark
# metrics are logged to further down in this notebook.
wandb.init(project="phi4_african_history", name="phi4_lora_bench_mark")

In [None]:
# Model identifiers consumed by the evaluation cell: the base model is passed
# as `pretrained=` and the LoRA adapter as `peft=` in the model arguments.
base_model_id = "microsoft/Phi-4-mini-instruct"
lora_adapter_id = "DannyAI/phi4_african_history_lora" # Replace with your own adapter path (HF)

In [None]:
# Evaluation settings: which benchmark tasks to run and the batch size to use.
config = dict(
    tasks=["tinyTruthfulQA", "tinyMMLU"],
    batch_size=4,
)

# Choose the compute device: CUDA if present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Evaluate the base model with the LoRA adapter applied, using the "hf" backend.
results = evaluator.simple_evaluate(
    model="hf",
    model_args=f"pretrained={base_model_id},peft={lora_adapter_id},parallelize=True,trust_remote_code=False",
    tasks=config["tasks"],
    batch_size=config["batch_size"],
    device=device,
)

# Persist the full results object to disk so it can be reloaded later.
dump(results, "evaluation_results.joblib")

In [7]:
# Display the per-task metrics stored under the 'results' key.
results['results']

{'tinyMMLU': {'alias': 'tinyMMLU',
  'acc_norm,none': np.float64(0.6775072308852788),
  'acc_norm_stderr,none': 'N/A'},
 'tinyTruthfulQA': {'alias': 'tinyTruthfulQA',
  'acc,none': np.float64(0.43554595738528545),
  'acc_stderr,none': 'N/A'}}

In [None]:
# Flatten the nested per-task results into a single {"task/metric": value}
# mapping that can be logged to W&B in one call.
summary_metrics = {}

for task, metrics in results['results'].items():
    for metric_name, value in metrics.items():
        # Booleans are an `int` subclass; skip them explicitly so a flag is
        # never logged as 0.0 / 1.0.
        if isinstance(value, bool):
            continue
        # Keep real numeric scalars only (Python or any NumPy integer/float
        # width). String entries such as 'alias' and 'N/A' are skipped.
        if isinstance(value, (int, float, np.integer, np.floating)):
            # Strip the ",none" suffix to get a clean name, e.g. "tinyMMLU/acc_norm"
            clean_name = f"{task}/{metric_name.replace(',none', '')}"
            summary_metrics[clean_name] = float(value)

# Log onto the active W&B run
wandb.log(summary_metrics)

# Finish the run so the data is synced immediately
wandb.finish()
print("Results successfully synced to W&B!")

0,1
tinyMMLU/acc_norm,▁
tinyTruthfulQA/acc,▁

0,1
tinyMMLU/acc_norm,0.67751
tinyTruthfulQA/acc,0.43555


Results successfully synced to W&B!
