## 1. Building a Custom Distribution of Llama Stack

In [1]:
import pprint
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

In [2]:
def create_http_client(base_url="http://localhost:8321"):
    """Build an HTTP client for a Llama Stack server.

    Args:
        base_url: Address of the Llama Stack server. Defaults to the local
            address used throughout this notebook, so existing zero-argument
            calls behave exactly as before.

    Returns:
        A ``LlamaStackClient`` configured with ``base_url``.
    """
    # Imported at call time, so the package is only required once a client is requested.
    from llama_stack_client import LlamaStackClient

    return LlamaStackClient(base_url=base_url)


client = create_http_client()

In [3]:
# Sanity check: lmeval should show up among the providers of the "eval" API.
providers = client.providers.list()
eval_provider_ids = [entry.provider_id for entry in providers if entry.api == "eval"]

for provider_id in eval_provider_ids:
    print(f"Provider: {provider_id}")
    print()  # blank separator line after each provider

Provider: lmeval



## 2. Run LM-Eval

Start by listing the currently registered benchmarks (this should be empty, for now).

In [4]:
# Ask the server for every registered benchmark and show the raw response.
benchmarks = client.benchmarks.list()
print(benchmarks)

[]


Register a new benchmark. We'll register the ARC-Easy (`arc_easy`) benchmark.

In [5]:
# Collect the registration fields first, then pass them to the register call.
arc_easy_registration = {
    "provider_id": "lmeval",
    "benchmark_id": "lmeval::arc_easy",
    "dataset_id": "lmeval::arc_easy",
    # NOTE(review): "string" looks like a placeholder value in the two fields below — confirm.
    "provider_benchmark_id": "string",
    "scoring_functions": ["string"],
}
client.benchmarks.register(**arc_easy_registration)

Let's verify the benchmark was properly registered.

In [6]:
# Re-read the benchmark list; the entry registered above should now be present.
benchmarks = client.benchmarks.list()
print(benchmarks)

[Benchmark(dataset_id='lmeval::arc_easy', identifier='lmeval::arc_easy', metadata={}, provider_id='lmeval', provider_resource_id='string', scoring_functions=['string'], type='benchmark')]


We will store the model name in `MODEL`:

In [7]:
# Model identifier used as the evaluation candidate and, for dk-bench, as JUDGE_MODEL_NAME.
MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

We will now run the benchmark.

In [8]:
# Generation settings attached to the candidate model.
sampling_params = {"temperature": 0.7, "top_p": 0.9, "max_tokens": 256}

# The candidate to evaluate: MODEL, with provider_id "lmeval".
eval_candidate = {
    "type": "model",
    "model": MODEL,
    "provider_id": "lmeval",
    "sampling_params": sampling_params,
}

job = client.eval.run_eval(
    benchmark_id="lmeval::arc_easy",
    benchmark_config={
        "eval_candidate": eval_candidate,
        "num_examples": 1000,  # Just for testing
    },
)

print(f"Starting job '{job.job_id}'")

Starting job 'lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b'


Let's periodically poll the job status until the job finishes:

In [9]:
import time

def get_job_status(job_id, benchmark_id):
    """Look up the current state of an evaluation job.

    Thin wrapper around ``client.eval.jobs.status`` that uses the
    notebook-level ``client``.

    Args:
        job_id: Identifier of the job to query.
        benchmark_id: Identifier of the benchmark the job was started for.

    Returns:
        Whatever ``client.eval.jobs.status`` returns for that job.
    """
    current = client.eval.jobs.status(job_id=job_id, benchmark_id=benchmark_id)
    return current

# Poll every 20 seconds until the job reaches a terminal state.
# 'cancelled' is terminal too: without it, a job that ends up cancelled would
# never satisfy the exit condition and this cell would loop forever.
terminal_statuses = ('failed', 'completed', 'cancelled')

while True:
    job = get_job_status(job_id=job.job_id, benchmark_id="lmeval::arc_easy")
    print(job)

    if job.status in terminal_statuses:
        print(f"Job ended with status: {job.status}")
        break

    time.sleep(20)

Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='scheduled')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='in_progress')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='in_progress')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='in_progress')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='in_progress')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='in_progress')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='in_progress')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='in_progress')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='in_progress')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='in_progress')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='in_progress')
Job(job_id='lmeval-job-1a77bd75-2ef3-4ef6-9778-6c7af4a9b08b', status='completed')
Job ended with status: completed

## 3. Get Job Results

Let's retrieve this job's scores:

In [10]:
# Fetch the job's result, then pretty-print only its scores.
job_result = client.eval.jobs.retrieve(job_id=job.job_id, benchmark_id="lmeval::arc_easy")
pprint.pprint(job_result.scores)


{'arc_easy:acc': ScoringResult(aggregated_results={'acc': 0.259}, score_rows=[{'score': 0.259}]),
 'arc_easy:acc_norm': ScoringResult(aggregated_results={'acc_norm': 0.256}, score_rows=[{'score': 0.256}]),
 'arc_easy:acc_norm_stderr': ScoringResult(aggregated_results={'acc_norm_stderr': 0.013807775152234194}, score_rows=[{'score': 0.013807775152234194}]),
 'arc_easy:acc_stderr': ScoringResult(aggregated_results={'acc_stderr': 0.01386041525752791}, score_rows=[{'score': 0.01386041525752791}])}


Optionally, we can now delete the evaluation job (this is done through the `cancel` call):

In [11]:
# Clean up: issue a cancel request for the arc_easy job.
client.eval.jobs.cancel(
    benchmark_id="lmeval::arc_easy",
    job_id=job.job_id,
)

## 4. dk-bench

Create the `dk-bench` benchmark.
For simplicity, this will use the same model as the judge.
Any other model could be used instead by setting `JUDGE_MODEL_URL` and `JUDGE_MODEL_NAME`, either by pointing at the model service directly or by using a Llama Stack OpenAI inference endpoint.

The `dk-bench` task is defined at the `https://github.com/trustyai-explainability/lm-eval-tasks.git` repository, under the `tasks` directory.

We also assume that a PVC (in this case named `my-pvc`) exists in the same namespace and that it contains the necessary datasets under the directory `upload-files`.

In [12]:
# Location of the custom task definition: repository, ref and sub-directory.
custom_task_git = {
    "url": "https://github.com/trustyai-explainability/lm-eval-tasks.git",
    "branch": "main",
    "commit": "2ff52c6560b14fbf3ec141b1357b076e80b8f25a",
    "path": "tasks/",
}

# Environment entries carried in the benchmark metadata: dataset path plus judge-model settings.
dk_bench_env = {
    "DK_BENCH_DATASET_PATH": "/opt/app-root/src/hf_home/upload-files/example-dk-bench-input-bmo.jsonl",
    "JUDGE_MODEL_URL": "http://vllm-server:8000/v1/chat/completions",
    "JUDGE_MODEL_NAME": MODEL,
    "JUDGE_API_KEY": "",
}

# Key order matches the original literal: custom_task, env, input.
dk_bench_metadata = {
    "custom_task": {"git": custom_task_git},
    "env": dk_bench_env,
    "input": {"storage": {"pvc": "my-pvc"}},
}

client.benchmarks.register(
    benchmark_id="lmeval::dk-bench",
    dataset_id="lmeval::dk-bench",
    scoring_functions=["string"],
    provider_benchmark_id="string",
    provider_id="lmeval",
    metadata=dk_bench_metadata,
)

List the available benchmarks:

In [13]:
# Show everything that is registered now, including the dk-bench entry.
benchmarks = client.benchmarks.list()
print(benchmarks)

[Benchmark(dataset_id='lmeval::arc_easy', identifier='lmeval::arc_easy', metadata={}, provider_id='lmeval', provider_resource_id='string', scoring_functions=['string'], type='benchmark'), Benchmark(dataset_id='lmeval::dk-bench', identifier='lmeval::dk-bench', metadata={'custom_task': {'git': {'url': 'https://github.com/trustyai-explainability/lm-eval-tasks.git', 'branch': 'main', 'commit': '2ff52c6560b14fbf3ec141b1357b076e80b8f25a', 'path': 'tasks/'}}, 'env': {'DK_BENCH_DATASET_PATH': '/opt/app-root/src/hf_home/upload-files/example-dk-bench-input-bmo.jsonl', 'JUDGE_MODEL_URL': 'http://vllm-server:8000/v1/chat/completions', 'JUDGE_MODEL_NAME': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', 'JUDGE_API_KEY': ''}, 'input': {'storage': {'pvc': 'my-pvc'}}}, provider_id='lmeval', provider_resource_id='string', scoring_functions=['string'], type='benchmark')]


In [14]:
# Evaluation settings for dk-bench: the candidate model and how many examples to run.
dk_bench_config = {
    "eval_candidate": {
        "type": "model",
        "model": MODEL,
        "provider_id": "lmeval",
        "sampling_params": {
            "temperature": 0.7,
            "top_p": 0.9,
            "max_tokens": 256,
        },
    },
    "num_examples": 1000,  # Just for testing
}

job = client.eval.run_eval(benchmark_id="lmeval::dk-bench", benchmark_config=dk_bench_config)

print(f"Starting job '{job.job_id}'")

Starting job 'lmeval-job-519f7493-874a-4da3-b38c-e19bd58c0025'


In [15]:
# Poll every 20 seconds until the dk-bench job reaches a terminal state.
# 'cancelled' is terminal too: without it, a job that ends up cancelled would
# never satisfy the exit condition and this cell would loop forever.
terminal_statuses = ('failed', 'completed', 'cancelled')

while True:
    job = get_job_status(job_id=job.job_id, benchmark_id="lmeval::dk-bench")
    print(job)

    if job.status in terminal_statuses:
        print(f"Job ended with status: {job.status}")
        break

    time.sleep(20)

Job(job_id='lmeval-job-519f7493-874a-4da3-b38c-e19bd58c0025', status='scheduled')
Job(job_id='lmeval-job-519f7493-874a-4da3-b38c-e19bd58c0025', status='in_progress')
Job(job_id='lmeval-job-519f7493-874a-4da3-b38c-e19bd58c0025', status='in_progress')
Job(job_id='lmeval-job-519f7493-874a-4da3-b38c-e19bd58c0025', status='in_progress')
Job(job_id='lmeval-job-519f7493-874a-4da3-b38c-e19bd58c0025', status='failed')
Job ended with status: failed
