## 1. Building a Custom Distribution of Llama Stack

In [1]:
import requests

import pprint

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient


In [2]:
def create_http_client(base_url: str = "http://localhost:8321"):
    """Build an HTTP client for a Llama Stack server.

    Args:
        base_url: Root URL of the server to connect to. Defaults to the
            local address this notebook was originally written against, so
            calling the function with no arguments behaves as before.

    Returns:
        A ``LlamaStackClient`` constructed with ``base_url``.
    """
    # Imported inside the function, so the SDK is only loaded when this is called.
    from llama_stack_client import LlamaStackClient

    return LlamaStackClient(base_url=base_url)

# Notebook-wide client; the cells below issue all of their API calls through it.
client = create_http_client()

In [3]:
# Sanity check: lmeval should show up among the providers registered for the "eval" API.
providers = client.providers.list()
eval_provider_ids = (entry.provider_id for entry in providers if entry.api == "eval")
for eval_provider_id in eval_provider_ids:
    # The trailing newline reproduces the blank line printed after each provider.
    print(f"Provider: {eval_provider_id}\n")

Provider: lmeval



## 2. Run LM-Eval

Start by listing the currently registered benchmarks (this should be empty, for now).

In [4]:
# NOTE(review): URL is not referenced by any other cell visible in this
# notebook; the requests below all go through `client`. Confirm whether it is
# still needed.
URL = "http://0.0.0.0:8321/v1/eval"

# Ask the server which benchmarks are currently registered.
benchmarks = client.benchmarks.list()

print(benchmarks)

[]


Register a new benchmark. We'll register the ARC-Easy (`arc_easy`) benchmark.

In [5]:
# Register the arc_easy benchmark with the lmeval provider.
# NOTE(review): the "string" values passed for scoring_functions and
# provider_benchmark_id look like placeholders — confirm what the lmeval
# provider expects here.
client.benchmarks.register(
    benchmark_id="lmeval::arc_easy",
    dataset_id="lmeval::arc_easy",
    scoring_functions=["string"],
    provider_benchmark_id="string",
    provider_id="lmeval"
)

Let's verify the benchmark was properly registered.

In [6]:
# Re-fetch the benchmark list; the newly registered entry should now be present.
benchmarks = client.benchmarks.list()

print(benchmarks)

[Benchmark(dataset_id='lmeval::arc_easy', identifier='lmeval::arc_easy', metadata={}, provider_id='lmeval', provider_resource_id='string', scoring_functions=['string'], type='benchmark')]


We will now run the benchmark.

In [7]:
# Generation settings sent along with the candidate model.
sampling_params = {
    "temperature": 0.7,
    "top_p": 0.9,
    "max_tokens": 256,
}

# The candidate under evaluation: a model, addressed with provider_id "lmeval".
eval_candidate = {
    "type": "model",
    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "provider_id": "lmeval",
    "sampling_params": sampling_params,
}

benchmark_config = {
    "eval_candidate": eval_candidate,
    "num_examples": 1000,  # just for testing
}

# Submit the evaluation; the returned job handle is used by the cells below.
job = client.eval.run_eval(
    benchmark_id="lmeval::arc_easy",
    benchmark_config=benchmark_config,
)

print(f"Starting job '{job.job_id}'")

Starting job 'lmeval-job-0'


Let's periodically poll the job's status until it finishes:

In [8]:
import time

# Statuses after which the loop stops polling. "cancelled" is included so that
# a job cancelled elsewhere does not leave this cell polling forever.
TERMINAL_STATUSES = ("completed", "failed", "cancelled")

# Seconds to wait between two consecutive status requests.
POLL_INTERVAL_SECONDS = 20


def get_job_status():
    """Fetch the current state of the evaluation job.

    Reads the notebook-level ``client`` and ``job`` and returns whatever
    ``client.eval.jobs.status`` reports for ``job.job_id`` on the
    ``lmeval::arc_easy`` benchmark.
    """
    return client.eval.jobs.status(job_id=job.job_id, benchmark_id="lmeval::arc_easy")


while True:
    # ``job`` is rebound to the status response on every iteration; the loop
    # and ``get_job_status`` only read ``job_id`` and ``status`` from it.
    job = get_job_status()
    print(job)

    if job.status in TERMINAL_STATUSES:
        print(f"Job ended with status: {job.status}")
        break

    time.sleep(POLL_INTERVAL_SECONDS)

Job(job_id='lmeval-job-0', status='scheduled')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='in_progress')
Job(job_id='lmeval-job-0', status='completed')
Job ended with status: completed


## 3. Get Job Results

Let's get this job's scores

In [9]:
# Retrieve the job's result, then pretty-print only its scores.
job_result = client.eval.jobs.retrieve(
    job_id=job.job_id,
    benchmark_id="lmeval::arc_easy",
)
pprint.pprint(job_result.scores)


{'arc_easy:acc': ScoringResult(aggregated_results={'acc': 0.259}, score_rows=[{'score': 0.259}]),
 'arc_easy:acc_norm': ScoringResult(aggregated_results={'acc_norm': 0.255}, score_rows=[{'score': 0.255}]),
 'arc_easy:acc_norm_stderr': ScoringResult(aggregated_results={'acc_norm_stderr': 0.013790038620872835}, score_rows=[{'score': 0.013790038620872835}]),
 'arc_easy:acc_stderr': ScoringResult(aggregated_results={'acc_stderr': 0.01386041525752791}, score_rows=[{'score': 0.01386041525752791}])}


Optionally, we can now clean up by cancelling the evaluation job:

In [10]:
# Clean up: issue a cancel request for the job on the lmeval::arc_easy benchmark.
client.eval.jobs.cancel(job_id=job.job_id, benchmark_id="lmeval::arc_easy")