## 1. Building a Custom Distribution of Llama Stack

In [134]:
import os
import subprocess
import logging
import requests

import pprint

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient


In [135]:
# Presumably the command used beforehand to build the distribution (kept for reference):
# llama stack build --image-type venv --config llama_stack/templates/lm-eval/lm-eval-build.yaml
#
# Create the in-process (library) client from the `.stack-run.yaml` file,
# resolved relative to the working directory.
# NOTE(review): `client` is rebound to an HTTP client in the next cell, so later
# cells do not use this in-process instance — confirm both are needed.
client = LlamaStackAsLibraryClient(".stack-run.yaml")
# Bind the result to `_` so it is not echoed as the cell's output.
_ = client.initialize()



In [136]:
def create_http_client(base_url: str = "http://localhost:8321"):
    """Create a Llama Stack client that talks to a server over HTTP.

    Args:
        base_url: Root URL of the Llama Stack server. The default is the
            local address this notebook has always used.

    Returns:
        A ``LlamaStackClient`` constructed with ``base_url``.
    """
    # Imported inside the function so the package is only required when an
    # HTTP client is actually requested.
    from llama_stack_client import LlamaStackClient

    return LlamaStackClient(base_url=base_url)

# Rebinds `client`: from here on the notebook uses the HTTP client rather than
# the in-process library client created in the earlier cell.
client = create_http_client()

In [137]:
# Sanity check: lmeval should be listed among the providers of the "eval" API.
providers = client.providers.list()
for provider in (entry for entry in providers if entry.api == "eval"):
    # One provider per line, each followed by a blank line.
    print(f"Provider: {provider.provider_id}", end="\n\n")

Provider: meta-reference-0

Provider: lmeval-1



## 2. Run LM-Eval

In [138]:
# Base URL of the eval API. `localhost` matches the address used for the HTTP
# client earlier in the notebook; 0.0.0.0 is a bind address and is not a
# portable destination for outgoing requests.
url = "http://localhost:8321/v1/eval"

# List the benchmarks currently registered with the stack.
response = requests.get(f"{url}/benchmarks")

if response.status_code == 200:
    pprint.pprint(response.json())
else:
    # Report the failure instead of finishing the cell with no output at all.
    print(f"Listing benchmarks failed: HTTP {response.status_code} {response.text}")

{'data': [{'dataset_id': 'lmeval::mmlu',
           'identifier': 'lmeval::mmlu',
           'metadata': {},
           'provider_id': 'lmeval-1',
           'provider_resource_id': 'string',
           'scoring_functions': ['string'],
           'type': 'benchmark'},
          {'dataset_id': 'lmeval:mmlu',
           'identifier': 'lmeval:mmlu',
           'metadata': {},
           'provider_id': 'lmeval-1',
           'provider_resource_id': 'string',
           'scoring_functions': ['string'],
           'type': 'benchmark'}]}


In [139]:
# Register the MMLU benchmark with the lmeval provider.
# NOTE(review): the "string" values look like schema placeholders — confirm
# whether real scoring-function / provider-benchmark ids are expected.
data = {
    "benchmark_id": "lmeval::mmlu",
    "dataset_id": "lmeval::mmlu",
    "scoring_functions": ["string"],
    "provider_benchmark_id": "string",
    "provider_id": "lmeval-1",
}
response = requests.post(f"{url}/benchmarks", json=data)
if not response.ok:
    # Report a rejected registration rather than discarding the response;
    # printing (not raising) keeps the rest of the notebook runnable.
    print(f"Benchmark registration failed: HTTP {response.status_code} {response.text}")

In [140]:
# List the benchmarks again to confirm the registration is visible.
response = requests.get(f"{url}/benchmarks")
if response.status_code == 200:
    pprint.pprint(response.json())
else:
    # An error response may not carry JSON; report it instead of parsing it.
    print(f"Listing benchmarks failed: HTTP {response.status_code} {response.text}")

{'data': [{'dataset_id': 'lmeval::mmlu',
           'identifier': 'lmeval::mmlu',
           'metadata': {},
           'provider_id': 'lmeval-1',
           'provider_resource_id': 'string',
           'scoring_functions': ['string'],
           'type': 'benchmark'},
          {'dataset_id': 'lmeval:mmlu',
           'identifier': 'lmeval:mmlu',
           'metadata': {},
           'provider_id': 'lmeval-1',
           'provider_resource_id': 'string',
           'scoring_functions': ['string'],
           'type': 'benchmark'}]}


In [141]:
logger = logging.getLogger(__name__)


def setup_cluster():
    """Apply the LM-Eval service-account and RBAC manifests with ``oc``.

    Runs ``oc apply -f <manifest-url>`` once per manifest, in order, passing a
    copy of the current process environment. Each command's output goes to
    the notebook's stdout/stderr. A command that exits with a non-zero status
    is logged as a warning and the remaining commands still run.

    Raises:
        FileNotFoundError: If the ``oc`` executable cannot be found.
    """
    env = os.environ.copy()
    commands = [
        [
            "oc",
            "apply",
            "-f",
            "https://raw.githubusercontent.com/ruivieira/lls-lmeval-reference/main/manifests/lmeval-sa.yaml",
        ],
        [
            "oc",
            "apply",
            "-f",
            "https://raw.githubusercontent.com/ruivieira/lls-lmeval-reference/main/manifests/lmeval-rbac.yaml",
        ],
    ]
    logger.info("Setting up roles...")
    for command in commands:
        completed = subprocess.run(args=command, env=env)
        if completed.returncode != 0:
            # Previously the exit status was ignored, so a failed `oc apply`
            # looked identical to a successful one.
            logger.warning(
                "Command '%s' exited with status %d",
                " ".join(command),
                completed.returncode,
            )

# Apply the manifests now. This shells out to `oc`; presumably it needs an
# already-authenticated cluster session — confirm.
setup_cluster()

serviceaccount/lmeval-sa unchanged
clusterrole.rbac.authorization.k8s.io/lmeval-role unchanged
clusterrolebinding.rbac.authorization.k8s.io/lmeval-rolebinding unchanged
role.rbac.authorization.k8s.io/lmeval-role unchanged
rolebinding.rbac.authorization.k8s.io/lmeval-rolebinding unchanged
role.rbac.authorization.k8s.io/lmeval-role unchanged
rolebinding.rbac.authorization.k8s.io/lmeval-rolebinding unchanged


In [142]:
def enable_online_mode():
    """Allow online access and code execution for LM-Eval, then restart the operator.

    Runs two ``oc`` commands in order, passing a copy of the current process
    environment:

    1. Merge-patch the ``trustyai-service-operator-config`` configmap in the
       ``redhat-ods-applications`` namespace, setting ``lmes-allow-online``
       and ``lmes-allow-code-execution`` to ``"true"``.
    2. ``rollout restart`` the ``trustyai-service-operator-controller-manager``
       deployment in the same namespace (presumably so the operator picks up
       the patched configmap — confirm).

    A command that exits with a non-zero status is logged as a warning and the
    remaining commands still run.

    Raises:
        FileNotFoundError: If the ``oc`` executable cannot be found.
    """
    env = os.environ.copy()
    commands = [
        [
            "oc",
            "patch",
            "configmap",
            "trustyai-service-operator-config",
            "-n",
            "redhat-ods-applications",
            "--type",
            "merge",
            "-p",
            '{"data":{"lmes-allow-online":"true","lmes-allow-code-execution":"true"}}',
        ],
        [
            "oc",
            "rollout",
            "restart",
            "deployment",
            "trustyai-service-operator-controller-manager",
            "-n",
            "redhat-ods-applications",
        ],
    ]
    logger.info("Patching the TrustyAI configmap...")
    for command in commands:
        completed = subprocess.run(args=command, env=env)
        if completed.returncode != 0:
            # Previously the exit status was ignored, so a failed patch or
            # restart looked identical to a successful one.
            logger.warning(
                "Command '%s' exited with status %d",
                " ".join(command),
                completed.returncode,
            )

# Patch the operator configmap and restart the operator now. This shells out
# to `oc`; presumably it needs an already-authenticated cluster session — confirm.
enable_online_mode()

configmap/trustyai-service-operator-config patched (no change)
deployment.apps/trustyai-service-operator-controller-manager restarted


In [143]:
# Request payload for an MMLU evaluation job against TinyLlama.
data = {
    # NOTE(review): this id differs from the "lmeval::mmlu" used in the request
    # path below — confirm which one the server expects.
    "benchmark_id": "lmeval-1::mmlu",
    "benchmark_config": {
        "eval_candidate": {
            "type": "model",
            "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            "provider_id": "lmeval-1",
            "sampling_params": {
                "temperature": 0.7,
                "top_p": 0.9,
                "max_tokens": 256,
            },
            # This key used to be written as two adjacent string literals
            # ("tokenized_" "task_name"), which Python joins into one string.
            # It is spelled out here so the payload sent is unchanged.
            # NOTE(review): this looks like a half-finished edit — confirm
            # whether the intended key is "task_name".
            "tokenized_task_name": "mmlu",
        },
    },
}

response = requests.post(f"{url}/benchmarks/lmeval::mmlu/jobs", json=data)
# Fail with the HTTP error itself rather than a KeyError / JSON decode error
# when the job could not be created.
response.raise_for_status()
job_id = response.json()['job_id']
print(f"Job ID: {job_id}")

Job ID: lmeval-job-3


## 3. Get Job Results

In [144]:
# Fetch the result of the job started above. Uses `job_id` from that cell
# rather than a hard-coded id, which would point at a job from an earlier run.
response = requests.get(f"{url}/benchmarks/lmeval::mmlu/jobs/{job_id}/result")
print(response.json())

{'generations': [], 'scores': {}}


In [None]:
# Clean up: delete the evaluation job identified by `job_id`.
# NOTE(review): the response is not inspected, so a failed delete goes unnoticed.
response = requests.delete(f"{url}/benchmarks/lmeval::mmlu/jobs/{job_id}")