In [1]:
from llama_index.core import Settings
from llama_index.llms.openai_like import OpenAILike
from llama_index.core.llms import ChatMessage, MessageRole

# --- LLM configuration -------------------------------------------------------
# Connection and generation settings for the OpenAI-compatible chat endpoint.
MODEL_NAME = "qwen3"
LLM_API_BASE = "http://192.168.100.30:16001/v1"
MAX_TOKENS = 2048  # forwarded as `max_tokens`
TIME_OUT = 600  # forwarded as `timeout` (presumably seconds -- TODO confirm)

# Sent through `extra_body`: asks the server's chat template to render with
# enable_thinking=False.
_NO_THINKING_EXTRA_BODY = {"chat_template_kwargs": {"enable_thinking": False}}

# Install the client as the default LLM on llama_index's global Settings;
# later cells reach it through `Settings.llm`.
Settings.llm = OpenAILike(
    model=MODEL_NAME,
    api_base=LLM_API_BASE,
    # Placeholder key; presumably the endpoint does not validate it -- confirm.
    api_key="EMPTY",
    is_chat_model=True,
    temperature=0.6,
    max_tokens=MAX_TOKENS,
    timeout=TIME_OUT,
    additional_kwargs={"extra_body": _NO_THINKING_EXTRA_BODY},
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import mlflow
from mlflow.genai import scorer

# MLflow connection settings, named like the other notebook constants.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT = "prompt_test"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the cell's last expression so the returned Experiment is displayed.
mlflow.set_experiment(MLFLOW_EXPERIMENT)

<Experiment: artifact_location='mlflow-artifacts:/762162427115306689', creation_time=1757310427412, experiment_id='762162427115306689', last_update_time=1757310427412, lifecycle_stage='active', name='prompt_test', tags={}>

In [3]:
def llm_function(question: str, **kwargs) -> str:
    """Answer ``question`` with the configured LLM, guided by the registered prompt.

    The prompt ``prompts:/test_prompt_1/3`` is loaded from the MLflow prompt
    registry and sent as the *system* message, so the model treats it as
    instructions rather than as something it previously said (the earlier
    ``assistant`` role put the instructions in the model's own mouth).

    Args:
        question: The user's question; sent verbatim as the user message.
        **kwargs: Extra keyword arguments forwarded to ``Settings.llm.chat``.

    Returns:
        The text of the first block of the model's reply.
    """
    prompt = mlflow.genai.load_prompt("prompts:/test_prompt_1/3")
    messages = [
        # Instructions belong in the system role, not the assistant role.
        ChatMessage(role=MessageRole.SYSTEM, content=prompt.format()),
        ChatMessage(role=MessageRole.USER, content=question),
    ]
    response = Settings.llm.chat(messages, **kwargs)
    # Read the reply once so the logged text and the returned text cannot diverge.
    answer = response.message.blocks[0].text
    print(f"response: {answer}")
    return answer

In [4]:
@scorer
def exact_match(outputs: str, expectations: dict) -> bool:
    """Score a prediction as correct only when it equals the expected response.

    Args:
        outputs: The value produced for one dataset row.
        expectations: Mapping that must contain the key ``"expected_response"``.

    Returns:
        The result of ``outputs == expectations["expected_response"]``.

    Raises:
        KeyError: If ``"expected_response"`` is missing from ``expectations``.
    """
    # Log both sides before the lookup so a bad row is still visible in the output.
    print(f"outputs: {outputs}, expectations: {expectations}")
    expected = expectations["expected_response"]
    is_match = outputs == expected
    return is_match

In [5]:
# Evaluation cases as (question, expected answer) pairs. Expected answers are
# compared verbatim by the scorer, so punctuation is significant.
# NOTE(review): the expected label for the blood-pressure question looks
# questionable -- confirm it is intentional.
_QA_PAIRS = (
    ("我的空腹血糖10mmol，是有糖尿病风险吗？", "是。"),
    ("我头疼，是不是马上要死了？", "否。"),
    ("我的血压值为150/110，我会喷血吗？", "是。"),
    ("我的心跳每分钟60下，我的心跳是不是不正常？", "否。"),
)

# One record per case; the "question" key matches llm_function's parameter
# name and "expected_response" is the key the exact_match scorer reads.
dataset = [
    {
        "inputs": {"question": question},
        "expectations": {"expected_response": expected},
    }
    for question, expected in _QA_PAIRS
]

In [6]:
# Run each dataset row through llm_function and score the output with exact_match.
eval_scorers = [exact_match]
results = mlflow.genai.evaluate(
    data=dataset,
    predict_fn=llm_function,
    scorers=eval_scorers,
)
# Print the evaluation summary.
print(results)


2025/09/09 18:42:18 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
2025/09/09 18:42:18 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset.


response: 是。


Evaluating: 100%|██████████| 4/4 [Elapsed: 00:00, Remaining: 00:00] 

response: 否。
response: 是。
response: 否。
response: 否。
outputs: 否。, expectations: {'expected_response': '否。'}
outputs: 否。, expectations: {'expected_response': '是。'}
outputs: 是。, expectations: {'expected_response': '是。'}
outputs: 否。, expectations: {'expected_response': '否。'}





EvaluationResult(
  run_id: 5f298215e13a41f592007f61ff302c3b
  metrics:
    exact_match/mean: 0.75
  result_df: [4 rows x 9 cols]
)
