
Misc. bug: Qwen3-VL on llama-server fails on second request (KV Cache Bug) & /slots/reset returns 501 #17200


Description

@Gaoeee

Name and Version

ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
Device 0: Orin, compute capability 8.7, VMM: yes
version: 0 (unknown)
built with cc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0 for aarch64-linux-gnu

Operating systems

No response

Which llama.cpp modules do you know to be affected?

No response

Command line

./build/bin/llama-server \
  -m models/qwen3-vl/Qwen3VL-8B-Instruct-Q4_K_M.gguf \
  --mmproj models/qwen3-vl/mmproj-Qwen3VL-8B-Instruct-F16.gguf \
  --ctx-size 4096 \
  --batch-size 1024 \
  --gpu-layers 99 \
  --flash-attn on \
  -t 8 \
  --image-max-tokens 1024 \
  --parallel 1 \
  --host 0.0.0.0 --port 8080

Problem description & steps to reproduce

When running the Qwen3-VL-8B-Instruct model with llama-server, sending two consecutive multimodal (VLM) requests causes the second request to fail: its output is either gibberish (????...) or only stops after hitting the maximum token length. This suggests that the KV cache (key-value cache) is not being correctly released or managed after the first VLM evaluation, leaving the model state corrupted.

Compounding the issue, the API endpoint intended for manually clearing the state, /slots/reset, returns HTTP 501: Not Implemented, preventing an API-based workaround.

However, the WebUI works fine.
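For reference, here is a minimal sketch of an alternative API-based reset that targets the per-slot erase action instead of /slots/reset, assuming the running build exposes POST /slots/{id_slot}?action=erase (this endpoint is documented for llama-server, but whether this particular build supports it is an assumption):

import json
import sys
from urllib import request, error

def erase_slot(base_url: str, slot_id: int) -> None:
    # Hedged sketch: POST /slots/{id_slot}?action=erase should drop that slot's KV cache,
    # assuming the endpoint exists on this build (its availability is an assumption).
    url = "{}/slots/{}?action=erase".format(base_url.rstrip("/"), slot_id)
    req = request.Request(url, data=json.dumps({}).encode("utf-8"),
                          headers={"Content-Type": "application/json"}, method="POST")
    try:
        with request.urlopen(req, timeout=5) as r:
            print("[ERASE] slot {} -> HTTP {}".format(slot_id, r.getcode()))
    except error.HTTPError as e:
        print("[ERASE ERROR] HTTP {}: {}".format(e.code, e.reason), file=sys.stderr)

# Example (slot id 3 as seen in the server log below):
# erase_slot("http://127.0.0.1:8080", 3)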

#!/usr/bin/env python3
"""
llama_client.py — Minimal client for llama.cpp server (Python 3.6+ compatible)
- Endpoints: /completion  and  /v1/chat/completions
- Image support (data URL base64)
- Supports: --stop (repeatable), --grammar (completion only)
- Pure urllib (no extra deps)
"""

import argparse
import base64
import json
import mimetypes
import pathlib
import sys
from typing import Optional, List
from urllib import request, error

def data_url_from_image(path: str) -> str:
    p = pathlib.Path(path)
    b = p.read_bytes()
    mime, _ = mimetypes.guess_type(p.name)
    if not mime:
        mime = "image/jpeg" if p.suffix.lower() in (".jpg", ".jpeg") else "image/png"
    return "data:{mime};base64,{b64}".format(
        mime=mime, b64=base64.b64encode(b).decode("ascii")
    )

def http_post_json(url: str, payload: dict, timeout: int = 120) -> str:
    data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    req = request.Request(url, data=data, headers={"Content-Type": "application/json"})
    try:
        with request.urlopen(req, timeout=timeout) as r:
            return r.read().decode("utf-8", errors="replace")
    except error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace")
        print("[HTTP {code}] {reason}\n{body}".format(code=e.code, reason=e.reason, body=body), file=sys.stderr)
        raise
    except Exception as e:
        print("[ERROR] {e}".format(e=e), file=sys.stderr)
        raise

def reset_server_slots(base_url: str) -> None:
    """Sends a POST request to /slots/reset to clear the KV cache and server state."""
    url = base_url.rstrip("/") + "/slots/reset"
    print(f"\n--- [RESET] 尝试重置服务器状态: {url} ---")
    
    # POST请求需要有数据体,即使是空JSON
    data = json.dumps({}).encode("utf-8")
    req = request.Request(url, data=data, headers={"Content-Type": "application/json"}, method='POST')
    
    try:
        with request.urlopen(req, timeout=5) as r:

            print(f"--- [RESET] 服务器重置成功。---")
    except error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace")
        print(f"--- [RESET ERROR] HTTP {e.code}: {e.reason} ---", file=sys.stderr)
    except Exception as e:
        print(f"--- [RESET ERROR] 重置失败,可能端点名称不正确: {e} ---", file=sys.stderr)


def run_completion(base_url: str, prompt: str, image: Optional[str],
                   n_predict: int, temperature: float,
                   stops: List[str], grammar: Optional[str], timeout: int) -> None:
    url = base_url.rstrip("/") + "/completion"
    payload = {
        "prompt": prompt,
        "temperature": temperature,
        "n_predict": n_predict,
    }
    if image:
        payload["images"] = [data_url_from_image(image)]
    if stops:
        payload["stop"] = stops
    if grammar:
        payload["grammar"] = grammar
    print(http_post_json(url, payload, timeout))

def run_chat(base_url: str, prompt: str, model: str, image: Optional[str],
             max_tokens: int, temperature: float,
             stops: List[str], fmt: str, timeout: int) -> None:
    """
    fmt: "image_url" 
    """
    url = base_url.rstrip("/") + "/v1/chat/completions"
    content = [{"type": "text", "text": prompt}]
    if image:
        img = data_url_from_image(image)
        if fmt == "image_url":
            content.append({"type": "image_url", "image_url": {"url": img}})
        else:
            content.append({"type": "input_image", "image_url": {"url": img}})

    payload = {
        "model": model, 
        "messages": [
            {"role": "system", "content": "You are a helpful multimodal assistant."},
            {"role": "user", "content": content},
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
        "cache_prompt": False,
        "slot_id": -1,         
    }
    if stops:
        payload["stop"] = stops
    print(http_post_json(url, payload, timeout))

def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("--url", required=True, help="e.g. http://127.0.0.1:8080")
    ap.add_argument("--endpoint", choices=["completion", "chat"], default="completion")
    ap.add_argument("-p", "--prompt", required=True)
    ap.add_argument("-i", "--image", help="optional image path")
    ap.add_argument("--temperature", type=float, default=0.0)
    ap.add_argument("--n-predict", type=int, default=64, help="for /completion")
    ap.add_argument("--max-tokens", type=int, default=128, help="for /v1/chat/completions")
    ap.add_argument("--model", default="qwen3-vl-8b-instruct",
                    help="chat endpoint 'model' name (server --alias)")
    ap.add_argument("--stop", action="append", default=[],
                    help="stop word(s); can repeat: --stop 。 --stop \\n")
    ap.add_argument("--grammar", help="llama.cpp grammar (completion only)")
    ap.add_argument("--fmt", choices=["image_url", "input_image"], default="image_url",
                    help="chat endpoint image field type")
    ap.add_argument("--timeout", type=int, default=120)
    
    ap.add_argument("--reset-before-run", action="store_true", 
                    help="Forces a /slots/reset call before starting the task.")
    
    args = ap.parse_args()

    if args.reset_before_run:
        reset_server_slots(args.url)

    if args.endpoint == "completion":
        run_completion(args.url, args.prompt, args.image,
                       args.n_predict, args.temperature,
                       args.stop, args.grammar, args.timeout)
    else:
        run_chat(args.url, args.prompt, args.model, args.image,
                 args.max_tokens, args.temperature,
                 args.stop, args.fmt, args.timeout)


if __name__ == "__main__":
    main()

First Bad Commit

No response

Relevant log output

server log:
main: server is listening on http://0.0.0.0:8080 - starting the main loop
srv update_slots: all slots are idle
srv update_slots: all slots are idle
srv log_server_r: request: GET /slots 127.0.0.1 200
srv params_from_: Chat format: Content-only
slot get_availabl: id 3 | task -1 | selected slot by LRU, t_last = -1
slot launch_slot_: id 3 | task -1 | sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist
slot launch_slot_: id 3 | task 1 | processing task
slot update_slots: id 3 | task 1 | new prompt, n_ctx_slot = 4096, n_keep = 0, task.n_tokens = 1043
slot update_slots: id 3 | task 1 | n_tokens = 0, memory_seq_rm [0, end)
slot update_slots: id 3 | task 1 | prompt processing progress, n_tokens = 29, batch.n_tokens = 29, progress = 0.027804
slot update_slots: id 3 | task 1 | n_tokens = 29, memory_seq_rm [29, end)
srv process_chun: processing image...
encoding image slice...
image slice encoded in 7387 ms
decoding image batch 1/1, n_tokens_batch = 1008
image decoded (batch 1/1) in 1776 ms
srv process_chun: image processed in 9164 ms
slot update_slots: id 3 | task 1 | prompt processing progress, n_tokens = 1043, batch.n_tokens = 6, progress = 1.000000
slot update_slots: id 3 | task 1 | prompt done, n_tokens = 1043, batch.n_tokens = 6
slot print_timing: id 3 | task 1 | prompt eval time = 10907.48 ms / 1043 tokens ( 10.46 ms per token, 95.62 tokens per second)
                                          eval time = 133.06 ms / 2 tokens ( 66.53 ms per token, 15.03 tokens per second)
                                         total time = 11040.53 ms / 1045 tokens
slot release: id 3 | task 1 | stop processing: n_tokens = 1044, truncated = 0
srv update_slots: all slots are idle
srv log_server_r: request: POST /v1/chat/completions 127.0.0.1 200
srv params_from_: Chat format: Content-only
slot get_availabl: id 3 | task -1 | selected slot by LCP similarity, sim_best = 1.000 (> 0.100 thold), f_keep = 0.999
slot launch_slot_: id 3 | task -1 | sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist
slot launch_slot_: id 3 | task 5 | processing task
slot update_slots: id 3 | task 5 | new prompt, n_ctx_slot = 4096, n_keep = 0, task.n_tokens = 1043
slot update_slots: id 3 | task 5 | n_tokens = 0, memory_seq_rm [0, end)
slot update_slots: id 3 | task 5 | prompt processing progress, n_tokens = 29, batch.n_tokens = 29, progress = 0.027804
slot update_slots: id 3 | task 5 | n_tokens = 29, memory_seq_rm [29, end)
srv process_chun: processing image...
encoding image slice...
image slice encoded in 5033 ms
decoding image batch 1/1, n_tokens_batch = 1008
image decoded (batch 1/1) in 1695 ms
srv process_chun: image processed in 6728 ms
slot update_slots: id 3 | task 5 | prompt processing progress, n_tokens = 1043, batch.n_tokens = 6, progress = 1.000000
slot update_slots: id 3 | task 5 | prompt done, n_tokens = 1043, batch.n_tokens = 6
slot print_timing: id 3 | task 5 | prompt eval time = 8521.15 ms / 1043 tokens ( 8.17 ms per token, 122.40 tokens per second)
                                          eval time = 107.19 ms / 2 tokens ( 53.59 ms per token, 18.66 tokens per second)
                                         total time = 8628.34 ms / 1045 tokens
slot release: id 3 | task 5 | stop processing: n_tokens = 1044, truncated = 0
srv update_slots: all slots are idle
srv log_server_r: request: POST /v1/chat/completions 127.0.0.1 200
srv params_from_: Chat format: Content-only
slot get_availabl: id 2 | task -1 | selected slot by LRU, t_last = -1
slot launch_slot_: id 2 | task -1 | sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist
slot launch_slot_: id 2 | task 9 | processing task
slot update_slots: id 2 | task 9 | new prompt, n_ctx_slot = 4096, n_keep = 0, task.n_tokens = 1043
slot update_slots: id 2 | task 9 | n_tokens = 0, memory_seq_rm [0, end)
slot update_slots: id 2 | task 9 | prompt processing progress, n_tokens = 29, batch.n_tokens = 29, progress = 0.027804
slot update_slots: id 2 | task 9 | n_tokens = 29, memory_seq_rm [29, end)
srv process_chun: processing image...
encoding image slice...
image slice encoded in 5013 ms
decoding image batch 1/1, n_tokens_batch = 1008
image decoded (batch 1/1) in 1794 ms
srv process_chun: image processed in 6807 ms
slot update_slots: id 2 | task 9 | prompt processing progress, n_tokens = 1043, batch.n_tokens = 6, progress = 1.000000
slot update_slots: id 2 | task 9 | prompt done, n_tokens = 1043, batch.n_tokens = 6
slot print_timing: id 2 | task 9 | prompt eval time = 8686.32 ms / 1043 tokens ( 8.33 ms per token, 120.07 tokens per second)
                                          eval time = 12585.55 ms / 128 tokens ( 98.32 ms per token, 10.17 tokens per second)
                                         total time = 21271.88 ms / 1171 tokens
srv log_server_r: request: POST /v1/chat/completions 127.0.0.1 200
slot release: id 2 | task 9 | stop processing: n_tokens = 1170, truncated = 0
srv update_slots: all slots are idle


client log:
jetson@jetson:~/VLM/llama.cpp/service_test$ python3 llama_client.py --url http://127.0.0.1:8080 --endpoint chat   -p "工人有没有戴手套,只回答是否。" -i Hat_detection_1_1762420687.jpg  --fmt image_url

{"choices":[{"finish_reason":"stop","index":0,"message":{"role":"assistant","content":""}}],"created":1762912501,"model":"qwen3-vl-8b-instruct","system_fingerprint":"b0-unknown","object":"chat.completion","usage":{"completion_tokens":2,"prompt_tokens":1040,"total_tokens":1042},"id":"chatcmpl-u27eSy92FPEyzuGg9tfulvj5aC4A18Z1","timings":{"cache_n":0,"prompt_n":1040,"prompt_ms":10969.18,"prompt_per_token_ms":10.547288461538463,"prompt_per_second":94.8110980036794,"predicted_n":2,"predicted_ms":114.441,"predicted_per_token_ms":57.2205,"predicted_per_second":17.4762541396877}}

jetson@jetson:~/VLM/llama.cpp/service_test$ python3 llama_client.py --url http://127.0.0.1:8080 --endpoint chat   -p "地面有没有水渍,只回答是否。" -i Hat_detection_1_1762486543.jpg  --fmt image_url

{"choices":[{"finish_reason":"length","index":0,"message":{"role":"assistant","content":"????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????"}}],"created":1762912540,"model":"qwen3-vl-8b-instruct","system_fingerprint":"b0-unknown","object":"chat.completion","usage":{"completion_tokens":128,"prompt_tokens":1040,"total_tokens":1168},"id":"chatcmpl-AzFxCZWiu0UHVljcKv6tQU6xoVWedu9L","timings":{"cache_n":0,"prompt_n":1040,"prompt_ms":8628.116,"prompt_per_token_ms":8.296265384615385,"prompt_per_second":120.5361633988231,"predicted_n":128,"predicted_ms":12670.28,"predicted_per_token_ms":98.9865625,"predicted_per_second":10.102381320696937}}
