From aa6192d38defc9a5d9fb2987413f71b18f214270 Mon Sep 17 00:00:00 2001 From: noname Date: Tue, 25 Nov 2025 17:05:26 +0100 Subject: [PATCH 1/9] server : add Anthropic Messages API support --- tools/server/README.md | 72 ++ tools/server/server-common.cpp | 437 ++++++++- tools/server/server-common.h | 9 + tools/server/server-http.cpp | 9 +- tools/server/server-task.cpp | 284 ++++++ tools/server/server-task.h | 7 + tools/server/server.cpp | 62 +- tools/server/tests/unit/test_anthropic_api.py | 838 ++++++++++++++++++ 8 files changed, 1712 insertions(+), 6 deletions(-) create mode 100644 tools/server/tests/unit/test_anthropic_api.py diff --git a/tools/server/README.md b/tools/server/README.md index 8fd478eb328..c4e1759e4fd 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -7,6 +7,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. **Features:** * LLM inference of F16 and quantized models on GPU and CPU * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes + * [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) compatible chat completions * Reranking endpoint (https://github.com/ggml-org/llama.cpp/pull/9510) * Parallel decoding with multi-user support * Continuous batching @@ -1343,6 +1344,77 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r }' ``` +### POST `/v1/messages`: Anthropic-compatible Messages API + +Given a list of `messages`, returns the assistant's response. Streaming is supported via Server-Sent Events. While no strong claims of compatibility with the Anthropic API spec are made, in our experience it suffices to support many apps. + +*Options:* + +See [Anthropic Messages API documentation](https://docs.anthropic.com/en/api/messages). Tool use requires `--jinja` flag. 
+ +`model`: Model identifier (required) + +`messages`: Array of message objects with `role` and `content` (required) + +`max_tokens`: Maximum tokens to generate (default: 4096) + +`system`: System prompt as string or array of content blocks + +`temperature`: Sampling temperature 0-1 (default: 1.0) + +`top_p`: Nucleus sampling (default: 1.0) + +`top_k`: Top-k sampling + +`stop_sequences`: Array of stop sequences + +`stream`: Enable streaming (default: false) + +`tools`: Array of tool definitions (requires `--jinja`) + +`tool_choice`: Tool selection mode (`{"type": "auto"}`, `{"type": "any"}`, or `{"type": "tool", "name": "..."}`) + +*Examples:* + +```shell +curl http://localhost:8080/v1/messages \ + -H "Content-Type: application/json" \ + -H "x-api-key: your-api-key" \ + -d '{ + "model": "gpt-4", + "max_tokens": 1024, + "system": "You are a helpful assistant.", + "messages": [ + {"role": "user", "content": "Hello!"} + ] + }' +``` + +### POST `/v1/messages/count_tokens`: Token Counting + +Counts the number of tokens in a request without generating a response. + +Accepts the same parameters as `/v1/messages`. The `max_tokens` parameter is not required. 
+ +*Example:* + +```shell +curl http://localhost:8080/v1/messages/count_tokens \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4", + "messages": [ + {"role": "user", "content": "Hello!"} + ] + }' +``` + +*Response:* + +```json +{"input_tokens": 10} +``` + ## More examples ### Interactive mode diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 18328f3afbd..523004977fa 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -593,6 +593,29 @@ llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, return prompt_tokens; } +std::string format_anthropic_sse(const json & data) { + std::ostringstream ss; + + auto send_event = [&ss](const json & event_obj) { + if (event_obj.contains("event") && event_obj.contains("data")) { + ss << "event: " << event_obj.at("event").get() << "\n"; + ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n"; + } else { + ss << "data: " << safe_json_to_str(event_obj) << "\n\n"; + } + }; + + if (data.is_array()) { + for (const auto & event : data) { + send_event(event); + } + } else { + send_event(data); + } + + return ss.str(); +} + size_t validate_utf8(const std::string& text) { size_t len = text.size(); if (len == 0) return 0; @@ -725,7 +748,6 @@ std::vector tokenize_input_prompts(const llama_vocab * vocab, mtm return result; } - // // OAI utils // @@ -1048,6 +1070,419 @@ json oaicompat_chat_params_parse( return llama_params; } +json anthropic_params_from_json( + const json & body_in, /* anthropic messages api json semantics */ + const oaicompat_parser_options & opt, + std::vector & out_files) +{ + json body = body_in; + json llama_params; + + if (body.contains("stop_sequences")) { + llama_params["stop"] = body.at("stop_sequences"); + } else { + llama_params["stop"] = json::array(); + } + + // handle max_tokens (required in Anthropic, but we're permissive) + if (!body.contains("max_tokens")) { + llama_params["n_predict"] = 4096; + 
} else { + llama_params["n_predict"] = body.at("max_tokens"); + } + + if (body.contains("top_k")) { + llama_params["top_k"] = body.at("top_k"); + } + + if (body.contains("thinking")) { + json thinking = json_value(body, "thinking", json::object()); + std::string thinking_type = json_value(thinking, "type", std::string()); + if (thinking_type == "enabled") { + int budget_tokens = json_value(thinking, "budget_tokens", 10000); + llama_params["thinking_budget_tokens"] = budget_tokens; + } + } + + if (body.contains("metadata")) { + json metadata = json_value(body, "metadata", json::object()); + std::string user_id = json_value(metadata, "user_id", std::string()); + if (!user_id.empty()) { + llama_params["__metadata_user_id"] = user_id; + } + } + + json oai_messages = json::array(); + auto system_param = json_value(body, "system", json()); + if (!system_param.is_null()) { + std::string system_content; + + if (system_param.is_string()) { + system_content = system_param.get(); + } else if (system_param.is_array()) { + for (const auto & block : system_param) { + if (json_value(block, "type", std::string()) == "text") { + system_content += json_value(block, "text", std::string()); + } + } + } + + oai_messages.push_back({ + {"role", "system"}, + {"content", system_content} + }); + } + + if (!body.contains("messages")) { + throw std::runtime_error("'messages' is required"); + } + json & messages = body.at("messages"); + if (!messages.is_array()) { + throw std::runtime_error("Expected 'messages' to be an array"); + } + + for (auto & msg : messages) { + std::string role = json_value(msg, "role", std::string()); + if (role != "assistant" && !msg.contains("content")) { + throw std::runtime_error("All non-assistant messages must contain 'content'"); + } + if (role == "assistant") { + if (!msg.contains("content")) { + continue; + } + } + + json & content = msg.at("content"); + + if (content.is_string()) { + oai_messages.push_back(msg); + continue; + } + + if (!content.is_array()) { 
+ throw std::runtime_error("Expected 'content' to be a string or an array"); + } + + json tool_calls = json::array(); + json converted_content = json::array(); + json tool_results = json::array(); + bool has_tool_calls = false; + + for (auto & block : content) { + std::string type = json_value(block, "type", std::string()); + + if (type == "text") { + converted_content.push_back(block); + } else if (type == "image") { + json source = json_value(block, "source", json::object()); + std::string source_type = json_value(source, "type", std::string()); + + if (source_type == "base64") { + std::string media_type = json_value(source, "media_type", std::string("image/jpeg")); + std::string data = json_value(source, "data", std::string()); + + converted_content.push_back({ + {"type", "image_url"}, + {"image_url", { + {"url", "data:" + media_type + ";base64," + data} + }} + }); + } else if (source_type == "url") { + std::string url = json_value(source, "url", std::string()); + converted_content.push_back({ + {"type", "image_url"}, + {"image_url", { + {"url", url} + }} + }); + } + } else if (type == "tool_use") { + tool_calls.push_back({ + {"id", json_value(block, "id", std::string())}, + {"type", "function"}, + {"function", { + {"name", json_value(block, "name", std::string())}, + {"arguments", json_value(block, "input", json::object()).dump()} + }} + }); + has_tool_calls = true; + } else if (type == "tool_result") { + std::string tool_use_id = json_value(block, "tool_use_id", std::string()); + + auto result_content = json_value(block, "content", json()); + std::string result_text; + if (result_content.is_string()) { + result_text = result_content.get(); + } else if (result_content.is_array()) { + for (const auto & c : result_content) { + if (json_value(c, "type", std::string()) == "text") { + result_text += json_value(c, "text", std::string()); + } + } + } + + tool_results.push_back({ + {"role", "tool"}, + {"tool_call_id", tool_use_id}, + {"content", result_text} + }); + } 
+ } + + if (!tool_results.empty()) { + if (!converted_content.empty() || has_tool_calls) { + json new_msg = {{"role", role}}; + if (!converted_content.empty()) { + new_msg["content"] = converted_content; + } else if (has_tool_calls) { + new_msg["content"] = ""; + } + if (!tool_calls.empty()) { + new_msg["tool_calls"] = tool_calls; + } + oai_messages.push_back(new_msg); + } + for (const auto & tool_msg : tool_results) { + oai_messages.push_back(tool_msg); + } + } else { + if (!converted_content.empty() || has_tool_calls) { + json new_msg = {{"role", role}}; + if (!converted_content.empty()) { + new_msg["content"] = converted_content; + } else if (has_tool_calls) { + new_msg["content"] = ""; + } + if (!tool_calls.empty()) { + new_msg["tool_calls"] = tool_calls; + } + oai_messages.push_back(new_msg); + } + } + } + + json oai_tools = json::array(); + if (body.contains("tools")) { + json & tools = body.at("tools"); + if (tools.is_array()) { + for (auto & tool : tools) { + oai_tools.push_back({ + {"type", "function"}, + {"function", { + {"name", json_value(tool, "name", std::string())}, + {"description", json_value(tool, "description", std::string())}, + {"parameters", tool.contains("input_schema") ? 
tool.at("input_schema") : json::object()} + }} + }); + } + } + } + + std::string oai_tool_choice = "auto"; + if (body.contains("tool_choice")) { + json & tc = body.at("tool_choice"); + if (tc.is_object()) { + std::string type = json_value(tc, "type", std::string()); + if (type == "auto") { + oai_tool_choice = "auto"; + } else if (type == "any") { + oai_tool_choice = "required"; + } else if (type == "tool") { + oai_tool_choice = "required"; + } + } + } + + for (auto & msg : oai_messages) { + if (!msg.contains("content")) { + continue; + } + json & content = msg.at("content"); + if (content.is_string() || content.is_null()) { + continue; + } + if (!content.is_array()) { + continue; + } + + for (auto & p : content) { + std::string type = json_value(p, "type", std::string()); + if (type == "image_url") { + if (!opt.allow_image) { + throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj"); + } + + json image_url = json_value(p, "image_url", json::object()); + std::string url = json_value(image_url, "url", std::string()); + if (string_starts_with(url, "http")) { + // download remote image + common_remote_params params; + params.headers.push_back("User-Agent: llama.cpp/" + build_info); + params.max_size = 1024 * 1024 * 10; // 10MB + params.timeout = 10; // seconds + SRV_INF("downloading image from '%s'\n", url.c_str()); + auto res = common_remote_get_content(url, params); + if (200 <= res.first && res.first < 300) { + SRV_INF("downloaded %ld bytes\n", res.second.size()); + raw_buffer data; + data.insert(data.end(), res.second.begin(), res.second.end()); + out_files.push_back(data); + } else { + throw std::runtime_error("Failed to download image"); + } + } else { + // try to decode base64 image + std::vector parts = string_split(url, /*separator*/ ','); + if (parts.size() != 2) { + throw std::runtime_error("Invalid image_url.url value"); + } else if (!string_starts_with(parts[0], "data:image/")) { + throw 
std::runtime_error("Invalid image_url.url format: " + parts[0]); + } else if (!string_ends_with(parts[0], "base64")) { + throw std::runtime_error("image_url.url must be base64 encoded"); + } else { + auto base64_data = parts[1]; + auto decoded_data = base64_decode(base64_data); + out_files.push_back(decoded_data); + } + } + + // replace this chunk with a marker + p["type"] = "text"; + p["text"] = mtmd_default_marker(); + p.erase("image_url"); + } else if (type == "input_audio") { + if (!opt.allow_audio) { + throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj"); + } + + json input_audio = json_value(p, "input_audio", json::object()); + std::string data = json_value(input_audio, "data", std::string()); + std::string format = json_value(input_audio, "format", std::string()); + if (format != "wav" && format != "mp3") { + throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'"); + } + auto decoded_data = base64_decode(data); + out_files.push_back(decoded_data); + + // replace this chunk with a marker + p["type"] = "text"; + p["text"] = mtmd_default_marker(); + p.erase("input_audio"); + } + } + } + + common_chat_templates_inputs inputs; + inputs.messages = common_chat_msgs_parse_oaicompat(oai_messages); + inputs.tools = common_chat_tools_parse_oaicompat(oai_tools); + inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(oai_tool_choice); + inputs.json_schema = ""; + inputs.grammar = ""; + inputs.use_jinja = opt.use_jinja; + inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); + inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); + inputs.reasoning_format = opt.reasoning_format; + inputs.enable_thinking = opt.enable_thinking; + + if (opt.enable_thinking && opt.prefill_assistant) { + if (!inputs.messages.empty() && inputs.messages.back().role == "assistant") { + inputs.enable_thinking = false; + } + } + + if 
(!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) { + llama_params["parse_tool_calls"] = true; + } + + // merge the template args provided from command line with the args provided in the user request + auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object()); + inputs.chat_template_kwargs = opt.chat_template_kwargs; + for (const auto & item : chat_template_kwargs_object.items()) { + inputs.chat_template_kwargs[item.key()] = item.value().dump(); + } + + // parse the "enable_thinking" kwarg to override the default value + auto enable_thinking_kwarg = json_value(inputs.chat_template_kwargs, "enable_thinking", std::string("")); + if (enable_thinking_kwarg == "true") { + inputs.enable_thinking = true; + } else if (enable_thinking_kwarg == "false") { + inputs.enable_thinking = false; + } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') { + throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)"); + } + + // if the assistant message appears at the end of list, we do not add end-of-turn token + bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant; + common_chat_msg last_message; + if (prefill_assistant_message) { + last_message = inputs.messages.back(); + inputs.messages.pop_back(); + + // sanity check, max one assistant message at the end of the list + if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){ + throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list."); + } + + inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE; + + if (inputs.enable_thinking) { + throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking."); + } + + inputs.add_generation_prompt = true; + } + + // Apply chat template to the list of messages + auto chat_params = common_chat_templates_apply(opt.tmpls, 
inputs); + + // Append assistant prefilled message + if (prefill_assistant_message) { + if (!last_message.content_parts.empty()) { + for (auto & p : last_message.content_parts) { + chat_params.prompt += p.text; + } + } else { + chat_params.prompt += last_message.content; + } + } + + llama_params["chat_format"] = static_cast(chat_params.format); + llama_params["prompt"] = chat_params.prompt; + if (!chat_params.grammar.empty()) { + llama_params["grammar"] = chat_params.grammar; + } + llama_params["grammar_lazy"] = chat_params.grammar_lazy; + auto grammar_triggers = json::array(); + for (const auto & trigger : chat_params.grammar_triggers) { + server_grammar_trigger ct(trigger); + grammar_triggers.push_back(ct.to_json()); + } + llama_params["grammar_triggers"] = grammar_triggers; + llama_params["preserved_tokens"] = chat_params.preserved_tokens; + llama_params["thinking_forced_open"] = chat_params.thinking_forced_open; + for (const auto & stop : chat_params.additional_stops) { + llama_params["stop"].push_back(stop); + } + + // Handle "n" field + int n_choices = json_value(body, "n", 1); + if (n_choices != 1) { + throw std::runtime_error("Only one completion choice is allowed"); + } + + // Copy remaining properties to llama_params + // This allows user to use llama.cpp-specific params like "mirostat", ... via Anthropic endpoint. 
+ // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp + for (const auto & item : body.items()) { + // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens" + if (!llama_params.contains(item.key()) || item.key() == "n_predict") { + llama_params[item.key()] = item.value(); + } + } + + return llama_params; +} + json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64) { json data = json::array(); int32_t n_tokens = 0; diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 868c5061031..4b393fd8b25 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -294,6 +294,12 @@ json oaicompat_chat_params_parse( const oaicompat_parser_options & opt, std::vector & out_files); +// used by Anthropic /v1/messages endpoint +json anthropic_params_from_json( + const json & body, /* anthropic messages api json semantics */ + const oaicompat_parser_options & opt, + std::vector & out_files); + // TODO: move it to server-task.cpp json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false); @@ -322,6 +328,9 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l // note: if data is a json array, it will be sent as multiple events, one per item std::string format_sse(const json & data); +// format Anthropic-style SSE with event types +std::string format_anthropic_sse(const json & data); + bool is_valid_utf8(const std::string & str); // diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index a82aa86b19e..fe532090100 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -136,7 +136,7 @@ bool server_http_context::init(const common_params & params) { return true; } - // Check for API key in the header + // Check for API key in the Authorization header auto auth_header = 
req.get_header_value("Authorization"); std::string prefix = "Bearer "; @@ -147,6 +147,13 @@ bool server_http_context::init(const common_params & params) { } } + // Check for API key in the x-api-key header + auto x_api_key_header = req.get_header_value("X-Api-Key"); + + if (std::find(api_keys.begin(), api_keys.end(), x_api_key_header) != api_keys.end()) { + return true; // API key is valid + } + // API key is invalid or not provided res.status = 401; res.set_content( diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index bc4436ba65b..aff9bf66b71 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -7,6 +7,8 @@ #include "sampling.h" #include "json-schema-to-grammar.h" +#include + using json = nlohmann::ordered_json; // @@ -572,6 +574,8 @@ json server_task_result_cmpl_final::to_json() { return to_json_oaicompat(); case OAICOMPAT_TYPE_CHAT: return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); + case OAICOMPAT_TYPE_ANTHROPIC: + return stream ? to_json_anthropic_stream() : to_json_anthropic(); default: GGML_ASSERT(false && "Invalid oaicompat_type"); } @@ -768,6 +772,188 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() { return deltas; } +json server_task_result_cmpl_final::to_json_anthropic() { + std::string stop_reason = "max_tokens"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + stop_reason = oaicompat_msg.tool_calls.empty() ? 
"end_turn" : "tool_use"; + } + + json content_blocks = json::array(); + + common_chat_msg msg; + if (!oaicompat_msg.empty()) { + msg = oaicompat_msg; + } else { + msg.role = "assistant"; + msg.content = content; + } + + if (!msg.content.empty()) { + content_blocks.push_back({ + {"type", "text"}, + {"text", msg.content} + }); + } + + for (const auto & tool_call : msg.tool_calls) { + json tool_use_block = { + {"type", "tool_use"}, + {"id", tool_call.id}, + {"name", tool_call.name} + }; + + try { + tool_use_block["input"] = json::parse(tool_call.arguments); + } catch (const std::exception &) { + tool_use_block["input"] = json::object(); + } + + content_blocks.push_back(tool_use_block); + } + + json res = { + {"id", oaicompat_cmpl_id}, + {"type", "message"}, + {"role", "assistant"}, + {"content", content_blocks}, + {"model", oaicompat_model}, + {"stop_reason", stop_reason}, + {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)}, + {"usage", { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded} + }} + }; + + return res; +} + +json server_task_result_cmpl_final::to_json_anthropic_stream() { + json events = json::array(); + + std::string stop_reason = "max_tokens"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + stop_reason = oaicompat_msg.tool_calls.empty() ? 
"end_turn" : "tool_use"; + } + + bool has_text = !oaicompat_msg.content.empty(); + size_t num_tool_calls = oaicompat_msg.tool_calls.size(); + + bool text_block_started = false; + std::set tool_calls_started; + + for (const auto & diff : oaicompat_msg_diffs) { + if (!diff.content_delta.empty()) { + if (!text_block_started) { + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", 0}, + {"content_block", { + {"type", "text"}, + {"text", ""} + }} + }} + }); + text_block_started = true; + } + + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", 0}, + {"delta", { + {"type", "text_delta"}, + {"text", diff.content_delta} + }} + }} + }); + } + + if (diff.tool_call_index != std::string::npos) { + size_t content_block_index = (has_text ? 1 : 0) + diff.tool_call_index; + + if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) { + const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index]; + + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", content_block_index}, + {"content_block", { + {"type", "tool_use"}, + {"id", full_tool_call.id}, + {"name", full_tool_call.name} + }} + }} + }); + tool_calls_started.insert(diff.tool_call_index); + } + + if (!diff.tool_call_delta.arguments.empty()) { + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", content_block_index}, + {"delta", { + {"type", "input_json_delta"}, + {"partial_json", diff.tool_call_delta.arguments} + }} + }} + }); + } + } + } + + if (has_text) { + events.push_back({ + {"event", "content_block_stop"}, + {"data", { + {"type", "content_block_stop"}, + {"index", 0} + }} + }); + } + + for (size_t i = 0; i < num_tool_calls; i++) { + size_t content_block_index = (has_text ? 
1 : 0) + i; + events.push_back({ + {"event", "content_block_stop"}, + {"data", { + {"type", "content_block_stop"}, + {"index", content_block_index} + }} + }); + } + + events.push_back({ + {"event", "message_delta"}, + {"data", { + {"type", "message_delta"}, + {"delta", { + {"stop_reason", stop_reason}, + {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)} + }}, + {"usage", { + {"output_tokens", n_decoded} + }} + }} + }); + + events.push_back({ + {"event", "message_stop"}, + {"data", { + {"type", "message_stop"} + }} + }); + + return events; +} + // // server_task_result_cmpl_partial // @@ -779,6 +965,8 @@ json server_task_result_cmpl_partial::to_json() { return to_json_oaicompat(); case OAICOMPAT_TYPE_CHAT: return to_json_oaicompat_chat(); + case OAICOMPAT_TYPE_ANTHROPIC: + return to_json_anthropic(); default: GGML_ASSERT(false && "Invalid oaicompat_type"); } @@ -936,6 +1124,102 @@ json server_task_result_rerank::to_json() { }; } +json server_task_result_cmpl_partial::to_json_anthropic() { + json events = json::array(); + bool first = (n_decoded == 1); + static bool text_block_started = false; + + if (first) { + text_block_started = false; + + events.push_back({ + {"event", "message_start"}, + {"data", { + {"type", "message_start"}, + {"message", { + {"id", oaicompat_cmpl_id}, + {"type", "message"}, + {"role", "assistant"}, + {"content", json::array()}, + {"model", oaicompat_model}, + {"stop_reason", nullptr}, + {"stop_sequence", nullptr}, + {"usage", { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", 0} + }} + }} + }} + }); + } + + for (const auto & diff : oaicompat_msg_diffs) { + if (!diff.content_delta.empty()) { + if (!text_block_started) { + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", 0}, + {"content_block", { + {"type", "text"}, + {"text", ""} + }} + }} + }); + text_block_started = true; + } + + events.push_back({ + {"event", "content_block_delta"}, + {"data", 
{ + {"type", "content_block_delta"}, + {"index", 0}, + {"delta", { + {"type", "text_delta"}, + {"text", diff.content_delta} + }} + }} + }); + } + + if (diff.tool_call_index != std::string::npos) { + size_t content_block_index = (text_block_started ? 1 : 0) + diff.tool_call_index; + + if (!diff.tool_call_delta.name.empty()) { + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", content_block_index}, + {"content_block", { + {"type", "tool_use"}, + {"id", diff.tool_call_delta.id}, + {"name", diff.tool_call_delta.name} + }} + }} + }); + } + + if (!diff.tool_call_delta.arguments.empty()) { + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", content_block_index}, + {"delta", { + {"type", "input_json_delta"}, + {"partial_json", diff.tool_call_delta.arguments} + }} + }} + }); + } + } + } + + return events; +} + // // server_task_result_error // diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 0271caae116..b96c00a96a5 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -32,6 +32,7 @@ enum oaicompat_type { OAICOMPAT_TYPE_CHAT, OAICOMPAT_TYPE_COMPLETION, OAICOMPAT_TYPE_EMBEDDING, + OAICOMPAT_TYPE_ANTHROPIC, }; enum stop_type { @@ -253,6 +254,10 @@ struct server_task_result_cmpl_final : server_task_result { json to_json_oaicompat_chat(); json to_json_oaicompat_chat_stream(); + + json to_json_anthropic(); + + json to_json_anthropic_stream(); }; struct server_task_result_cmpl_partial : server_task_result { @@ -292,6 +297,8 @@ struct server_task_result_cmpl_partial : server_task_result { json to_json_oaicompat(); json to_json_oaicompat_chat(); + + json to_json_anthropic(); }; struct server_task_result_embd : server_task_result { diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0f39def3794..dd36c400821 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2991,6 
+2991,38 @@ struct server_routes { OAICOMPAT_TYPE_CHAT); }; + server_http_context::handler_t post_anthropic_messages = [this](const server_http_req & req) { + std::vector files; + json body = json::parse(req.body); + json body_parsed = anthropic_params_from_json( + body, + ctx_server.oai_parser_opt, + files); + return handle_completions_impl( + SERVER_TASK_TYPE_COMPLETION, + body_parsed, + files, + req.should_stop, + OAICOMPAT_TYPE_ANTHROPIC); + }; + + server_http_context::handler_t post_anthropic_count_tokens = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + std::vector files; + json body = json::parse(req.body); + + json body_parsed = anthropic_params_from_json( + body, + ctx_server.oai_parser_opt, + files); + + json prompt = body_parsed.at("prompt"); + llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true); + + res->ok({{"input_tokens", static_cast(tokens.size())}}); + return res; + }; + // same with handle_chat_completions, but without inference part server_http_context::handler_t post_apply_template = [this](const server_http_req & req) { auto res = std::make_unique(ctx_server); @@ -3352,7 +3384,11 @@ struct server_routes { } // next responses are streamed - res->data = format_sse(first_result->to_json()); // to be sent immediately + if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) { + res->data = format_anthropic_sse(first_result->to_json()); + } else { + res->data = format_sse(first_result->to_json()); // to be sent immediately + } res->status = 200; res->content_type = "text/event-stream"; res->next = [res_this = res.get(), oaicompat, &should_stop](std::string & output) -> bool { @@ -3372,7 +3408,10 @@ struct server_routes { // check if there is more data if (!rd.has_next()) { - if (oaicompat != OAICOMPAT_TYPE_NONE) { + if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) { + // Anthropic doesn't send [DONE], message_stop was already sent + output = ""; + } else if (oaicompat != OAICOMPAT_TYPE_NONE) { output = "data: 
[DONE]\n\n"; } else { output = ""; @@ -3391,7 +3430,16 @@ struct server_routes { // send the results json res_json = result->to_json(); if (result->is_error()) { - output = format_sse(json {{ "error", res_json }}); + if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) { + json error_event = json::object(); + error_event["event"] = "error"; + error_event["data"] = res_json; + json error_array = json::array(); + error_array.push_back(error_event); + output = format_anthropic_sse(error_array); + } else { + output = format_sse(json {{ "error", res_json }}); + } SRV_DBG("%s", "error received during streaming, terminating stream\n"); return false; // terminate on error } else { @@ -3399,7 +3447,11 @@ struct server_routes { dynamic_cast(result.get()) != nullptr || dynamic_cast(result.get()) != nullptr ); - output = format_sse(res_json); + if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) { + output = format_anthropic_sse(res_json); + } else { + output = format_sse(res_json); + } } // has next data, continue @@ -3712,6 +3764,8 @@ int main(int argc, char ** argv) { ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint + ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API + ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting ctx_http.post("/infill", ex_wrapper(routes.post_infill)); ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); diff --git a/tools/server/tests/unit/test_anthropic_api.py b/tools/server/tests/unit/test_anthropic_api.py new file mode 100644 index 00000000000..23466b55991 --- /dev/null +++ b/tools/server/tests/unit/test_anthropic_api.py @@ -0,0 +1,838 @@ +#!/usr/bin/env 
python3 +import pytest +import base64 +import requests +import os + +# ensure grandparent path is in sys.path +from pathlib import Path +import sys +path = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(path)) + +from utils import * + +server: ServerProcess + +TIMEOUT_START_SLOW = 15 * 60 +TIMEOUT_HTTP_REQUEST = 60 + + +def get_test_image_base64() -> str: + """Get a test image in base64 format""" + # Use the same test image as test_vision_api.py + IMG_URL = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/11_truck.png" + response = requests.get(IMG_URL) + response.raise_for_status() + return base64.b64encode(response.content).decode("utf-8") + +@pytest.fixture(autouse=True) +def create_server(): + global server + server = ServerPreset.tinyllama2() + server.model_alias = "tinyllama-2-anthropic" + server.server_port = 8082 + server.n_slots = 1 + server.n_ctx = 8192 + server.n_batch = 2048 + + +@pytest.fixture +def vision_server(): + """Separate fixture for vision tests that require multimodal support""" + global server + server = ServerPreset.tinygemma3() + server.offline = False # Allow downloading the model + server.model_alias = "tinygemma3-anthropic" + server.server_port = 8083 # Different port to avoid conflicts + server.n_slots = 1 + return server + + +# Basic message tests + +def test_anthropic_messages_basic(): + """Test basic Anthropic messages endpoint""" + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 50, + "messages": [ + {"role": "user", "content": "Say hello"} + ] + }) + + assert res.status_code == 200, f"Expected 200, got {res.status_code}" + assert res.body["type"] == "message", f"Expected type 'message', got {res.body.get('type')}" + assert res.body["role"] == "assistant", f"Expected role 'assistant', got {res.body.get('role')}" + assert "content" in res.body, "Missing 'content' field" + assert isinstance(res.body["content"], list), "Content should be an 
array" + assert len(res.body["content"]) > 0, "Content array should not be empty" + assert res.body["content"][0]["type"] == "text", "First content block should be text" + assert "text" in res.body["content"][0], "Text content block missing 'text' field" + assert res.body["stop_reason"] in ["end_turn", "max_tokens"], f"Invalid stop_reason: {res.body.get('stop_reason')}" + assert "usage" in res.body, "Missing 'usage' field" + assert "input_tokens" in res.body["usage"], "Missing usage.input_tokens" + assert "output_tokens" in res.body["usage"], "Missing usage.output_tokens" + assert isinstance(res.body["usage"]["input_tokens"], int), "input_tokens should be integer" + assert isinstance(res.body["usage"]["output_tokens"], int), "output_tokens should be integer" + assert res.body["usage"]["output_tokens"] > 0, "Should have generated some tokens" + # Anthropic API should NOT include timings + assert "timings" not in res.body, "Anthropic API should not include timings field" + + +def test_anthropic_messages_with_system(): + """Test messages with system prompt""" + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 50, + "system": "You are a helpful assistant.", + "messages": [ + {"role": "user", "content": "Hello"} + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + assert len(res.body["content"]) > 0 + + +def test_anthropic_messages_multipart_content(): + """Test messages with multipart content blocks""" + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 50, + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is"}, + {"type": "text", "text": " the answer?"} + ] + } + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + + +def test_anthropic_messages_conversation(): + """Test multi-turn conversation""" + server.start() + + res = server.make_request("POST", 
"/v1/messages", data={ + "model": "test", + "max_tokens": 50, + "messages": [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + {"role": "user", "content": "How are you?"} + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + + +# Streaming tests + +def test_anthropic_messages_streaming(): + """Test streaming messages""" + server.start() + + res = server.make_stream_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 30, + "messages": [ + {"role": "user", "content": "Say hello"} + ], + "stream": True + }) + + events = [] + for data in res: + # Each event should have type and other fields + assert "type" in data, f"Missing 'type' in event: {data}" + events.append(data) + + # Verify event sequence + event_types = [e["type"] for e in events] + assert "message_start" in event_types, "Missing message_start event" + assert "content_block_start" in event_types, "Missing content_block_start event" + assert "content_block_delta" in event_types, "Missing content_block_delta event" + assert "content_block_stop" in event_types, "Missing content_block_stop event" + assert "message_delta" in event_types, "Missing message_delta event" + assert "message_stop" in event_types, "Missing message_stop event" + + # Check message_start structure + message_start = next(e for e in events if e["type"] == "message_start") + assert "message" in message_start, "message_start missing 'message' field" + assert message_start["message"]["type"] == "message" + assert message_start["message"]["role"] == "assistant" + assert message_start["message"]["content"] == [] + assert "usage" in message_start["message"] + assert message_start["message"]["usage"]["input_tokens"] > 0 + + # Check content_block_start + block_start = next(e for e in events if e["type"] == "content_block_start") + assert "index" in block_start, "content_block_start missing 'index'" + assert block_start["index"] == 0, "First content block 
should be at index 0" + assert "content_block" in block_start + assert block_start["content_block"]["type"] == "text" + + # Check content_block_delta + deltas = [e for e in events if e["type"] == "content_block_delta"] + assert len(deltas) > 0, "Should have at least one content_block_delta" + for delta in deltas: + assert "index" in delta + assert "delta" in delta + assert delta["delta"]["type"] == "text_delta" + assert "text" in delta["delta"] + + # Check content_block_stop + block_stop = next(e for e in events if e["type"] == "content_block_stop") + assert "index" in block_stop + assert block_stop["index"] == 0 + + # Check message_delta + message_delta = next(e for e in events if e["type"] == "message_delta") + assert "delta" in message_delta + assert "stop_reason" in message_delta["delta"] + assert message_delta["delta"]["stop_reason"] in ["end_turn", "max_tokens"] + assert "usage" in message_delta + assert message_delta["usage"]["output_tokens"] > 0 + + # Check message_stop + message_stop = next(e for e in events if e["type"] == "message_stop") + # message_stop should NOT have timings for Anthropic API + assert "timings" not in message_stop, "Anthropic streaming should not include timings" + + +# Token counting tests + +def test_anthropic_count_tokens(): + """Test token counting endpoint""" + server.start() + + res = server.make_request("POST", "/v1/messages/count_tokens", data={ + "model": "test", + "messages": [ + {"role": "user", "content": "Hello world"} + ] + }) + + assert res.status_code == 200 + assert "input_tokens" in res.body + assert isinstance(res.body["input_tokens"], int) + assert res.body["input_tokens"] > 0 + # Should only have input_tokens, no other fields + assert "output_tokens" not in res.body + + +def test_anthropic_count_tokens_with_system(): + """Test token counting with system prompt""" + server.start() + + res = server.make_request("POST", "/v1/messages/count_tokens", data={ + "model": "test", + "system": "You are a helpful assistant.", 
+ "messages": [ + {"role": "user", "content": "Hello"} + ] + }) + + assert res.status_code == 200 + assert res.body["input_tokens"] > 0 + + +def test_anthropic_count_tokens_no_max_tokens(): + """Test that count_tokens doesn't require max_tokens""" + server.start() + + # max_tokens is NOT required for count_tokens + res = server.make_request("POST", "/v1/messages/count_tokens", data={ + "model": "test", + "messages": [ + {"role": "user", "content": "Hello"} + ] + }) + + assert res.status_code == 200 + assert "input_tokens" in res.body + + +# Tool use tests + +@pytest.mark.slow +def test_anthropic_tool_use_basic(): + """Test basic tool use""" + server.jinja = True + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 200, + "tools": [{ + "name": "get_weather", + "description": "Get the current weather in a location", + "input_schema": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City name" + } + }, + "required": ["location"] + } + }], + "messages": [ + {"role": "user", "content": "What's the weather in Paris?"} + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + assert len(res.body["content"]) > 0 + + # Check if model used the tool (it might not always, depending on the model) + content_types = [block.get("type") for block in res.body["content"]] + + if "tool_use" in content_types: + # Model used the tool + assert res.body["stop_reason"] == "tool_use" + + # Find the tool_use block + tool_block = next(b for b in res.body["content"] if b.get("type") == "tool_use") + assert "id" in tool_block + assert "name" in tool_block + assert tool_block["name"] == "get_weather" + assert "input" in tool_block + assert isinstance(tool_block["input"], dict) + + +@pytest.mark.slow +def test_anthropic_tool_result(): + """Test sending tool results back + + This test verifies that tool_result blocks are properly converted to + role="tool" messages 
internally. Without proper conversion, this would + fail with a 500 error: "unsupported content[].type" because tool_result + blocks would remain in the user message content array. + """ + server.jinja = True + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 100, + "messages": [ + {"role": "user", "content": "What's the weather?"}, + { + "role": "assistant", + "content": [ + { + "type": "tool_use", + "id": "test123", + "name": "get_weather", + "input": {"location": "Paris"} + } + ] + }, + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": "test123", + "content": "The weather is sunny, 25°C" + } + ] + } + ] + }) + + # This would be 500 with the old bug where tool_result blocks weren't converted + assert res.status_code == 200 + assert res.body["type"] == "message" + # Model should respond to the tool result + assert len(res.body["content"]) > 0 + assert res.body["content"][0]["type"] == "text" + + +@pytest.mark.slow +def test_anthropic_tool_result_with_text(): + """Test tool result mixed with text content + + This tests the edge case where a user message contains both text and + tool_result blocks. The server must properly split these into separate + messages: a user message with text, followed by tool messages. 
+ Without proper handling, this would fail with 500: "unsupported content[].type" + """ + server.jinja = True + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 100, + "messages": [ + {"role": "user", "content": "What's the weather?"}, + { + "role": "assistant", + "content": [ + { + "type": "tool_use", + "id": "tool_1", + "name": "get_weather", + "input": {"location": "Paris"} + } + ] + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Here are the results:"}, + { + "type": "tool_result", + "tool_use_id": "tool_1", + "content": "Sunny, 25°C" + } + ] + } + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + assert len(res.body["content"]) > 0 + + +@pytest.mark.slow +def test_anthropic_tool_result_error(): + """Test tool result with error flag""" + server.jinja = True + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 100, + "messages": [ + {"role": "user", "content": "Get the weather"}, + { + "role": "assistant", + "content": [ + { + "type": "tool_use", + "id": "test123", + "name": "get_weather", + "input": {"location": "InvalidCity"} + } + ] + }, + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": "test123", + "is_error": True, + "content": "City not found" + } + ] + } + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + + +@pytest.mark.slow +def test_anthropic_tool_streaming(): + """Test streaming with tool use""" + server.jinja = True + server.start() + + res = server.make_stream_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 200, + "stream": True, + "tools": [{ + "name": "calculator", + "description": "Calculate math", + "input_schema": { + "type": "object", + "properties": { + "expression": {"type": "string"} + }, + "required": ["expression"] + } + }], + "messages": [ + {"role": "user", "content": 
"Calculate 2+2"} + ] + }) + + events = [] + for data in res: + events.append(data) + + event_types = [e["type"] for e in events] + + # Should have basic events + assert "message_start" in event_types + assert "message_stop" in event_types + + # If tool was used, check for proper tool streaming + if any(e.get("type") == "content_block_start" and + e.get("content_block", {}).get("type") == "tool_use" + for e in events): + # Find tool use block start + tool_starts = [e for e in events if + e.get("type") == "content_block_start" and + e.get("content_block", {}).get("type") == "tool_use"] + + assert len(tool_starts) > 0, "Should have tool_use content_block_start" + + # Check index is correct (should be 0 if no text, 1 if there's text) + tool_start = tool_starts[0] + assert "index" in tool_start + assert tool_start["content_block"]["type"] == "tool_use" + assert "name" in tool_start["content_block"] + + +# Vision/multimodal tests + +def test_anthropic_vision_format_accepted(): + """Test that Anthropic vision format is accepted (format validation only)""" + server.start() + + # Small 1x1 red PNG image in base64 + red_pixel_png = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==" + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 10, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": red_pixel_png + } + }, + { + "type": "text", + "text": "What is this?" 
+ } + ] + } + ] + }) + + # Server accepts the format but tinyllama doesn't support images + # So it should return 500 with clear error message about missing mmproj + assert res.status_code == 500 + assert "image input is not supported" in res.body.get("error", {}).get("message", "").lower() + + +@pytest.mark.slow +@pytest.mark.skipif( + "SLOW_TESTS" not in os.environ, + reason="Vision test requires tinygemma3 model download (~100MB) - use SLOW_TESTS=1 to run" +) +def test_anthropic_vision_base64_with_multimodal_model(vision_server): + """ + Test vision with base64 image using Anthropic format with multimodal model + + NOTE: This test requires downloading: + - tinygemma3 model (~100MB) + - mmproj file for vision support + + To run this test: + SLOW_TESTS=1 ./tests.sh unit/test_anthropic_api.py::test_anthropic_vision_base64_with_multimodal_model -v + """ + global server + server = vision_server + server.start() + + # Get test image in base64 format + image_base64 = get_test_image_base64() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 10, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": image_base64 + } + }, + { + "type": "text", + "text": "What is this:\n" + } + ] + } + ] + }) + + assert res.status_code == 200, f"Expected 200, got {res.status_code}: {res.body}" + assert res.body["type"] == "message" + assert len(res.body["content"]) > 0 + assert res.body["content"][0]["type"] == "text" + # The model should generate some response about the image + assert len(res.body["content"][0]["text"]) > 0 + + +# Parameter tests + +def test_anthropic_stop_sequences(): + """Test stop_sequences parameter""" + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 100, + "stop_sequences": ["\n", "END"], + "messages": [ + {"role": "user", "content": "Count to 10"} + ] + }) + + 
assert res.status_code == 200 + assert res.body["type"] == "message" + + +def test_anthropic_temperature(): + """Test temperature parameter""" + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 50, + "temperature": 0.5, + "messages": [ + {"role": "user", "content": "Hello"} + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + + +def test_anthropic_top_p(): + """Test top_p parameter""" + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 50, + "top_p": 0.9, + "messages": [ + {"role": "user", "content": "Hello"} + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + + +def test_anthropic_top_k(): + """Test top_k parameter (llama.cpp specific)""" + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 50, + "top_k": 40, + "messages": [ + {"role": "user", "content": "Hello"} + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + + +# Error handling tests + +def test_anthropic_missing_messages(): + """Test error when messages are missing""" + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 50 + # missing "messages" field + }) + + # Should return an error (400 or 500) + assert res.status_code >= 400 + + +def test_anthropic_empty_messages(): + """Test permissive handling of empty messages array""" + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 50, + "messages": [] + }) + + # Server is permissive and accepts empty messages (provides defaults) + # This matches the permissive validation design choice + assert res.status_code == 200 + assert res.body["type"] == "message" + + +# Content block index tests + +@pytest.mark.slow +def test_anthropic_streaming_content_block_indices(): + 
"""Test that content block indices are correct in streaming""" + server.jinja = True + server.start() + + # Request that might produce both text and tool use + res = server.make_stream_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 200, + "stream": True, + "tools": [{ + "name": "test_tool", + "description": "A test tool", + "input_schema": { + "type": "object", + "properties": { + "param": {"type": "string"} + }, + "required": ["param"] + } + }], + "messages": [ + {"role": "user", "content": "Use the test tool"} + ] + }) + + events = [] + for data in res: + events.append(data) + + # Check content_block_start events have sequential indices + block_starts = [e for e in events if e.get("type") == "content_block_start"] + if len(block_starts) > 1: + # If there are multiple blocks, indices should be sequential + indices = [e["index"] for e in block_starts] + expected_indices = list(range(len(block_starts))) + assert indices == expected_indices, f"Expected indices {expected_indices}, got {indices}" + + # Check content_block_stop events match the starts + block_stops = [e for e in events if e.get("type") == "content_block_stop"] + start_indices = set(e["index"] for e in block_starts) + stop_indices = set(e["index"] for e in block_stops) + assert start_indices == stop_indices, "content_block_stop indices should match content_block_start indices" + + +# Extended features tests + +@pytest.mark.slow +def test_anthropic_thinking(): + """Test extended thinking parameter""" + server.jinja = True + server.start() + + res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 100, + "thinking": { + "type": "enabled", + "budget_tokens": 50 + }, + "messages": [ + {"role": "user", "content": "What is 2+2?"} + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + + +def test_anthropic_metadata(): + """Test metadata parameter""" + server.start() + + res = server.make_request("POST", "/v1/messages", 
data={ + "model": "test", + "max_tokens": 50, + "metadata": { + "user_id": "test_user_123" + }, + "messages": [ + {"role": "user", "content": "Hello"} + ] + }) + + assert res.status_code == 200 + assert res.body["type"] == "message" + + +# Compatibility tests + +def test_anthropic_vs_openai_different_response_format(): + """Verify Anthropic format is different from OpenAI format""" + server.start() + + # Make OpenAI request + openai_res = server.make_request("POST", "/v1/chat/completions", data={ + "model": "test", + "max_tokens": 50, + "messages": [ + {"role": "user", "content": "Hello"} + ] + }) + + # Make Anthropic request + anthropic_res = server.make_request("POST", "/v1/messages", data={ + "model": "test", + "max_tokens": 50, + "messages": [ + {"role": "user", "content": "Hello"} + ] + }) + + assert openai_res.status_code == 200 + assert anthropic_res.status_code == 200 + + # OpenAI has "object", Anthropic has "type" + assert "object" in openai_res.body + assert "type" in anthropic_res.body + assert openai_res.body["object"] == "chat.completion" + assert anthropic_res.body["type"] == "message" + + # OpenAI has "choices", Anthropic has "content" + assert "choices" in openai_res.body + assert "content" in anthropic_res.body + + # Different usage field names + assert "prompt_tokens" in openai_res.body["usage"] + assert "input_tokens" in anthropic_res.body["usage"] + assert "completion_tokens" in openai_res.body["usage"] + assert "output_tokens" in anthropic_res.body["usage"] From f7d463d83d7e01c1820a28c75e6b46156dbcb0c5 Mon Sep 17 00:00:00 2001 From: noname Date: Wed, 26 Nov 2025 16:49:10 +0100 Subject: [PATCH 2/9] remove -@pytest.mark.slow from tool calling/jinja tests --- tools/server/tests/unit/test_anthropic_api.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tools/server/tests/unit/test_anthropic_api.py b/tools/server/tests/unit/test_anthropic_api.py index 23466b55991..54b958ed467 100644 --- a/tools/server/tests/unit/test_anthropic_api.py +++ 
b/tools/server/tests/unit/test_anthropic_api.py @@ -269,7 +269,6 @@ def test_anthropic_count_tokens_no_max_tokens(): # Tool use tests -@pytest.mark.slow def test_anthropic_tool_use_basic(): """Test basic tool use""" server.jinja = True @@ -317,7 +316,6 @@ def test_anthropic_tool_use_basic(): assert isinstance(tool_block["input"], dict) -@pytest.mark.slow def test_anthropic_tool_result(): """Test sending tool results back @@ -366,7 +364,6 @@ def test_anthropic_tool_result(): assert res.body["content"][0]["type"] == "text" -@pytest.mark.slow def test_anthropic_tool_result_with_text(): """Test tool result mixed with text content @@ -413,7 +410,6 @@ def test_anthropic_tool_result_with_text(): assert len(res.body["content"]) > 0 -@pytest.mark.slow def test_anthropic_tool_result_error(): """Test tool result with error flag""" server.jinja = True @@ -453,7 +449,6 @@ def test_anthropic_tool_result_error(): assert res.body["type"] == "message" -@pytest.mark.slow def test_anthropic_tool_streaming(): """Test streaming with tool use""" server.jinja = True @@ -705,7 +700,6 @@ def test_anthropic_empty_messages(): # Content block index tests -@pytest.mark.slow def test_anthropic_streaming_content_block_indices(): """Test that content block indices are correct in streaming""" server.jinja = True @@ -753,7 +747,6 @@ def test_anthropic_streaming_content_block_indices(): # Extended features tests -@pytest.mark.slow def test_anthropic_thinking(): """Test extended thinking parameter""" server.jinja = True From 32b65f02f615014b6e542a0eeefa949b837179ea Mon Sep 17 00:00:00 2001 From: noname Date: Thu, 27 Nov 2025 09:21:41 +0100 Subject: [PATCH 3/9] server : remove unused code and slow/skip on test_anthropic_vision_base64_with_multimodal_model in test_anthropic_api.py --- tools/server/tests/unit/test_anthropic_api.py | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/tools/server/tests/unit/test_anthropic_api.py 
b/tools/server/tests/unit/test_anthropic_api.py index 54b958ed467..d55dd1d9454 100644 --- a/tools/server/tests/unit/test_anthropic_api.py +++ b/tools/server/tests/unit/test_anthropic_api.py @@ -2,21 +2,11 @@ import pytest import base64 import requests -import os - -# ensure grandparent path is in sys.path -from pathlib import Path -import sys -path = Path(__file__).resolve().parents[1] -sys.path.insert(0, str(path)) from utils import * server: ServerProcess -TIMEOUT_START_SLOW = 15 * 60 -TIMEOUT_HTTP_REQUEST = 60 - def get_test_image_base64() -> str: """Get a test image in base64 format""" @@ -541,22 +531,8 @@ def test_anthropic_vision_format_accepted(): assert "image input is not supported" in res.body.get("error", {}).get("message", "").lower() -@pytest.mark.slow -@pytest.mark.skipif( - "SLOW_TESTS" not in os.environ, - reason="Vision test requires tinygemma3 model download (~100MB) - use SLOW_TESTS=1 to run" -) def test_anthropic_vision_base64_with_multimodal_model(vision_server): - """ - Test vision with base64 image using Anthropic format with multimodal model - - NOTE: This test requires downloading: - - tinygemma3 model (~100MB) - - mmproj file for vision support - - To run this test: - SLOW_TESTS=1 ./tests.sh unit/test_anthropic_api.py::test_anthropic_vision_base64_with_multimodal_model -v - """ + """Test vision with base64 image using Anthropic format with multimodal model""" global server server = vision_server server.start() From c922b4a0445682d078fce9e6045b757af1e5655e Mon Sep 17 00:00:00 2001 From: noname Date: Thu, 27 Nov 2025 09:26:00 +0100 Subject: [PATCH 4/9] server : removed redundant n field logic in anthropic_params_from_json --- tools/server/server-common.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 523004977fa..7214ee4ee3b 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1464,18 +1464,11 @@ json 
anthropic_params_from_json( llama_params["stop"].push_back(stop); } - // Handle "n" field - int n_choices = json_value(body, "n", 1); - if (n_choices != 1) { - throw std::runtime_error("Only one completion choice is allowed"); - } - // Copy remaining properties to llama_params // This allows user to use llama.cpp-specific params like "mirostat", ... via Anthropic endpoint. // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp for (const auto & item : body.items()) { - // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens" - if (!llama_params.contains(item.key()) || item.key() == "n_predict") { + if (!llama_params.contains(item.key())) { llama_params[item.key()] = item.value(); } } From f388e35ce5427e9d457499ac83632bf80f5e9d0d Mon Sep 17 00:00:00 2001 From: noname Date: Thu, 27 Nov 2025 10:28:44 +0100 Subject: [PATCH 5/9] server : use single error object instead of error_array in streaming response handler for /v1/chat/completions and use unordered_set instead of set in to_json_anthropic_stream() --- tools/server/server-task.cpp | 4 +--- tools/server/server.cpp | 10 ++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index aff9bf66b71..b18239211f6 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -7,8 +7,6 @@ #include "sampling.h" #include "json-schema-to-grammar.h" -#include - using json = nlohmann::ordered_json; // @@ -840,7 +838,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() { size_t num_tool_calls = oaicompat_msg.tool_calls.size(); bool text_block_started = false; - std::set tool_calls_started; + std::unordered_set tool_calls_started; for (const auto & diff : oaicompat_msg_diffs) { if (!diff.content_delta.empty()) { diff --git a/tools/server/server.cpp b/tools/server/server.cpp index dd36c400821..c9027bacef2 100644 --- a/tools/server/server.cpp +++ 
b/tools/server/server.cpp @@ -3431,12 +3431,10 @@ struct server_routes { json res_json = result->to_json(); if (result->is_error()) { if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) { - json error_event = json::object(); - error_event["event"] = "error"; - error_event["data"] = res_json; - json error_array = json::array(); - error_array.push_back(error_event); - output = format_anthropic_sse(error_array); + output = format_anthropic_sse({ + {"event", "error"}, + {"data", res_json}, + }); } else { output = format_sse(json {{ "error", res_json }}); } From 728d4ecf3e7ad15b099b417ee3d59e8527064707 Mon Sep 17 00:00:00 2001 From: noname Date: Thu, 27 Nov 2025 10:58:17 +0100 Subject: [PATCH 6/9] server : refactor Anthropic API to use OAI conversion --- tools/server/server-common.cpp | 439 ++++++++++----------------------- tools/server/server-common.h | 3 + 2 files changed, 131 insertions(+), 311 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 7214ee4ee3b..2e3903b1b45 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1070,48 +1070,10 @@ json oaicompat_chat_params_parse( return llama_params; } -json anthropic_params_from_json( - const json & body_in, /* anthropic messages api json semantics */ - const oaicompat_parser_options & opt, - std::vector & out_files) -{ - json body = body_in; - json llama_params; - - if (body.contains("stop_sequences")) { - llama_params["stop"] = body.at("stop_sequences"); - } else { - llama_params["stop"] = json::array(); - } - - // handle max_tokens (required in Anthropic, but we're permissive) - if (!body.contains("max_tokens")) { - llama_params["n_predict"] = 4096; - } else { - llama_params["n_predict"] = body.at("max_tokens"); - } - - if (body.contains("top_k")) { - llama_params["top_k"] = body.at("top_k"); - } - - if (body.contains("thinking")) { - json thinking = json_value(body, "thinking", json::object()); - std::string thinking_type = json_value(thinking, 
"type", std::string()); - if (thinking_type == "enabled") { - int budget_tokens = json_value(thinking, "budget_tokens", 10000); - llama_params["thinking_budget_tokens"] = budget_tokens; - } - } - - if (body.contains("metadata")) { - json metadata = json_value(body, "metadata", json::object()); - std::string user_id = json_value(metadata, "user_id", std::string()); - if (!user_id.empty()) { - llama_params["__metadata_user_id"] = user_id; - } - } +json convert_anthropic_to_oai(const json & body) { + json oai_body; + // Convert system prompt json oai_messages = json::array(); auto system_param = json_value(body, "system", json()); if (!system_param.is_null()) { @@ -1133,103 +1095,101 @@ json anthropic_params_from_json( }); } + // Convert messages if (!body.contains("messages")) { throw std::runtime_error("'messages' is required"); } - json & messages = body.at("messages"); - if (!messages.is_array()) { - throw std::runtime_error("Expected 'messages' to be an array"); - } + const json & messages = body.at("messages"); + if (messages.is_array()) { + for (const auto & msg : messages) { + std::string role = json_value(msg, "role", std::string()); - for (auto & msg : messages) { - std::string role = json_value(msg, "role", std::string()); - if (role != "assistant" && !msg.contains("content")) { - throw std::runtime_error("All non-assistant messages must contain 'content'"); - } - if (role == "assistant") { if (!msg.contains("content")) { + if (role == "assistant") { + continue; + } + oai_messages.push_back(msg); continue; } - } - - json & content = msg.at("content"); - if (content.is_string()) { - oai_messages.push_back(msg); - continue; - } + const json & content = msg.at("content"); - if (!content.is_array()) { - throw std::runtime_error("Expected 'content' to be a string or an array"); - } - - json tool_calls = json::array(); - json converted_content = json::array(); - json tool_results = json::array(); - bool has_tool_calls = false; - - for (auto & block : content) { - 
std::string type = json_value(block, "type", std::string()); - - if (type == "text") { - converted_content.push_back(block); - } else if (type == "image") { - json source = json_value(block, "source", json::object()); - std::string source_type = json_value(source, "type", std::string()); + if (content.is_string()) { + oai_messages.push_back(msg); + continue; + } - if (source_type == "base64") { - std::string media_type = json_value(source, "media_type", std::string("image/jpeg")); - std::string data = json_value(source, "data", std::string()); + if (!content.is_array()) { + oai_messages.push_back(msg); + continue; + } - converted_content.push_back({ - {"type", "image_url"}, - {"image_url", { - {"url", "data:" + media_type + ";base64," + data} - }} - }); - } else if (source_type == "url") { - std::string url = json_value(source, "url", std::string()); - converted_content.push_back({ - {"type", "image_url"}, - {"image_url", { - {"url", url} + json tool_calls = json::array(); + json converted_content = json::array(); + json tool_results = json::array(); + bool has_tool_calls = false; + + for (const auto & block : content) { + std::string type = json_value(block, "type", std::string()); + + if (type == "text") { + converted_content.push_back(block); + } else if (type == "image") { + json source = json_value(block, "source", json::object()); + std::string source_type = json_value(source, "type", std::string()); + + if (source_type == "base64") { + std::string media_type = json_value(source, "media_type", std::string("image/jpeg")); + std::string data = json_value(source, "data", std::string()); + + converted_content.push_back({ + {"type", "image_url"}, + {"image_url", { + {"url", "data:" + media_type + ";base64," + data} + }} + }); + } else if (source_type == "url") { + std::string url = json_value(source, "url", std::string()); + converted_content.push_back({ + {"type", "image_url"}, + {"image_url", { + {"url", url} + }} + }); + } + } else if (type == "tool_use") { + 
tool_calls.push_back({ + {"id", json_value(block, "id", std::string())}, + {"type", "function"}, + {"function", { + {"name", json_value(block, "name", std::string())}, + {"arguments", json_value(block, "input", json::object()).dump()} }} }); - } - } else if (type == "tool_use") { - tool_calls.push_back({ - {"id", json_value(block, "id", std::string())}, - {"type", "function"}, - {"function", { - {"name", json_value(block, "name", std::string())}, - {"arguments", json_value(block, "input", json::object()).dump()} - }} - }); - has_tool_calls = true; - } else if (type == "tool_result") { - std::string tool_use_id = json_value(block, "tool_use_id", std::string()); - - auto result_content = json_value(block, "content", json()); - std::string result_text; - if (result_content.is_string()) { - result_text = result_content.get(); - } else if (result_content.is_array()) { - for (const auto & c : result_content) { - if (json_value(c, "type", std::string()) == "text") { - result_text += json_value(c, "text", std::string()); + has_tool_calls = true; + } else if (type == "tool_result") { + std::string tool_use_id = json_value(block, "tool_use_id", std::string()); + + auto result_content = json_value(block, "content", json()); + std::string result_text; + if (result_content.is_string()) { + result_text = result_content.get(); + } else if (result_content.is_array()) { + for (const auto & c : result_content) { + if (json_value(c, "type", std::string()) == "text") { + result_text += json_value(c, "text", std::string()); + } } } - } - tool_results.push_back({ - {"role", "tool"}, - {"tool_call_id", tool_use_id}, - {"content", result_text} - }); + tool_results.push_back({ + {"role", "tool"}, + {"tool_call_id", tool_use_id}, + {"content", result_text} + }); + } } - } - if (!tool_results.empty()) { if (!converted_content.empty() || has_tool_calls) { json new_msg = {{"role", role}}; if (!converted_content.empty()) { @@ -1242,30 +1202,21 @@ json anthropic_params_from_json( } 
oai_messages.push_back(new_msg); } + for (const auto & tool_msg : tool_results) { oai_messages.push_back(tool_msg); } - } else { - if (!converted_content.empty() || has_tool_calls) { - json new_msg = {{"role", role}}; - if (!converted_content.empty()) { - new_msg["content"] = converted_content; - } else if (has_tool_calls) { - new_msg["content"] = ""; - } - if (!tool_calls.empty()) { - new_msg["tool_calls"] = tool_calls; - } - oai_messages.push_back(new_msg); - } } } - json oai_tools = json::array(); + oai_body["messages"] = oai_messages; + + // Convert tools if (body.contains("tools")) { - json & tools = body.at("tools"); + const json & tools = body.at("tools"); if (tools.is_array()) { - for (auto & tool : tools) { + json oai_tools = json::array(); + for (const auto & tool : tools) { oai_tools.push_back({ {"type", "function"}, {"function", { @@ -1275,205 +1226,71 @@ json anthropic_params_from_json( }} }); } + oai_body["tools"] = oai_tools; } } - std::string oai_tool_choice = "auto"; + // Convert tool_choice if (body.contains("tool_choice")) { - json & tc = body.at("tool_choice"); + const json & tc = body.at("tool_choice"); if (tc.is_object()) { std::string type = json_value(tc, "type", std::string()); if (type == "auto") { - oai_tool_choice = "auto"; - } else if (type == "any") { - oai_tool_choice = "required"; - } else if (type == "tool") { - oai_tool_choice = "required"; - } - } - } - - for (auto & msg : oai_messages) { - if (!msg.contains("content")) { - continue; - } - json & content = msg.at("content"); - if (content.is_string() || content.is_null()) { - continue; - } - if (!content.is_array()) { - continue; - } - - for (auto & p : content) { - std::string type = json_value(p, "type", std::string()); - if (type == "image_url") { - if (!opt.allow_image) { - throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj"); - } - - json image_url = json_value(p, "image_url", json::object()); - std::string 
url = json_value(image_url, "url", std::string()); - if (string_starts_with(url, "http")) { - // download remote image - common_remote_params params; - params.headers.push_back("User-Agent: llama.cpp/" + build_info); - params.max_size = 1024 * 1024 * 10; // 10MB - params.timeout = 10; // seconds - SRV_INF("downloading image from '%s'\n", url.c_str()); - auto res = common_remote_get_content(url, params); - if (200 <= res.first && res.first < 300) { - SRV_INF("downloaded %ld bytes\n", res.second.size()); - raw_buffer data; - data.insert(data.end(), res.second.begin(), res.second.end()); - out_files.push_back(data); - } else { - throw std::runtime_error("Failed to download image"); - } - } else { - // try to decode base64 image - std::vector parts = string_split(url, /*separator*/ ','); - if (parts.size() != 2) { - throw std::runtime_error("Invalid image_url.url value"); - } else if (!string_starts_with(parts[0], "data:image/")) { - throw std::runtime_error("Invalid image_url.url format: " + parts[0]); - } else if (!string_ends_with(parts[0], "base64")) { - throw std::runtime_error("image_url.url must be base64 encoded"); - } else { - auto base64_data = parts[1]; - auto decoded_data = base64_decode(base64_data); - out_files.push_back(decoded_data); - } - } - - // replace this chunk with a marker - p["type"] = "text"; - p["text"] = mtmd_default_marker(); - p.erase("image_url"); - } else if (type == "input_audio") { - if (!opt.allow_audio) { - throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj"); - } - - json input_audio = json_value(p, "input_audio", json::object()); - std::string data = json_value(input_audio, "data", std::string()); - std::string format = json_value(input_audio, "format", std::string()); - if (format != "wav" && format != "mp3") { - throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'"); - } - auto decoded_data = base64_decode(data); - 
out_files.push_back(decoded_data); - - // replace this chunk with a marker - p["type"] = "text"; - p["text"] = mtmd_default_marker(); - p.erase("input_audio"); + oai_body["tool_choice"] = "auto"; + } else if (type == "any" || type == "tool") { + oai_body["tool_choice"] = "required"; } } } - common_chat_templates_inputs inputs; - inputs.messages = common_chat_msgs_parse_oaicompat(oai_messages); - inputs.tools = common_chat_tools_parse_oaicompat(oai_tools); - inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(oai_tool_choice); - inputs.json_schema = ""; - inputs.grammar = ""; - inputs.use_jinja = opt.use_jinja; - inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); - inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); - inputs.reasoning_format = opt.reasoning_format; - inputs.enable_thinking = opt.enable_thinking; - - if (opt.enable_thinking && opt.prefill_assistant) { - if (!inputs.messages.empty() && inputs.messages.back().role == "assistant") { - inputs.enable_thinking = false; - } - } - - if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) { - llama_params["parse_tool_calls"] = true; - } - - // merge the template args provided from command line with the args provided in the user request - auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object()); - inputs.chat_template_kwargs = opt.chat_template_kwargs; - for (const auto & item : chat_template_kwargs_object.items()) { - inputs.chat_template_kwargs[item.key()] = item.value().dump(); + // Convert stop_sequences to stop + if (body.contains("stop_sequences")) { + oai_body["stop"] = body.at("stop_sequences"); } - // parse the "enable_thinking" kwarg to override the default value - auto enable_thinking_kwarg = json_value(inputs.chat_template_kwargs, "enable_thinking", std::string("")); - if (enable_thinking_kwarg == "true") { - inputs.enable_thinking = true; - } else if (enable_thinking_kwarg == 
"false") { - inputs.enable_thinking = false; - } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') { - throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)"); + // Handle max_tokens (required in Anthropic, but we're permissive) + if (body.contains("max_tokens")) { + oai_body["max_tokens"] = body.at("max_tokens"); + } else { + oai_body["max_tokens"] = 4096; } - // if the assistant message appears at the end of list, we do not add end-of-turn token - bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant; - common_chat_msg last_message; - if (prefill_assistant_message) { - last_message = inputs.messages.back(); - inputs.messages.pop_back(); - - // sanity check, max one assistant message at the end of the list - if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){ - throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list."); + // Pass through common params + for (const auto & key : {"temperature", "top_p", "top_k", "stream"}) { + if (body.contains(key)) { + oai_body[key] = body.at(key); } - - inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE; - - if (inputs.enable_thinking) { - throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking."); - } - - inputs.add_generation_prompt = true; } - // Apply chat template to the list of messages - auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); - - // Append assistant prefilled message - if (prefill_assistant_message) { - if (!last_message.content_parts.empty()) { - for (auto & p : last_message.content_parts) { - chat_params.prompt += p.text; - } - } else { - chat_params.prompt += last_message.content; + // Handle Anthropic-specific thinking param + if (body.contains("thinking")) { + json thinking = json_value(body, "thinking", json::object()); + std::string thinking_type = 
json_value(thinking, "type", std::string()); + if (thinking_type == "enabled") { + int budget_tokens = json_value(thinking, "budget_tokens", 10000); + oai_body["thinking_budget_tokens"] = budget_tokens; } } - llama_params["chat_format"] = static_cast(chat_params.format); - llama_params["prompt"] = chat_params.prompt; - if (!chat_params.grammar.empty()) { - llama_params["grammar"] = chat_params.grammar; - } - llama_params["grammar_lazy"] = chat_params.grammar_lazy; - auto grammar_triggers = json::array(); - for (const auto & trigger : chat_params.grammar_triggers) { - server_grammar_trigger ct(trigger); - grammar_triggers.push_back(ct.to_json()); - } - llama_params["grammar_triggers"] = grammar_triggers; - llama_params["preserved_tokens"] = chat_params.preserved_tokens; - llama_params["thinking_forced_open"] = chat_params.thinking_forced_open; - for (const auto & stop : chat_params.additional_stops) { - llama_params["stop"].push_back(stop); - } - - // Copy remaining properties to llama_params - // This allows user to use llama.cpp-specific params like "mirostat", ... via Anthropic endpoint. 
- // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp - for (const auto & item : body.items()) { - if (!llama_params.contains(item.key())) { - llama_params[item.key()] = item.value(); + // Handle Anthropic-specific metadata param + if (body.contains("metadata")) { + json metadata = json_value(body, "metadata", json::object()); + std::string user_id = json_value(metadata, "user_id", std::string()); + if (!user_id.empty()) { + oai_body["__metadata_user_id"] = user_id; } } - return llama_params; + return oai_body; +} + +json anthropic_params_from_json( + const json & body, + const oaicompat_parser_options & opt, + std::vector & out_files) +{ + json oai_body = convert_anthropic_to_oai(body); + return oaicompat_chat_params_parse(oai_body, opt, out_files); } json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64) { diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 4b393fd8b25..c759b7145d0 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -294,6 +294,9 @@ json oaicompat_chat_params_parse( const oaicompat_parser_options & opt, std::vector & out_files); +// convert Anthropic Messages API format to OpenAI Chat Completions API format +json convert_anthropic_to_oai(const json & body); + // used by Anthropic /v1/messages endpoint json anthropic_params_from_json( const json & body, /* anthropic messages api json semantics */ From 332356490f70b330b83736c3338a7924a57c13a1 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 27 Nov 2025 23:10:37 +0100 Subject: [PATCH 7/9] make sure basic test always go first --- tools/server/tests/conftest.py | 6 ++++++ tools/server/tests/unit/test_basic.py | 6 ------ .../{test_anthropic_api.py => test_compat_anthropic.py} | 0 3 files changed, 6 insertions(+), 6 deletions(-) rename tools/server/tests/unit/{test_anthropic_api.py => test_compat_anthropic.py} (100%) diff --git 
a/tools/server/tests/conftest.py b/tools/server/tests/conftest.py index 017d1bb841e..c7ed775968b 100644 --- a/tools/server/tests/conftest.py +++ b/tools/server/tests/conftest.py @@ -13,3 +13,9 @@ def stop_server_after_each_test(): ) # copy the set to prevent 'Set changed size during iteration' for server in instances: server.stop() + + +@pytest.fixture(scope="module", autouse=True) +def do_something(): + # this will be run once per test module, before its tests + ServerPreset.load_all() diff --git a/tools/server/tests/unit/test_basic.py b/tools/server/tests/unit/test_basic.py index 720b136b051..cadaa91849f 100644 --- a/tools/server/tests/unit/test_basic.py +++ b/tools/server/tests/unit/test_basic.py @@ -5,12 +5,6 @@ server = ServerPreset.tinyllama2() -@pytest.fixture(scope="session", autouse=True) -def do_something(): - # this will be run once per test session, before any tests - ServerPreset.load_all() - - @pytest.fixture(autouse=True) def create_server(): global server diff --git a/tools/server/tests/unit/test_anthropic_api.py b/tools/server/tests/unit/test_compat_anthropic.py similarity index 100% rename from tools/server/tests/unit/test_anthropic_api.py rename to tools/server/tests/unit/test_compat_anthropic.py From b13b41fc8f07f8d045b994c5c52dd0f9aad27ca8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 27 Nov 2025 23:14:33 +0100 Subject: [PATCH 8/9] clean up --- tools/server/server-common.cpp | 61 ++++++++++++++----------------- tools/server/server-common.h | 8 +---- tools/server/server-task.cpp | 26 +++++++------- tools/server/server-task.h | 42 +++++++++++----------- tools/server/server.cpp | 65 +++++++++++++++++----------------- 5 files changed, 94 insertions(+), 108 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 2e3903b1b45..0bbc4e858f2 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -593,29 +593,6 @@ llama_tokens tokenize_mixed(const llama_vocab * vocab, const 
json & json_prompt, return prompt_tokens; } -std::string format_anthropic_sse(const json & data) { - std::ostringstream ss; - - auto send_event = [&ss](const json & event_obj) { - if (event_obj.contains("event") && event_obj.contains("data")) { - ss << "event: " << event_obj.at("event").get() << "\n"; - ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n"; - } else { - ss << "data: " << safe_json_to_str(event_obj) << "\n\n"; - } - }; - - if (data.is_array()) { - for (const auto & event : data) { - send_event(event); - } - } else { - send_event(data); - } - - return ss.str(); -} - size_t validate_utf8(const std::string& text) { size_t len = text.size(); if (len == 0) return 0; @@ -1141,11 +1118,13 @@ json convert_anthropic_to_oai(const json & body) { if (source_type == "base64") { std::string media_type = json_value(source, "media_type", std::string("image/jpeg")); std::string data = json_value(source, "data", std::string()); + std::ostringstream ss; + ss << "data:" << media_type << ";base64," << data; converted_content.push_back({ {"type", "image_url"}, {"image_url", { - {"url", "data:" + media_type + ";base64," + data} + {"url", ss.str()} }} }); } else if (source_type == "url") { @@ -1284,15 +1263,6 @@ json convert_anthropic_to_oai(const json & body) { return oai_body; } -json anthropic_params_from_json( - const json & body, - const oaicompat_parser_options & opt, - std::vector & out_files) -{ - json oai_body = convert_anthropic_to_oai(body); - return oaicompat_chat_params_parse(oai_body, opt, out_files); -} - json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64) { json data = json::array(); int32_t n_tokens = 0; @@ -1456,7 +1426,7 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l // format server-sent event (SSE), return the formatted string to send // note: if data is a json array, it will be sent as multiple events, one per item -std::string format_sse(const json 
& data) { +std::string format_oai_sse(const json & data) { std::ostringstream ss; auto send_single = [&ss](const json & data) { ss << "data: " << @@ -1475,6 +1445,29 @@ std::string format_sse(const json & data) { return ss.str(); } +std::string format_anthropic_sse(const json & data) { + std::ostringstream ss; + + auto send_event = [&ss](const json & event_obj) { + if (event_obj.contains("event") && event_obj.contains("data")) { + ss << "event: " << event_obj.at("event").get() << "\n"; + ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n"; + } else { + ss << "data: " << safe_json_to_str(event_obj) << "\n\n"; + } + }; + + if (data.is_array()) { + for (const auto & event : data) { + send_event(event); + } + } else { + send_event(data); + } + + return ss.str(); +} + bool is_valid_utf8(const std::string & str) { const unsigned char* bytes = reinterpret_cast(str.data()); const unsigned char* end = bytes + str.length(); diff --git a/tools/server/server-common.h b/tools/server/server-common.h index c759b7145d0..ab8aabbad03 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -297,12 +297,6 @@ json oaicompat_chat_params_parse( // convert Anthropic Messages API format to OpenAI Chat Completions API format json convert_anthropic_to_oai(const json & body); -// used by Anthropic /v1/messages endpoint -json anthropic_params_from_json( - const json & body, /* anthropic messages api json semantics */ - const oaicompat_parser_options & opt, - std::vector & out_files); - // TODO: move it to server-task.cpp json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false); @@ -329,7 +323,7 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l // format server-sent event (SSE), return the formatted string to send // note: if data is a json array, it will be sent as multiple events, one per item -std::string format_sse(const json & data); +std::string 
format_oai_sse(const json & data); // format Anthropic-style SSE with event types std::string format_anthropic_sse(const json & data); diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index b18239211f6..b447a1ef6da 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -565,17 +565,17 @@ std::vector completion_token_output::str_to_bytes(const std::stri // server_task_result_cmpl_final // json server_task_result_cmpl_final::to_json() { - switch (oaicompat) { - case OAICOMPAT_TYPE_NONE: + switch (res_type) { + case TASK_RESPONSE_TYPE_NONE: return to_json_non_oaicompat(); - case OAICOMPAT_TYPE_COMPLETION: + case TASK_RESPONSE_TYPE_OAI_CMPL: return to_json_oaicompat(); - case OAICOMPAT_TYPE_CHAT: + case TASK_RESPONSE_TYPE_OAI_CHAT: return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); - case OAICOMPAT_TYPE_ANTHROPIC: + case TASK_RESPONSE_TYPE_ANTHROPIC: return stream ? to_json_anthropic_stream() : to_json_anthropic(); default: - GGML_ASSERT(false && "Invalid oaicompat_type"); + GGML_ASSERT(false && "Invalid task_response_type"); } } @@ -956,17 +956,17 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() { // server_task_result_cmpl_partial // json server_task_result_cmpl_partial::to_json() { - switch (oaicompat) { - case OAICOMPAT_TYPE_NONE: + switch (res_type) { + case TASK_RESPONSE_TYPE_NONE: return to_json_non_oaicompat(); - case OAICOMPAT_TYPE_COMPLETION: + case TASK_RESPONSE_TYPE_OAI_CMPL: return to_json_oaicompat(); - case OAICOMPAT_TYPE_CHAT: + case TASK_RESPONSE_TYPE_OAI_CHAT: return to_json_oaicompat_chat(); - case OAICOMPAT_TYPE_ANTHROPIC: + case TASK_RESPONSE_TYPE_ANTHROPIC: return to_json_anthropic(); default: - GGML_ASSERT(false && "Invalid oaicompat_type"); + GGML_ASSERT(false && "Invalid task_response_type"); } } @@ -1091,7 +1091,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() { // server_task_result_embd // json server_task_result_embd::to_json() { - 
return oaicompat == OAICOMPAT_TYPE_EMBEDDING + return res_type == TASK_RESPONSE_TYPE_OAI_EMBD ? to_json_oaicompat() : to_json_non_oaicompat(); } diff --git a/tools/server/server-task.h b/tools/server/server-task.h index b96c00a96a5..a22d7cab116 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -27,12 +27,12 @@ enum server_task_type { }; // TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common -enum oaicompat_type { - OAICOMPAT_TYPE_NONE, - OAICOMPAT_TYPE_CHAT, - OAICOMPAT_TYPE_COMPLETION, - OAICOMPAT_TYPE_EMBEDDING, - OAICOMPAT_TYPE_ANTHROPIC, +enum task_response_type { + TASK_RESPONSE_TYPE_NONE, // llama.cpp native format + TASK_RESPONSE_TYPE_OAI_CHAT, + TASK_RESPONSE_TYPE_OAI_CMPL, + TASK_RESPONSE_TYPE_OAI_EMBD, + TASK_RESPONSE_TYPE_ANTHROPIC, }; enum stop_type { @@ -67,9 +67,9 @@ struct task_params { struct common_params_sampling sampling; struct common_params_speculative speculative; - // OAI-compat fields + // response formatting bool verbose = false; - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; std::string oaicompat_model; std::string oaicompat_cmpl_id; common_chat_syntax oaicompat_chat_syntax; @@ -228,12 +228,12 @@ struct server_task_result_cmpl_final : server_task_result { task_params generation_params; - // OAI-compat fields - bool verbose = false; - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - common_chat_msg oaicompat_msg; + // response formatting + bool verbose = false; + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + common_chat_msg oaicompat_msg; std::vector oaicompat_msg_diffs; @@ -275,11 +275,11 @@ struct server_task_result_cmpl_partial : server_task_result { result_timings timings; result_prompt_progress progress; - // OAI-compat fields - bool verbose = false; - 
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; + // response formatting + bool verbose = false; + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; std::vector oaicompat_msg_diffs; virtual int get_index() override { @@ -307,8 +307,8 @@ struct server_task_result_embd : server_task_result { int32_t n_tokens; - // OAI-compat fields - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; + // response formatting + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; virtual int get_index() override { return index; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index c9027bacef2..05bbe648c1d 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1255,7 +1255,7 @@ struct server_context { res->post_sampling_probs = slot.task->params.post_sampling_probs; res->verbose = slot.task->params.verbose; - res->oaicompat = slot.task->params.oaicompat; + res->res_type = slot.task->params.res_type; res->oaicompat_model = slot.task->params.oaicompat_model; res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; @@ -1297,7 +1297,7 @@ struct server_context { res->verbose = slot.task->params.verbose; res->stream = slot.task->params.stream; res->include_usage = slot.task->params.include_usage; - res->oaicompat = slot.task->params.oaicompat; + res->res_type = slot.task->params.res_type; res->oaicompat_model = slot.task->params.oaicompat_model; res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs); @@ -1328,7 +1328,7 @@ struct server_context { res->id = slot.task->id; res->index = slot.task->index; res->n_tokens = slot.task->n_tokens(); - res->oaicompat = slot.task->params.oaicompat; + res->res_type = slot.task->params.res_type; const int n_embd = llama_model_n_embd(model); @@ -2951,7 +2951,7 @@ struct server_routes { data, files, req.should_stop, - 
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible + TASK_RESPONSE_TYPE_NONE); // infill is not OAI compatible }; server_http_context::handler_t post_completions = [this](const server_http_req & req) { @@ -2962,7 +2962,7 @@ struct server_routes { body, files, req.should_stop, - OAICOMPAT_TYPE_NONE); + TASK_RESPONSE_TYPE_NONE); }; server_http_context::handler_t post_completions_oai = [this](const server_http_req & req) { @@ -2973,7 +2973,7 @@ struct server_routes { body, files, req.should_stop, - OAICOMPAT_TYPE_COMPLETION); + TASK_RESPONSE_TYPE_OAI_CMPL); }; server_http_context::handler_t post_chat_completions = [this](const server_http_req & req) { @@ -2988,13 +2988,13 @@ struct server_routes { body_parsed, files, req.should_stop, - OAICOMPAT_TYPE_CHAT); + TASK_RESPONSE_TYPE_OAI_CHAT); }; server_http_context::handler_t post_anthropic_messages = [this](const server_http_req & req) { std::vector files; - json body = json::parse(req.body); - json body_parsed = anthropic_params_from_json( + json body = convert_anthropic_to_oai(json::parse(req.body)); + json body_parsed = oaicompat_chat_params_parse( body, ctx_server.oai_parser_opt, files); @@ -3003,15 +3003,14 @@ struct server_routes { body_parsed, files, req.should_stop, - OAICOMPAT_TYPE_ANTHROPIC); + TASK_RESPONSE_TYPE_ANTHROPIC); }; server_http_context::handler_t post_anthropic_count_tokens = [this](const server_http_req & req) { auto res = std::make_unique(ctx_server); std::vector files; - json body = json::parse(req.body); - - json body_parsed = anthropic_params_from_json( + json body = convert_anthropic_to_oai(json::parse(req.body)); + json body_parsed = oaicompat_chat_params_parse( body, ctx_server.oai_parser_opt, files); @@ -3139,11 +3138,11 @@ struct server_routes { }; server_http_context::handler_t post_embeddings = [this](const server_http_req & req) { - return handle_embeddings_impl(req, OAICOMPAT_TYPE_NONE); + return handle_embeddings_impl(req, TASK_RESPONSE_TYPE_NONE); }; server_http_context::handler_t 
post_embeddings_oai = [this](const server_http_req & req) { - return handle_embeddings_impl(req, OAICOMPAT_TYPE_EMBEDDING); + return handle_embeddings_impl(req, TASK_RESPONSE_TYPE_OAI_EMBD); }; server_http_context::handler_t post_rerank = [this](const server_http_req & req) { @@ -3294,7 +3293,7 @@ struct server_routes { const json & data, const std::vector & files, const std::function & should_stop, - oaicompat_type oaicompat) { + task_response_type res_type) { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); auto res = std::make_unique(ctx_server); @@ -3311,7 +3310,7 @@ struct server_routes { // process prompt std::vector inputs; - if (oaicompat && ctx_server.mctx != nullptr) { + if (res_type != TASK_RESPONSE_TYPE_NONE && ctx_server.mctx != nullptr) { // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below. inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get(), files)); } else { @@ -3333,8 +3332,8 @@ struct server_routes { task.id_slot = json_value(data, "id_slot", -1); // OAI-compat - task.params.oaicompat = oaicompat; - task.params.oaicompat_cmpl_id = completion_id; + task.params.res_type = res_type; + task.params.oaicompat_cmpl_id = completion_id; // oaicompat_model is already populated by params_from_json_cmpl tasks.push_back(std::move(task)); @@ -3384,14 +3383,14 @@ struct server_routes { } // next responses are streamed - if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) { + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { res->data = format_anthropic_sse(first_result->to_json()); } else { - res->data = format_sse(first_result->to_json()); // to be sent immediately + res->data = format_oai_sse(first_result->to_json()); // to be sent immediately } res->status = 200; res->content_type = "text/event-stream"; - res->next = [res_this = res.get(), oaicompat, &should_stop](std::string & output) -> bool { + res->next = [res_this = res.get(), res_type, &should_stop](std::string & 
output) -> bool { if (should_stop()) { SRV_DBG("%s", "stopping streaming due to should_stop condition\n"); return false; // should_stop condition met @@ -3408,10 +3407,10 @@ struct server_routes { // check if there is more data if (!rd.has_next()) { - if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) { + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { // Anthropic doesn't send [DONE], message_stop was already sent output = ""; - } else if (oaicompat != OAICOMPAT_TYPE_NONE) { + } else if (res_type != TASK_RESPONSE_TYPE_NONE) { output = "data: [DONE]\n\n"; } else { output = ""; @@ -3430,13 +3429,13 @@ struct server_routes { // send the results json res_json = result->to_json(); if (result->is_error()) { - if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) { + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { output = format_anthropic_sse({ {"event", "error"}, {"data", res_json}, }); } else { - output = format_sse(json {{ "error", res_json }}); + output = format_oai_sse(json {{ "error", res_json }}); } SRV_DBG("%s", "error received during streaming, terminating stream\n"); return false; // terminate on error @@ -3445,10 +3444,10 @@ struct server_routes { dynamic_cast(result.get()) != nullptr || dynamic_cast(result.get()) != nullptr ); - if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) { + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { output = format_anthropic_sse(res_json); } else { - output = format_sse(res_json); + output = format_oai_sse(res_json); } } @@ -3557,14 +3556,14 @@ struct server_routes { return res; } - std::unique_ptr handle_embeddings_impl(const server_http_req & req, oaicompat_type oaicompat) { + std::unique_ptr handle_embeddings_impl(const server_http_req & req, task_response_type res_type) { auto res = std::make_unique(ctx_server); if (!ctx_server.params_base.embedding) { res->error(format_error_response("This server does not support embeddings. 
Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return res; } - if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + if (res_type != TASK_RESPONSE_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { res->error(format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); return res; } @@ -3576,7 +3575,7 @@ struct server_routes { if (body.count("input") != 0) { prompt = body.at("input"); } else if (body.contains("content")) { - oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible + res_type = TASK_RESPONSE_TYPE_NONE; // "content" field is not OAI compatible prompt = body.at("content"); } else { res->error(format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); @@ -3624,7 +3623,7 @@ struct server_routes { task.tokens = std::move(tokenized_prompts[i]); // OAI-compat - task.params.oaicompat = oaicompat; + task.params.res_type = res_type; task.params.embd_normalize = embd_normalize; tasks.push_back(std::move(task)); @@ -3649,7 +3648,7 @@ struct server_routes { } // write JSON response - json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING + json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD ? 
format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses); res->ok(root);

From 1381dedcd628beaa39aec8b095a56a178a0c92ce Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 27 Nov 2025 23:19:36 +0100
Subject: [PATCH 9/9] clean up api key check, add test

---
 tools/server/server-http.cpp             | 22 +++++++++++-----------
 tools/server/tests/unit/test_security.py | 13 +++++++++++++
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index fe532090100..622505714cf 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -137,21 +137,21 @@ bool server_http_context::init(const common_params & params) {
         }
 
         // Check for API key in the Authorization header
-        auto auth_header = req.get_header_value("Authorization");
+        std::string req_api_key = req.get_header_value("Authorization");
+        if (req_api_key.empty()) {
+            // fall back to the Anthropic-style X-Api-Key header
+            req_api_key = req.get_header_value("X-Api-Key");
+        }
+
+        // remove the "Bearer " prefix if needed
         std::string prefix = "Bearer ";
-        if (auth_header.substr(0, prefix.size()) == prefix) {
-            std::string received_api_key = auth_header.substr(prefix.size());
-            if (std::find(api_keys.begin(), api_keys.end(), received_api_key) != api_keys.end()) {
-                return true; // API key is valid
-            }
+        if (req_api_key.substr(0, prefix.size()) == prefix) {
+            req_api_key = req_api_key.substr(prefix.size());
         }
 
-        // Check for API key in the x-api-key header
-        auto x_api_key_header = req.get_header_value("X-Api-Key");
-
-        if (std::find(api_keys.begin(), api_keys.end(), x_api_key_header) != api_keys.end()) {
-            return true; // API key is valid
+        // validate the API key
+        if (std::find(api_keys.begin(), api_keys.end(), req_api_key) != api_keys.end()) {
+            return true; // API key is valid
         }
 
         // API key is invalid or not provided
diff --git a/tools/server/tests/unit/test_security.py b/tools/server/tests/unit/test_security.py
index 
0e11580553a..e160a8e6d30 100644 --- a/tools/server/tests/unit/test_security.py +++ b/tools/server/tests/unit/test_security.py @@ -49,6 +49,19 @@ def test_correct_api_key(): assert "content" in res.body +def test_correct_api_key_anthropic_header(): + global server + server.start() + res = server.make_request("POST", "/completions", data={ + "prompt": "I believe the meaning of life is", + }, headers={ + "X-Api-Key": TEST_API_KEY, + }) + assert res.status_code == 200 + assert "error" not in res.body + assert "content" in res.body + + def test_openai_library_correct_api_key(): global server server.start()