From 45b2fe194c6fd116fdaf7c0e1c677187c84aea5b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 12 Nov 2025 18:48:40 +0100
Subject: [PATCH 01/21] server: split HTTP into its own interface

---
 tools/server/server-http.h |  362 ++++
 tools/server/server.cpp    | 1348 +++++------------
 tools/server/utils.hpp     |   22 +-
 3 files changed, 531 insertions(+), 1201 deletions(-)
 create mode 100644 tools/server/server-http.h

diff --git a/tools/server/server-http.h b/tools/server/server-http.h
new file mode 100644
index 0000000000000..5d93b94b6ce95
--- /dev/null
+++ b/tools/server/server-http.h
@@ -0,0 +1,362 @@
+#pragma once
+
+#include "utils.hpp"
+#include "common.h"
+
+#include <functional>
+#include <memory>
+#include <string>
+
+// auto generated files (see README.md for details)
+#include "index.html.gz.hpp"
+#include "loading.html.hpp"
+
+// generator-like API for HTTP response generation
+struct server_http_resgen {
+    std::string content_type = "application/json; charset=utf-8";
+    int status = 200;
+    std::string data;
+
+    // if is_stream is true, next() will return true until the stream ends
+    // the data member will contain the next chunk of data to send
+    // TODO: move this to a virtual function once we have proper polymorphism support
+    std::function<bool()> next = nullptr;
+    bool is_stream() const {
+        return next != nullptr;
+    }
+
+    virtual ~server_http_resgen() = default;
+};
+
+// unique pointer, used by set_chunked_content_provider
+// we need to use unique_ptr because httplib requires the stream provider to be stored on the heap
+using server_http_resgen_ptr = std::unique_ptr<server_http_resgen>;
+
+struct server_http_request {
+    std::unordered_map<std::string, std::string> query_params;
+    json body;
+    const std::function<bool()> & should_stop;
+};
+
+struct server_http_context {
+    std::thread thread;
+    std::unique_ptr<httplib::Server> svr;
+    std::atomic<bool> is_ready = false;
+
+    std::string path_prefix;
+    std::string hostname;
+    int port;
+
+    bool init(const common_params & params);
+    bool start();
+    void stop();
+
+    using handler_t = std::function<server_http_resgen_ptr(const server_http_request &)>;
+    void get(const std::string &, handler_t);
+    void post(const std::string &, handler_t);
+};
+
+// implementation details
+
+static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
+    // skip GH copilot requests when using default port
+    if (req.path == "/v1/health") {
+        return;
+    }
+
+    // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in a try-catch
+
+    SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
+
+    SRV_DBG("request:  %s\n", req.body.c_str());
+    SRV_DBG("response: %s\n", res.body.c_str());
+}
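The structs above are the whole contract between the HTTP layer and the request handlers: a handler seeds `data` with the first (or only) chunk, and when `next` is set, the HTTP layer re-sends `data` after every successful next() call until next() returns false. As a minimal illustration (an editor's sketch, not part of the patch; the countdown_resgen name is hypothetical), a generator streaming three SSE frames looks like this:

    // hypothetical subclass of server_http_resgen, for illustration only
    struct countdown_resgen : server_http_resgen {
        int remaining = 3;
        countdown_resgen() {
            content_type = "text/event-stream";
            data = "data: 3\n\n"; // first chunk, sent before next() is ever called
            next = [this]() {
                if (--remaining <= 0) {
                    return false; // stream ends; on_complete() destroys the object
                }
                data = "data: " + std::to_string(remaining) + "\n\n";
                return true; // one more chunk to send
            };
        }
    };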
+
+bool server_http_context::init(const common_params & params) {
+    path_prefix = params.api_prefix;
+    port        = params.port;
+    hostname    = params.hostname;
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
+        svr.reset(
+            new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
+        );
+    } else {
+        LOG_INF("Running without SSL\n");
+        svr.reset(new httplib::Server());
+    }
+#else
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_ERR("Server is built without SSL support\n");
+        return false;
+    }
+    svr.reset(new httplib::Server());
+#endif
+
+    svr->set_default_headers({{"Server", "llama.cpp"}});
+    svr->set_logger(log_server_request);
+
+    svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
+        std::string message;
+        try {
+            std::rethrow_exception(ep);
+        } catch (const std::exception & e) {
+            message = e.what();
+        } catch (...) {
+            message = "Unknown Exception";
+        }
+
+        // FIXME
+        GGML_UNUSED(res);
+        GGML_UNUSED(message);
+        // try {
+        //     json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
+        //     LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
+        //     res_error(res, formatted_error);
+        // } catch (const std::exception & e) {
+        //     LOG_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str());
+        // }
+    });
+
+    svr->set_error_handler([](const httplib::Request &, httplib::Response & res) {
+        if (res.status == 404) {
+            // FIXME
+            //res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND));
+            res.set_content("404 Not Found", "text/plain");
+        }
+        // for other error codes, we skip processing here because it's already done by res_error()
+    });
+
+    // set timeouts and change hostname and port
+    svr->set_read_timeout (params.timeout_read);
+    svr->set_write_timeout(params.timeout_write);
+
+    if (params.api_keys.size() == 1) {
+        auto key = params.api_keys[0];
+        std::string substr = key.substr(std::max((int)(key.length() - 4), 0));
+        LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str());
+    } else if (params.api_keys.size() > 1) {
+        LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size());
+    }
+
+    //
+    // Middlewares
+    //
+
+    auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) {
+        static const std::unordered_set<std::string> public_endpoints = {
+            "/health",
+            "/v1/health",
+            "/models",
+            "/v1/models",
+            "/api/tags"
+        };
+
+        // If API key is not set, skip validation
+        if (api_keys.empty()) {
+            return true;
+        }
+
+        // If path is public or is static file, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") {
+            return true;
+        }
+
+        // Check for API key in the header
+        auto auth_header = req.get_header_value("Authorization");
+
+        std::string prefix = "Bearer ";
+        if (auth_header.substr(0, prefix.size()) == prefix) {
+            std::string received_api_key = auth_header.substr(prefix.size());
+            if (std::find(api_keys.begin(), api_keys.end(), received_api_key) != api_keys.end()) {
+                return true; // API key is valid
+            }
+        }
+
+        // API key is invalid or not provided
+        //res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
+        // FIXME
+        res.status = 401;
+        res.set_content("Unauthorized: Invalid API Key", "text/plain");
+
+        LOG_WRN("Unauthorized: Invalid API Key\n");
+
+        return false;
+    };
+
+    auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) {
+        bool ready = is_ready.load();
+        if (!ready) {
+            auto tmp = string_split<std::string>(req.path, '.');
+            if (req.path == "/" || tmp.back() == "html") {
+                res.set_content(reinterpret_cast<const char *>(loading_html), loading_html_len, "text/html; charset=utf-8");
+                res.status = 503;
+            } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") {
+                // allow the models endpoint to be accessed during loading
+                return true;
+            } else {
+                // FIXME
+                //res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+                res.status = 503;
+                res.set_content("503 Service Unavailable: Loading model", "text/plain");
+            }
+            return false;
+        }
+        return true;
+    };
+
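Both middlewares above follow the same implicit convention: a callable taking (request, response) that returns true to let routing continue, or false after it has written the response itself. A hypothetical third middleware would slot into the same chain (editor's illustrative sketch, not part of the patch):

    // hypothetical middleware, same shape as middleware_validate_api_key above
    auto middleware_reject_large_body = [](const httplib::Request & req, httplib::Response & res) {
        const size_t max_body = 32 * 1024 * 1024; // hypothetical limit
        if (req.body.size() > max_body) {
            res.status = 413;
            res.set_content("request body too large", "text/plain");
            return false; // stop routing, response already written
        }
        return true; // continue to the next middleware / handler
    };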
+    // register server middlewares
+    svr->set_pre_routing_handler([middleware_validate_api_key, middleware_server_state](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        // If this is an OPTIONS request, skip validation because browsers don't include the Authorization header
+        if (req.method == "OPTIONS") {
+            res.set_header("Access-Control-Allow-Credentials", "true");
+            res.set_header("Access-Control-Allow-Methods",     "GET, POST");
+            res.set_header("Access-Control-Allow-Headers",     "*");
+            res.set_content("", "text/html"); // blank response, no data
+            return httplib::Server::HandlerResponse::Handled; // skip further processing
+        }
+        if (!middleware_server_state(req, res)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        if (!middleware_validate_api_key(req, res)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        return httplib::Server::HandlerResponse::Unhandled;
+    });
+
+    int n_threads_http = params.n_threads_http;
+    if (n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http);
+    svr->new_task_queue = [n_threads_http] { return new httplib::ThreadPool(n_threads_http); };
+
+    //
+    // Web UI setup
+    //
+
+    if (!params.webui) {
+        LOG_INF("Web UI is disabled\n");
+    } else {
+        // register static assets routes
+        if (!params.public_path.empty()) {
+            // Set the base directory for serving static files
+            bool is_found = svr->set_mount_point(params.api_prefix + "/", params.public_path);
+            if (!is_found) {
+                LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
+                return false;
+            }
+        } else {
+            // using embedded static index.html
+            svr->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) {
+                if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
+                    res.set_content("Error: gzip is not supported by this browser", "text/plain");
+                } else {
+                    res.set_header("Content-Encoding", "gzip");
+                    // COEP and COOP headers, required by pyodide (python interpreter)
+                    res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                    res.set_header("Cross-Origin-Opener-Policy",   "same-origin");
+                    res.set_content(reinterpret_cast<const char *>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
+                }
+                return false;
+            });
+        }
+    }
+    return true;
+}
+
+bool server_http_context::start() {
+    // Bind and listen
+
+    bool was_bound = false;
+    bool is_sock   = false;
+    if (string_ends_with(std::string(hostname), ".sock")) {
+        is_sock = true;
+        LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
+        svr->set_address_family(AF_UNIX);
+        // bind_to_port requires a second arg, any value other than 0 should
+        // simply get ignored
+        was_bound = svr->bind_to_port(hostname, 8080);
+    } else {
+        LOG_INF("%s: binding port with default address family\n", __func__);
+        // bind HTTP listen port
+        if (port == 0) {
+            int bound_port = svr->bind_to_any_port(hostname);
+            if ((was_bound = (bound_port >= 0))) {
+                port = bound_port;
+            }
+        } else {
+            was_bound = svr->bind_to_port(hostname, port);
+        }
+    }
+
+    if (!was_bound) {
+        LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port);
+        return false;
+    }
+
+    // run the HTTP server in a thread
+    thread = std::thread([this]() { svr->listen_after_bind(); });
+    svr->wait_until_ready();
+
+    LOG_INF("%s: server is listening on %s\n", __func__,
+        is_sock ? string_format("unix://%s",    hostname.c_str()).c_str() :
+                  string_format("http://%s:%d", hostname.c_str(), port).c_str());
+    return true;
+}
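Taken together, init()/start()/stop() give the embedding application a small lifecycle to drive. A rough usage sketch (editor's illustration, not from the patch; it mirrors what main() in server.cpp does further below):

    // hypothetical driver, for illustration only
    int run_server(const common_params & params) {
        server_http_context ctx_http;
        if (!ctx_http.init(params)) {
            return 1; // bad config (e.g. SSL requested in a non-SSL build)
        }
        // routes are registered between init() and start()
        ctx_http.get("/health", [](const server_http_request &) {
            auto res = std::make_unique<server_http_resgen>();
            res->data = "{\"status\":\"ok\"}";
            return res;
        });
        if (!ctx_http.start()) {
            return 1; // bind failed
        }
        ctx_http.is_ready.store(true); // normally set only after the model has loaded
        // ... serve until a signal arrives ...
        ctx_http.stop();
        ctx_http.thread.join();
        return 0;
    }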
+
+void server_http_context::stop() {
+    if (svr) {
+        svr->stop();
+    }
+}
+
+void server_http_context::get(const std::string & path, server_http_context::handler_t handler) {
+    svr->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
+        server_http_resgen_ptr response = handler(server_http_request{
+            req.path_params,
+            json{},
+            req.is_connection_closed
+        });
+        GGML_ASSERT(!response->is_stream() && "not supported for GET method");
+        res.status = response->status;
+        res.set_content(response->data, response->content_type);
+    });
+}
+
+void server_http_context::post(const std::string & path, server_http_context::handler_t handler) {
+    svr->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
+        server_http_resgen_ptr response = handler(server_http_request{
+            req.path_params,
+            json::parse(req.body.empty() ? "{}" : req.body),
+            req.is_connection_closed
+        });
+        if (response->is_stream()) {
+            res.status = response->status;
+            std::string content_type = response->content_type;
+            // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it
+            std::shared_ptr<server_http_resgen> r_ptr = std::move(response);
+            const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool {
+                // TODO: maybe handle the case where sink.write() fails? for now, we rely on is_connection_closed()
+                sink.write(response->data.data(), response->data.size());
+                SRV_DBG("http: streamed chunk: %s\n", response->data.c_str());
+                if (!response->next()) {
+                    SRV_DBG("%s", "http: stream ended\n");
+                    sink.done();
+                    return false; // end of stream
+                }
+                return true;
+            };
+            const auto on_complete = [response = r_ptr](bool) mutable {
+                response.reset(); // trigger the destruction of the response object
+            };
+            res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete);
+        } else {
+            res.status = response->status;
+            res.set_content(response->data, response->content_type);
+        }
+    });
+}
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 0b3c77879c2e2..e4f0167fa838c 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1,5 +1,6 @@
 #include "chat.h"
 #include "utils.hpp"
+#include "server-http.h"
 
 #include "arg.h"
 #include "common.h"
@@ -10,13 +11,6 @@
 #include "speculative.h"
 #include "mtmd.h"
 
-// mime type for sending response
-#define MIMETYPE_JSON "application/json; charset=utf-8"
-
-// auto generated files (see README.md for details)
-#include "index.html.gz.hpp"
-#include "loading.html.hpp"
-
 #include <atomic>
 #include <chrono>
 #include <condition_variable>
@@ -2382,6 +2376,7 @@ struct server_context {
         llama_batch_free(batch);
     }
 
+    // load the model and initialize llama_context
     bool load_model(const common_params & params) {
         SRV_INF("loading model '%s'\n", params.model.path.c_str());
 
@@ -2500,6 +2495,7 @@ struct server_context {
         return true;
     }
 
+    // initialize slots and server-related data
     void init() {
        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
 
@@ -4319,6 +4315,7 @@ struct server_context {
     }
 };
 
+// generator-like API for server responses, supports polling connection state and aggregating results
 struct server_response_reader {
     std::unordered_set<int> id_tasks;
 
@@ -4417,591 +4414,81 @@ struct server_response_reader {
     }
 };
 
-static void log_server_request(const httplib::Request & 
req, const httplib::Response & res) { - // skip GH copilot requests when using default port - if (req.path == "/v1/health") { - return; +// generator-like API for HTTP response generation +struct server_resgen : server_http_resgen { + server_response_reader rd; + server_resgen(server_context & ctx_server_) : rd(ctx_server_) {} + void ok(const json & response_data) { + status = 200; + data = safe_json_to_str(response_data); } - - // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch - - SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); - - SRV_DBG("request: %s\n", req.body.c_str()); - SRV_DBG("response: %s\n", res.body.c_str()); -} - -static void res_error(httplib::Response & res, const json & error_data) { - json final_response {{"error", error_data}}; - res.set_content(safe_json_to_str(final_response), MIMETYPE_JSON); - res.status = json_value(error_data, "code", 500); -} - -static void res_ok(httplib::Response & res, const json & data) { - res.set_content(safe_json_to_str(data), MIMETYPE_JSON); - res.status = 200; -} - -std::function shutdown_handler; -std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; - -inline void signal_handler(int signal) { - if (is_terminating.test_and_set()) { - // in case it hangs, we can force terminate the server by hitting Ctrl+C twice - // this is for better developer experience, we can remove when the server is stable enough - fprintf(stderr, "Received second interrupt, terminating immediately.\n"); - exit(1); - } - - shutdown_handler(signal); -} - -int main(int argc, char ** argv) { - // own arguments required by this example - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { - return 1; - } - - // TODO: should we have a separate n_parallel parameter for the server? 
- // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 - // TODO: this is a common configuration that is suitable for most local use cases - // however, overriding the parameters is a bit confusing - figure out something more intuitive - if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { - LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); - - params.n_parallel = 4; - params.kv_unified = true; - } - - common_init(); - - // struct that contains llama context and inference - server_context ctx_server; - - llama_backend_init(); - llama_numa_init(params.numa); - - LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); - - std::unique_ptr svr; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); - svr.reset( - new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) - ); - } else { - LOG_INF("Running without SSL\n"); - svr.reset(new httplib::Server()); - } -#else - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_ERR("Server is built without SSL support\n"); - return 1; - } - svr.reset(new httplib::Server()); -#endif - - std::atomic state{SERVER_STATE_LOADING_MODEL}; - - svr->set_default_headers({{"Server", "llama.cpp"}}); - svr->set_logger(log_server_request); - svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { - std::string message; - try { - std::rethrow_exception(ep); - } catch (const std::exception & e) { - message = e.what(); - } catch (...) 
{ - message = "Unknown Exception"; - } - - try { - json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); - res_error(res, formatted_error); - } catch (const std::exception & e) { - LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str()); - } - }); - - svr->set_error_handler([](const httplib::Request &, httplib::Response & res) { - if (res.status == 404) { - res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND)); - } - // for other error codes, we skip processing here because it's already done by res_error() - }); - - // set timeouts and change hostname and port - svr->set_read_timeout (params.timeout_read); - svr->set_write_timeout(params.timeout_write); - - std::unordered_map log_data; - - log_data["hostname"] = params.hostname; - log_data["port"] = std::to_string(params.port); - - if (params.api_keys.size() == 1) { - auto key = params.api_keys[0]; - log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0)); - } else if (params.api_keys.size() > 1) { - log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; + void error(const json & error_data) { + status = json_value(error_data, "code", 500); + data = safe_json_to_str(error_data); } +}; - // Necessary similarity of prompt for slot selection - ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; - - // - // Middlewares - // - - auto middleware_validate_api_key = [¶ms](const httplib::Request & req, httplib::Response & res) { - static const std::unordered_set public_endpoints = { - "/health", - "/v1/health", - "/models", - "/v1/models", - "/api/tags" - }; - - // If API key is not set, skip validation - if (params.api_keys.empty()) { - return true; - } - - // If path is public or is static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { - return true; - } - - // Check for API key in the header - auto auth_header = req.get_header_value("Authorization"); - - std::string prefix = "Bearer "; - if (auth_header.substr(0, prefix.size()) == prefix) { - std::string received_api_key = auth_header.substr(prefix.size()); - if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) { - return true; // API key is valid - } - } - - // API key is invalid or not provided - res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - - LOG_WRN("Unauthorized: Invalid API Key\n"); - - return false; - }; - - auto middleware_server_state = [&state](const httplib::Request & req, httplib::Response & res) { - server_state current_state = state.load(); - if (current_state == SERVER_STATE_LOADING_MODEL) { - auto tmp = string_split(req.path, '.'); - if (req.path == "/" || tmp.back() == "html") { - res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); - res.status = 503; - } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { - // allow the models endpoint to be accessed during loading - return true; - } else { - res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); - } - return false; - } - return true; - }; - - // register server middlewares - svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) { - 
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - // If this is OPTIONS request, skip validation because browsers don't include Authorization header - if (req.method == "OPTIONS") { - res.set_header("Access-Control-Allow-Credentials", "true"); - res.set_header("Access-Control-Allow-Methods", "GET, POST"); - res.set_header("Access-Control-Allow-Headers", "*"); - res.set_content("", "text/html"); // blank response, no data - return httplib::Server::HandlerResponse::Handled; // skip further processing - } - if (!middleware_server_state(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - if (!middleware_validate_api_key(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - return httplib::Server::HandlerResponse::Unhandled; - }); +struct server_routes { + server_context & ctx_server; + server_routes(server_context & ctx_server_) : ctx_server(ctx_server_) {} - // - // Route handlers (or controllers) - // +public: + // handlers using lambda function, so that they can capture `this` without `std::bind` - const auto handle_health = [&](const httplib::Request &, httplib::Response & res) { + server_http_context::handler_t get_health = [this](const server_http_request &) { // error and loading states are handled by middleware - json health = {{"status", "ok"}}; - res_ok(res, health); - }; - - const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) { - if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - // request slots data using task queue - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_METRICS); - task.id = task_id; - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task), true); // high-priority task - } - - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - // TODO: get rid of this dynamic_cast - auto res_task = dynamic_cast(result.get()); - GGML_ASSERT(res_task != nullptr); - - // optionally return "fail_on_no_slot" error - if (req.has_param("fail_on_no_slot")) { - if (res_task->n_idle_slots == 0) { - res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); - return; - } - } - - res_ok(res, res_task->slots_data); - }; - - const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { - if (!params.endpoint_metrics) { - res_error(res, format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - // request slots data using task queue - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_METRICS); - task.id = task_id; - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task), true); // high-priority task - } - - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - // TODO: get rid of this dynamic_cast - auto res_task = dynamic_cast(result.get()); - GGML_ASSERT(res_task != nullptr); - - // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names - json all_metrics_def = json { - {"counter", {{ - {"name", "prompt_tokens_total"}, - {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) res_task->n_prompt_tokens_processed_total} - }, { - {"name", "prompt_seconds_total"}, - {"help", "Prompt process time"}, - {"value", (uint64_t) res_task->t_prompt_processing_total / 1.e3} - }, { - {"name", "tokens_predicted_total"}, - {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) res_task->n_tokens_predicted_total} - }, { - {"name", "tokens_predicted_seconds_total"}, - {"help", "Predict process time"}, - {"value", (uint64_t) res_task->t_tokens_generation_total / 1.e3} - }, { - {"name", "n_decode_total"}, - {"help", "Total number of llama_decode() calls"}, - {"value", res_task->n_decode_total} - }, { - {"name", "n_tokens_max"}, - {"help", "Largest observed n_tokens."}, - {"value", res_task->n_tokens_max} - }, { - {"name", "n_busy_slots_per_decode"}, - {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) res_task->n_busy_slots_total / std::max((float) res_task->n_decode_total, 1.f)} - }}}, - {"gauge", {{ - {"name", "prompt_tokens_seconds"}, - {"help", "Average prompt throughput in tokens/s."}, - {"value", res_task->n_prompt_tokens_processed ? 1.e3 / res_task->t_prompt_processing * res_task->n_prompt_tokens_processed : 0.} - },{ - {"name", "predicted_tokens_seconds"}, - {"help", "Average generation throughput in tokens/s."}, - {"value", res_task->n_tokens_predicted ? 
1.e3 / res_task->t_tokens_generation * res_task->n_tokens_predicted : 0.} - },{ - {"name", "requests_processing"}, - {"help", "Number of requests processing."}, - {"value", (uint64_t) res_task->n_processing_slots} - },{ - {"name", "requests_deferred"}, - {"help", "Number of requests deferred."}, - {"value", (uint64_t) res_task->n_tasks_deferred} - }}} - }; - - std::stringstream prometheus; - - for (const auto & el : all_metrics_def.items()) { - const auto & type = el.key(); - const auto & metrics_def = el.value(); - - for (const auto & metric_def : metrics_def) { - const std::string name = metric_def.at("name"); - const std::string help = metric_def.at("help"); - - auto value = json_value(metric_def, "value", 0.); - prometheus << "# HELP llamacpp:" << name << " " << help << "\n" - << "# TYPE llamacpp:" << name << " " << type << "\n" - << "llamacpp:" << name << " " << value << "\n"; - } - } - - res.set_header("Process-Start-Time-Unix", std::to_string(res_task->t_start)); - - res.set_content(prometheus.str(), "text/plain; version=0.0.4"); - res.status = 200; // HTTP OK - }; - - const auto handle_slots_save = [&ctx_server, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { - json request_data = json::parse(req.body); - std::string filename = request_data.at("filename"); - if (!fs_validate_filename(filename)) { - res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); - return; - } - std::string filepath = params.slot_save_path + filename; - - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_SAVE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - task.slot_action.filename = filename; - task.slot_action.filepath = filepath; - - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } - - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - res_ok(res, result->to_json()); - }; - - const auto handle_slots_restore = [&ctx_server, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { - json request_data = json::parse(req.body); - std::string filename = request_data.at("filename"); - if (!fs_validate_filename(filename)) { - res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); - return; - } - std::string filepath = params.slot_save_path + filename; - - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - task.slot_action.filename = filename; - task.slot_action.filepath = filepath; - - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } - - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); - }; - - const auto handle_slots_erase = [&ctx_server](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_ERASE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - - 
ctx_server.queue_results.add_waiting_task_id(task_id);
-            ctx_server.queue_tasks.post(std::move(task));
-        }
-
-        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
-        ctx_server.queue_results.remove_waiting_task_id(task_id);
-
-        if (result->is_error()) {
-            res_error(res, result->to_json());
-            return;
-        }
-
-        GGML_ASSERT(dynamic_cast<server_task_result_slot_erase *>(result.get()) != nullptr);
-        res_ok(res, result->to_json());
-    };
-
-    const auto handle_slots_action = [&params, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
-        if (params.slot_save_path.empty()) {
-            res_error(res, format_error_response("This server does not support slots action. Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED));
-            return;
-        }
-
-        std::string id_slot_str = req.path_params.at("id_slot");
-        int id_slot;
-
-        try {
-            id_slot = std::stoi(id_slot_str);
-        } catch (const std::exception &) {
-            res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST));
-            return;
-        }
-
-        std::string action = req.get_param_value("action");
-
-        if (action == "save") {
-            handle_slots_save(req, res, id_slot);
-        } else if (action == "restore") {
-            handle_slots_restore(req, res, id_slot);
-        } else if (action == "erase") {
-            handle_slots_erase(req, res, id_slot);
-        } else {
-            res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
-        }
+        auto res = std::make_unique<server_resgen>(ctx_server);
+        res->ok({{"status", "ok"}});
+        return res;
     };
 
-    const auto handle_props = [&params, &ctx_server](const httplib::Request &, httplib::Response & res) {
-        json default_generation_settings_for_props;
-
-        {
-            slot_params params;
-
-            params.sampling = ctx_server.params_base.sampling;
-
-            default_generation_settings_for_props = json {
-                {"params", params.to_json(true)},
-                {"n_ctx",  ctx_server.slots[0].n_ctx},
-            };
-        }
-
-        // this endpoint is publicly available, please only return what is safe to be exposed
-        json data = {
-            { "default_generation_settings", default_generation_settings_for_props },
-            { "total_slots",                 ctx_server.params_base.n_parallel },
-            { "model_alias",                 ctx_server.params_base.model_alias },
-            { "model_path",                  ctx_server.params_base.model.path },
-            { "modalities",                  json {
-                {"vision", ctx_server.oai_parser_opt.allow_image},
-                {"audio",  ctx_server.oai_parser_opt.allow_audio},
-            } },
-            { "endpoint_slots",              params.endpoint_slots },
-            { "endpoint_props",              params.endpoint_props },
-            { "endpoint_metrics",            params.endpoint_metrics },
-            { "webui",                       params.webui },
-            { "chat_template",               common_chat_templates_source(ctx_server.chat_templates.get()) },
-            { "bos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
-            { "eos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
-            { "build_info",                  build_info },
-        };
-        if (ctx_server.params_base.use_jinja) {
-            if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
-                data["chat_template_tool_use"] = tool_use_src;
-            }
-        }
-
-        res_ok(res, data);
+    server_http_context::handler_t post_completions = [this](const server_http_request & req) {
+        std::vector<raw_buffer> files; // dummy
+        return handle_completions_impl(
+            SERVER_TASK_TYPE_COMPLETION,
+            req.body,
+            files,
+            req.should_stop,
+            OAICOMPAT_TYPE_NONE);
     };
 
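One practical consequence of handlers taking a server_http_request instead of httplib types is that they can be exercised without any HTTP machinery, which is the point of the split. A hypothetical smoke test (editor's sketch, not part of the patch; `no_stop` stands in for httplib's is_connection_closed callback):

    // hypothetical test helper, for illustration only
    void smoke_test(server_routes & routes) {
        std::function<bool()> no_stop = []() { return false; };
        server_http_request req = {
            /*query_params=*/{},
            /*body=*/json{{"prompt", "Hello"}, {"n_predict", 4}},
            /*should_stop=*/no_stop,
        };
        server_http_resgen_ptr res = routes.post_completions(req);
        printf("status=%d body=%s\n", res->status, res->data.c_str());
    }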
format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - json data = json::parse(req.body); - - // update any props here - - res_ok(res, {{ "success", true }}); + server_http_context::handler_t post_completions_oai = [this](const server_http_request & req) { + std::vector files; // dummy + return handle_completions_impl( + SERVER_TASK_TYPE_COMPLETION, + req.body, + files, + req.should_stop, + OAICOMPAT_TYPE_COMPLETION); }; - const auto handle_api_show = [&ctx_server](const httplib::Request &, httplib::Response & res) { - bool has_mtmd = ctx_server.mctx != nullptr; - json data = { - { - "template", common_chat_templates_source(ctx_server.chat_templates.get()), - }, - { - "model_info", { - { "llama.context_length", ctx_server.slots.back().n_ctx, }, - } - }, - {"modelfile", ""}, - {"parameters", ""}, - {"template", common_chat_templates_source(ctx_server.chat_templates.get())}, - {"details", { - {"parent_model", ""}, - {"format", "gguf"}, - {"family", ""}, - {"families", {""}}, - {"parameter_size", ""}, - {"quantization_level", ""} - }}, - {"model_info", ""}, - {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})} - }; - - res_ok(res, data); + server_http_context::handler_t post_chat_completions = [this](const server_http_request & req) { + std::vector files; + json body_cpy = req.body; + json data = oaicompat_chat_params_parse( + body_cpy, + ctx_server.oai_parser_opt, + files); + return handle_completions_impl( + SERVER_TASK_TYPE_COMPLETION, + body_cpy, + files, + req.should_stop, + OAICOMPAT_TYPE_CHAT); }; - // handle completion-like requests (completion, chat, infill) - // we can optionally provide a custom format for partial results and final results - const auto handle_completions_impl = [&ctx_server]( - server_task_type type, - json & data, - const std::vector & files, - const std::function & is_connection_closed, - httplib::Response & res, - oaicompat_type oaicompat) -> void { +private: + server_http_resgen_ptr handle_completions_impl( + server_task_type type, + const json & data, + const std::vector & files, + const std::function & should_stop, + oaicompat_type oaicompat) { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); + auto res = std::make_unique(ctx_server); auto completion_id = gen_chatcmplid(); - // need to store the reader as a pointer, so that it won't be destroyed when the handle returns - // use shared_ptr as it's shared between the chunked_content_provider() and on_complete() - const auto rd = std::make_shared(ctx_server); + auto & rd = res->rd; try { std::vector tasks; @@ -5042,22 +4529,22 @@ int main(int argc, char ** argv) { tasks.push_back(std::move(task)); } - rd->post_tasks(std::move(tasks)); + rd.post_tasks(std::move(tasks)); } catch (const std::exception & e) { - res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); - return; + res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); + return res; } bool stream = json_value(data, "stream", false); if (!stream) { // non-stream, wait for the results - auto all_results = rd->wait_for_all(is_connection_closed); + auto all_results = rd.wait_for_all(should_stop); if (all_results.is_terminated) { - return; // connection is closed + return res; // connection is closed } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; + res->error(all_results.error->to_json()); + return res; } else { 
        bool stream = json_value(data, "stream", false);
 
         if (!stream) {
             // non-stream, wait for the results
-            auto all_results = rd->wait_for_all(is_connection_closed);
+            auto all_results = rd.wait_for_all(should_stop);
             if (all_results.is_terminated) {
-                return; // connection is closed
+                return res; // connection is closed
             } else if (all_results.error) {
-                res_error(res, all_results.error->to_json());
-                return;
+                res->error(all_results.error->to_json());
+                return res;
             } else {
                 json arr = json::array();
                 for (auto & res : all_results.results) {
@@ -5065,19 +4552,19 @@ int main(int argc, char ** argv) {
                     arr.push_back(res->to_json());
                 }
                 // if single request, return single object instead of array
-                res_ok(res, arr.size() == 1 ? arr[0] : arr);
+                res->ok(arr.size() == 1 ? arr[0] : arr);
             }
         } else {
             // in streaming mode, the first error must be treated as non-stream response
             // this is to match the OAI API behavior
             // ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309
-            server_task_result_ptr first_result = rd->next(is_connection_closed);
+            server_task_result_ptr first_result = rd.next(should_stop);
             if (first_result == nullptr) {
-                return; // connection is closed
+                return res; // connection is closed
             } else if (first_result->is_error()) {
-                res_error(res, first_result->to_json());
-                return;
+                res->error(first_result->to_json());
+                return res;
             } else {
                 GGML_ASSERT(
                     dynamic_cast<server_task_result_cmpl_partial *>(first_result.get()) != nullptr
@@ -5086,656 +4573,138 @@ int main(int argc, char ** argv) {
                 );
             }
 
             // next responses are streamed
-            json first_result_json = first_result->to_json();
-            const auto chunked_content_provider = [first_result_json, rd, oaicompat](size_t, httplib::DataSink & sink) mutable -> bool {
-                // flush the first result as it's not an error
-                if (!first_result_json.empty()) {
-                    if (!server_sent_event(sink, first_result_json)) {
-                        sink.done();
-                        return false; // sending failed, go to on_complete()
-                    }
-                    first_result_json.clear(); // mark as sent
+            res->ok(first_result->to_json());
+            res->content_type = "text/event-stream";
+            res->next = [res_this = res.get(), oaicompat, &should_stop]() mutable -> bool {
+                if (should_stop()) {
+                    SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
+                    return false; // should_stop condition met
                 }
+                server_response_reader & rd = res_this->rd;
+                std::string & output = res_this->data;
+                output.clear();
 
                 // receive subsequent results
-                auto result = rd->next([&sink]{ return !sink.is_writable(); });
+                auto result = rd.next(should_stop);
                 if (result == nullptr) {
-                    sink.done();
-                    return false; // connection is closed, go to on_complete()
+                    SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
+                    return false; // should_stop condition met
                 }
 
                 // send the results
                 json res_json = result->to_json();
-                bool ok = false;
                 if (result->is_error()) {
-                    ok = server_sent_event(sink, json {{ "error", result->to_json() }});
-                    sink.done();
-                    return false; // go to on_complete()
+                    output += format_sse(res_json);
+                    SRV_DBG("%s", "error received during streaming, terminating stream\n");
+                    return false; // terminate on error
                 } else {
                     GGML_ASSERT(
                         dynamic_cast<server_task_result_cmpl_partial *>(result.get()) != nullptr
                         || dynamic_cast<server_task_result_cmpl_final *>(result.get()) != nullptr
                     );
-                    ok = server_sent_event(sink, res_json);
-                }
-
-                if (!ok) {
-                    sink.done();
-                    return false; // sending failed, go to on_complete()
+                    output += format_sse(res_json);
                 }
 
                 // check if there is more data
-                if (!rd->has_next()) {
+                if (!rd.has_next()) {
                     if (oaicompat != OAICOMPAT_TYPE_NONE) {
-                        static const std::string ev_done = "data: [DONE]\n\n";
-                        sink.write(ev_done.data(), ev_done.size());
+                        output += "data: [DONE]\n\n";
                     }
-                    sink.done();
-                    return false; // no more data, go to on_complete()
+                    SRV_DBG("%s", "all results received, terminating stream\n");
+                    return false; // no more data, terminate
                 }
 
                 // has next data, continue
                 return true;
             };
-
-            auto on_complete = [rd](bool) {
-                rd->stop();
-            };
-
-            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
         }
-    };
-
-    const auto 
handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - json data = json::parse(req.body); - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_NONE); - }; - - const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - json data = oaicompat_completion_params_parse(json::parse(req.body)); - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_COMPLETION); - }; - - const auto handle_infill = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - // check model compatibility - std::string err; - if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "prefix token is missing. "; - } - if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "suffix token is missing. "; - } - if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "middle token is missing. "; - } - if (!err.empty()) { - res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - json data = json::parse(req.body); - - // validate input - if (data.contains("prompt") && !data.at("prompt").is_string()) { - // prompt is optional - res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); - } - - if (!data.contains("input_prefix")) { - res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); - } - - if (!data.contains("input_suffix")) { - res_error(res, format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); - } - - if (data.contains("input_extra") && !data.at("input_extra").is_array()) { - // input_extra is optional - res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - json input_extra = json_value(data, "input_extra", json::array()); - for (const auto & chunk : input_extra) { - // { "text": string, "filename": string } - if (!chunk.contains("text") || !chunk.at("text").is_string()) { - res_error(res, format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); - return; - } - // filename is optional - if (chunk.contains("filename") && !chunk.at("filename").is_string()) { - res_error(res, format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - data["input_extra"] = input_extra; // default to empty array if it's not exist - - std::string prompt = json_value(data, "prompt", std::string()); - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true); - SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); - data["prompt"] = format_infill( - ctx_server.vocab, - data.at("input_prefix"), - data.at("input_suffix"), - data.at("input_extra"), - ctx_server.params_base.n_batch, - ctx_server.params_base.n_predict, - ctx_server.slots[0].n_ctx, // TODO: there should be a better way - ctx_server.params_base.spm_infill, - tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal. 
- ); - - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_INFILL, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_NONE); // infill is not OAI compatible - }; - - const auto handle_chat_completions = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - LOG_DBG("request: %s\n", req.body.c_str()); - - auto body = json::parse(req.body); - std::vector files; - json data = oaicompat_chat_params_parse( - body, - ctx_server.oai_parser_opt, - files); - - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_CHAT); - }; - - // same with handle_chat_completions, but without inference part - const auto handle_apply_template = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - auto body = json::parse(req.body); - std::vector files; // dummy, unused - json data = oaicompat_chat_params_parse( - body, - ctx_server.oai_parser_opt, - files); - res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); - }; - - const auto handle_models = [¶ms, &ctx_server, &state](const httplib::Request &, httplib::Response & res) { - server_state current_state = state.load(); - json model_meta = nullptr; - if (current_state == SERVER_STATE_READY) { - model_meta = ctx_server.model_meta(); } - bool has_mtmd = ctx_server.mctx != nullptr; - json models = { - {"models", { - { - {"name", params.model_alias.empty() ? params.model.path : params.model_alias}, - {"model", params.model_alias.empty() ? params.model.path : params.model_alias}, - {"modified_at", ""}, - {"size", ""}, - {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash - {"type", "model"}, - {"description", ""}, - {"tags", {""}}, - {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})}, - {"parameters", ""}, - {"details", { - {"parent_model", ""}, - {"format", "gguf"}, - {"family", ""}, - {"families", {""}}, - {"parameter_size", ""}, - {"quantization_level", ""} - }} - } - }}, - {"object", "list"}, - {"data", { - { - {"id", params.model_alias.empty() ? 
params.model.path : params.model_alias}, - {"object", "model"}, - {"created", std::time(0)}, - {"owned_by", "llamacpp"}, - {"meta", model_meta}, - }, - }} - }; - - res_ok(res, models); - }; - const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - - json tokens_response = json::array(); - if (body.count("content") != 0) { - const bool add_special = json_value(body, "add_special", false); - const bool parse_special = json_value(body, "parse_special", true); - const bool with_pieces = json_value(body, "with_pieces", false); - - llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special); - - if (with_pieces) { - for (const auto& token : tokens) { - std::string piece = common_token_to_piece(ctx_server.ctx, token); - json piece_json; - - // Check if the piece is valid UTF-8 - if (is_valid_utf8(piece)) { - piece_json = piece; - } else { - // If not valid UTF-8, store as array of byte values - piece_json = json::array(); - for (unsigned char c : piece) { - piece_json.push_back(static_cast(c)); - } - } - - tokens_response.push_back({ - {"id", token}, - {"piece", piece_json} - }); - } - } else { - tokens_response = tokens; - } - } - - const json data = format_tokenizer_response(tokens_response); - res_ok(res, data); - }; - - const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - - std::string content; - if (body.count("tokens") != 0) { - const llama_tokens tokens = body.at("tokens"); - content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend()); - } - - const json data = format_detokenized_response(content); - res_ok(res, data); - }; - - const auto handle_embeddings_impl = [&ctx_server](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) { - if (!ctx_server.params_base.embedding) { - res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { - res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - const json body = json::parse(req.body); - - // for the shape of input/content, see tokenize_input_prompts() - json prompt; - if (body.count("input") != 0) { - prompt = body.at("input"); - } else if (body.contains("content")) { - oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible - prompt = body.at("content"); - } else { - res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - bool use_base64 = false; - if (body.count("encoding_format") != 0) { - const std::string& format = body.at("encoding_format"); - if (format == "base64") { - use_base64 = true; - } else if (format != "float") { - res_error(res, format_error_response("The format to return the embeddings in. 
Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - - auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); - for (const auto & tokens : tokenized_prompts) { - // this check is necessary for models that do not add BOS token to the input - if (tokens.empty()) { - res_error(res, format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - - int embd_normalize = 2; // default to Euclidean/L2 norm - if (body.count("embd_normalize") != 0) { - embd_normalize = body.at("embd_normalize"); - if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { - SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx)); - } - } - - // create and queue the task - json responses = json::array(); - server_response_reader rd(ctx_server); - { - std::vector tasks; - for (size_t i = 0; i < tokenized_prompts.size(); i++) { - server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); - - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - task.tokens = std::move(tokenized_prompts[i]); - - // OAI-compat - task.params.oaicompat = oaicompat; - task.params.embd_normalize = embd_normalize; - - tasks.push_back(std::move(task)); - } - rd.post_tasks(std::move(tasks)); - } - - // wait for the results - auto all_results = rd.wait_for_all(req.is_connection_closed); - - // collect results - if (all_results.is_terminated) { - return; // connection is closed - } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; - } else { - for (auto & res : all_results.results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); - } - } - - // write JSON response - json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING - ? format_embeddings_response_oaicompat(body, responses, use_base64) - : json(responses); - res_ok(res, root); - }; - - const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE); - }; - - const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING); - }; - - const auto handle_rerank = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) { - res_error(res, format_error_response("This server does not support reranking. 
Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - const json body = json::parse(req.body); - - // if true, use TEI API format, otherwise use Jina API format - // Jina: https://jina.ai/reranker/ - // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank - bool is_tei_format = body.contains("texts"); - - json query; - if (body.count("query") == 1) { - query = body.at("query"); - if (!query.is_string()) { - res_error(res, format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } else { - res_error(res, format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - return; - } + return res; + } +}; - std::vector documents = json_value(body, "documents", - json_value(body, "texts", std::vector())); - if (documents.empty()) { - res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); - return; - } +std::function shutdown_handler; +std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; - int top_n = json_value(body, "top_n", (int)documents.size()); +inline void signal_handler(int signal) { + if (is_terminating.test_and_set()) { + // in case it hangs, we can force terminate the server by hitting Ctrl+C twice + // this is for better developer experience, we can remove when the server is stable enough + fprintf(stderr, "Received second interrupt, terminating immediately.\n"); + exit(1); + } - // create and queue the task - json responses = json::array(); - server_response_reader rd(ctx_server); - { - std::vector tasks; - tasks.reserve(documents.size()); - for (size_t i = 0; i < documents.size(); i++) { - auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); - server_task task = server_task(SERVER_TASK_TYPE_RERANK); - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - task.tokens = std::move(tmp); - tasks.push_back(std::move(task)); - } - rd.post_tasks(std::move(tasks)); - } + shutdown_handler(signal); +} - // wait for the results - auto all_results = rd.wait_for_all(req.is_connection_closed); +int main(int argc, char ** argv) { + // own arguments required by this example + common_params params; - // collect results - if (all_results.is_terminated) { - return; // connection is closed - } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; - } else { - for (auto & res : all_results.results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); - } - } + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { + return 1; + } - // write JSON response - json root = format_response_rerank( - body, - responses, - is_tei_format, - documents, - top_n); + // TODO: should we have a separate n_parallel parameter for the server? 
+ // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 + // TODO: this is a common configuration that is suitable for most local use cases + // however, overriding the parameters is a bit confusing - figure out something more intuitive + if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { + LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); - res_ok(res, root); - }; + params.n_parallel = 4; + params.kv_unified = true; + } - const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { - json result = json::array(); - const auto & loras = ctx_server.params_base.lora_adapters; - for (size_t i = 0; i < loras.size(); ++i) { - auto & lora = loras[i]; - json entry = { - {"id", i}, - {"path", lora.path}, - {"scale", lora.scale}, - {"task_name", lora.task_name}, - {"prompt_prefix", lora.prompt_prefix}, - }; - std::string alora_invocation_string = ""; - const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr); - std::vector alora_invocation_tokens; - if (n_alora_tokens) { - const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr); - for (uint64_t i = 0; i < n_alora_tokens; ++i) { - alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[i]); - alora_invocation_tokens.push_back(alora_tokens[i]); - } - entry["alora_invocation_string"] = alora_invocation_string; - entry["alora_invocation_tokens"] = alora_invocation_tokens; - } - result.push_back(std::move(entry)); - } - res_ok(res, result); - res.status = 200; // HTTP OK - }; + common_init(); - const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - if (!body.is_array()) { - res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); - return; - } + // struct that contains llama context and inference + server_context ctx_server; - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SET_LORA); - task.id = task_id; - task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } + // Necessary similarity of prompt for slot selection + ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); + llama_backend_init(); + llama_numa_init(params.numa); - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } + LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); - }; + server_http_context ctx_http; + if (!ctx_http.init(params)) { + LOG_ERR("%s: failed to initialize HTTP server\n", __func__); + return 1; + } // // Router // - if (!params.webui) { - LOG_INF("Web UI is disabled\n"); - } else { - // register static assets routes - if (!params.public_path.empty()) { - // Set the base directory for serving static files - bool is_found = 
svr->set_mount_point(params.api_prefix + "/", params.public_path); - if (!is_found) { - LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); - return 1; - } - } else { - // using embedded static index.html - svr->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) { - if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { - res.set_content("Error: gzip is not supported by this browser", "text/plain"); - } else { - res.set_header("Content-Encoding", "gzip"); - // COEP and COOP headers, required by pyodide (python interpreter) - res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); - res.set_header("Cross-Origin-Opener-Policy", "same-origin"); - res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); - } - return false; - }); - } - } - // register API routes - svr->Get (params.api_prefix + "/health", handle_health); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/v1/health", handle_health); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/metrics", handle_metrics); - svr->Get (params.api_prefix + "/props", handle_props); - svr->Post(params.api_prefix + "/props", handle_props_change); - svr->Post(params.api_prefix + "/api/show", handle_api_show); - svr->Get (params.api_prefix + "/models", handle_models); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/v1/models", handle_models); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/api/tags", handle_models); // ollama specific endpoint. public endpoint (no API key check) - svr->Post(params.api_prefix + "/completion", handle_completions); // legacy - svr->Post(params.api_prefix + "/completions", handle_completions); - svr->Post(params.api_prefix + "/v1/completions", handle_completions_oai); - svr->Post(params.api_prefix + "/chat/completions", handle_chat_completions); - svr->Post(params.api_prefix + "/v1/chat/completions", handle_chat_completions); - svr->Post(params.api_prefix + "/api/chat", handle_chat_completions); // ollama specific endpoint - svr->Post(params.api_prefix + "/infill", handle_infill); - svr->Post(params.api_prefix + "/embedding", handle_embeddings); // legacy - svr->Post(params.api_prefix + "/embeddings", handle_embeddings); - svr->Post(params.api_prefix + "/v1/embeddings", handle_embeddings_oai); - svr->Post(params.api_prefix + "/rerank", handle_rerank); - svr->Post(params.api_prefix + "/reranking", handle_rerank); - svr->Post(params.api_prefix + "/v1/rerank", handle_rerank); - svr->Post(params.api_prefix + "/v1/reranking", handle_rerank); - svr->Post(params.api_prefix + "/tokenize", handle_tokenize); - svr->Post(params.api_prefix + "/detokenize", handle_detokenize); - svr->Post(params.api_prefix + "/apply-template", handle_apply_template); - // LoRA adapters hotswap - svr->Get (params.api_prefix + "/lora-adapters", handle_lora_adapters_list); - svr->Post(params.api_prefix + "/lora-adapters", handle_lora_adapters_apply); - // Save & load slots - svr->Get (params.api_prefix + "/slots", handle_slots); - svr->Post(params.api_prefix + "/slots/:id_slot", handle_slots_action); + server_routes routes(ctx_server); + ctx_http.get ("/health", routes.get_health); + ctx_http.post("/completion", routes.post_completions); // legacy + ctx_http.post("/completions", routes.post_completions); + ctx_http.post("/v1/completions", routes.post_completions_oai); + ctx_http.post("/chat/completions", 
routes.post_chat_completions); + ctx_http.post("/v1/chat/completions", routes.post_chat_completions); + ctx_http.post("/api/chat", routes.post_chat_completions); // ollama specific endpoint // // Start the server // - if (params.n_threads_http < 1) { - // +2 threads for monitoring endpoints - params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); - } - log_data["n_threads_http"] = std::to_string(params.n_threads_http); - svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); }; - // clean up function, to be called before exit - auto clean_up = [&svr, &ctx_server]() { + // setup clean up function, to be called before exit + auto clean_up = [&ctx_http, &ctx_server]() { SRV_INF("%s: cleaning up before exit...\n", __func__); - svr->stop(); + ctx_http.stop(); ctx_server.queue_results.terminate(); llama_backend_free(); }; - bool was_bound = false; - bool is_sock = false; - if (string_ends_with(std::string(params.hostname), ".sock")) { - is_sock = true; - LOG_INF("%s: setting address family to AF_UNIX\n", __func__); - svr->set_address_family(AF_UNIX); - // bind_to_port requires a second arg, any value other than 0 should - // simply get ignored - was_bound = svr->bind_to_port(params.hostname, 8080); - } else { - LOG_INF("%s: binding port with default address family\n", __func__); - // bind HTTP listen port - if (params.port == 0) { - int bound_port = svr->bind_to_any_port(params.hostname); - if ((was_bound = (bound_port >= 0))) { - params.port = bound_port; - } - } else { - was_bound = svr->bind_to_port(params.hostname, params.port); - } - } - - if (!was_bound) { - LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port); - clean_up(); - return 1; - } - - // run the HTTP server in a thread - std::thread t([&]() { svr->listen_after_bind(); }); - svr->wait_until_ready(); - LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); // load the model @@ -5743,13 +4712,13 @@ int main(int argc, char ** argv) { if (!ctx_server.load_model(params)) { clean_up(); - t.join(); + ctx_http.thread.join(); LOG_ERR("%s: exiting due to model loading error\n", __func__); return 1; } ctx_server.init(); - state.store(SERVER_STATE_READY); + ctx_http.is_ready.store(true); LOG_INF("%s: model loaded\n", __func__); @@ -5785,15 +4754,18 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__, - is_sock ? 
string_format("unix://%s", params.hostname.c_str()).c_str() : - string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str()); + if (!ctx_http.start()) { + clean_up(); + LOG_ERR("%s: exiting due to HTTP server error\n", __func__); + return 1; + } + LOG_INF("%s: starting the main loop...\n", __func__); // this call blocks the main thread until queue_tasks.terminate() is called ctx_server.queue_tasks.start_loop(); clean_up(); - t.join(); + ctx_http.thread.join(); llama_memory_breakdown_print(ctx_server.ctx); return 0; diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index b1ecc5af5ed0a..2b20cf0b87c99 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -453,29 +453,25 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx, return out; } +// format server-sent event (SSE), return the formatted string to send // note: if data is a json array, it will be sent as multiple events, one per item -static bool server_sent_event(httplib::DataSink & sink, const json & data) { - static auto send_single = [](httplib::DataSink & sink, const json & data) -> bool { - const std::string str = - "data: " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + +static std::string format_sse(const json & data) { + std::ostringstream ss; + auto send_single = [&ss](const json & data) { + ss << "data: " << + data.dump(-1, ' ', false, json::error_handler_t::replace) << "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). - - LOG_DBG("data stream, to_send: %s", str.c_str()); - return sink.write(str.c_str(), str.size()); }; if (data.is_array()) { for (const auto & item : data) { - if (!send_single(sink, item)) { - return false; - } + send_single(item); } } else { - return send_single(sink, data); + send_single(data); } - return true; + return ss.str(); } // From fe98058fd88caf0d0d3c627622ae4e5567689c7c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 21:00:15 +0100 Subject: [PATCH 02/21] move server-http and httplib to its own file --- tools/server/CMakeLists.txt | 2 + tools/server/server-http.cpp | 354 +++++++++++++++++++++++++++++++++++ tools/server/server-http.h | 335 +++------------------------------ tools/server/utils.hpp | 5 +- 4 files changed, 380 insertions(+), 316 deletions(-) create mode 100644 tools/server/server-http.cpp diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index c801e84c3d415..1fccfdd17f138 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -14,6 +14,8 @@ endif() set(TARGET_SRCS server.cpp utils.hpp + server-http.cpp + server-http.h ) set(PUBLIC_ASSETS index.html.gz diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp new file mode 100644 index 0000000000000..0324e8d69f96c --- /dev/null +++ b/tools/server/server-http.cpp @@ -0,0 +1,354 @@ +#include "utils.hpp" +#include "common.h" +#include "server-http.h" + +#include + +#include +#include +#include + +// auto generated files (see README.md for details) +#include "index.html.gz.hpp" +#include "loading.html.hpp" + +// +// HTTP implementation using cpp-httplib +// + +class server_http_context::Impl { +public: + std::unique_ptr srv; +}; + +server_http_context::server_http_context() + : pimpl(std::make_unique()) +{} + +server_http_context::~server_http_context() = default; + +static void log_server_request(const httplib::Request & req, const httplib::Response & res) { + // skip GH copilot requests when using default port + if 
(req.path == "/v1/health") {
+        return;
+    }
+
+    // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
+
+    SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
+
+    SRV_DBG("request:  %s\n", req.body.c_str());
+    SRV_DBG("response: %s\n", res.body.c_str());
+}
+
+bool server_http_context::init(const common_params & params) {
+    path_prefix = params.api_prefix;
+    port        = params.port;
+    hostname    = params.hostname;
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
+        pimpl->srv.reset(
+            new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
+        );
+    } else {
+        LOG_INF("Running without SSL\n");
+        pimpl->srv.reset(new httplib::Server());
+    }
+#else
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_ERR("Server is built without SSL support\n");
+        return false;
+    }
+    pimpl->srv.reset(new httplib::Server());
+#endif
+
+    auto & srv = pimpl->srv;
+    srv->set_default_headers({{"Server", "llama.cpp"}});
+    srv->set_logger(log_server_request);
+    srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
+        std::string message;
+        try {
+            std::rethrow_exception(ep);
+        } catch (const std::exception & e) {
+            message = e.what();
+        } catch (...) {
+            message = "Unknown Exception";
+        }
+
+        // FIXME
+        GGML_UNUSED(res);
+        GGML_UNUSED(message);
+        // try {
+        //     json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
+        //     LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
+        //     res_error(res, formatted_error);
+        // } catch (const std::exception & e) {
+        //     LOG_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str());
+        // }
+    });
+
+    srv->set_error_handler([](const httplib::Request &, httplib::Response & res) {
+        if (res.status == 404) {
+            // FIXME
+            //res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND));
+            res.set_content("404 Not Found", "text/plain");
+        }
+        // for other error codes, we skip processing here because it's already done by res_error()
+    });
+
+    // set timeouts and change hostname and port
+    srv->set_read_timeout (params.timeout_read);
+    srv->set_write_timeout(params.timeout_write);
+
+    if (params.api_keys.size() == 1) {
+        auto key = params.api_keys[0];
+        std::string substr = key.substr(std::max((int)(key.length() - 4), 0));
+        LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str());
+    } else if (params.api_keys.size() > 1) {
+        LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size());
+    }
+
+    //
+    // Middlewares
+    //
+
+    auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) {
+        static const std::unordered_set public_endpoints = {
+            "/health",
+            "/v1/health",
+            "/models",
+            "/v1/models",
+            "/api/tags"
+        };
+
+        // If API key is not set, skip validation
+        if (api_keys.empty()) {
+            return true;
+        }
+
+        // If path is public or is static file, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") {
+            return true;
+        }
+
+        // Check for API key in the header
+        auto auth_header = req.get_header_value("Authorization");
+
+        std::string prefix = "Bearer ";
+        if (auth_header.substr(0, prefix.size()) == prefix) {
+            std::string received_api_key = auth_header.substr(prefix.size());
+            if (std::find(api_keys.begin(), api_keys.end(), received_api_key) != api_keys.end()) {
+                return true; // API key is valid
+            }
+        }
+
+        // API key is invalid or not provided
+        //res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
+        // FIXME
+        res.status = 401;
+        res.set_content("Unauthorized: Invalid API Key", "text/plain");
+
+        LOG_WRN("Unauthorized: Invalid API Key\n");
+
+        return false;
+    };
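Editor's note: the bearer check above is easy to reason about in isolation. Below is a minimal standalone sketch of the same logic; `check_bearer` and the sample key are illustrative and not part of this patch, and the real middleware additionally whitelists the public endpoints listed above.

```cpp
// Sketch of the Bearer-token check used by middleware_validate_api_key.
// check_bearer() is a hypothetical helper, not part of the patch.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

static bool check_bearer(const std::vector<std::string> & api_keys, const std::string & auth_header) {
    if (api_keys.empty()) {
        return true; // no keys configured -> validation disabled
    }
    const std::string prefix = "Bearer ";
    if (auth_header.substr(0, prefix.size()) != prefix) {
        return false; // missing or malformed Authorization header
    }
    const std::string received = auth_header.substr(prefix.size());
    return std::find(api_keys.begin(), api_keys.end(), received) != api_keys.end();
}

int main() {
    const std::vector<std::string> keys = { "secret-key" };
    std::cout << std::boolalpha
              << check_bearer(keys, "Bearer secret-key") << "\n"  // true  (accepted)
              << check_bearer(keys, "Bearer wrong-key")  << "\n"  // false (rejected)
              << check_bearer(keys, "")                  << "\n"; // false (rejected)
}
```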
+
+    auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) {
+        bool ready = is_ready.load();
+        if (!ready) {
+            auto tmp = string_split(req.path, '.');
+            if (req.path == "/" || tmp.back() == "html") {
+                res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8");
+                res.status = 503;
+            } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") {
+                // allow the models endpoint to be accessed during loading
+                return true;
+            } else {
+                // FIXME
+                //res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+                res.status = 503;
+                res.set_content("503 Service Unavailable: Loading model", "text/plain");
+            }
+            return false;
+        }
+        return true;
+    };
+
+    // register server middlewares
+    srv->set_pre_routing_handler([middleware_validate_api_key, middleware_server_state](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        // If this is OPTIONS request, skip validation because browsers don't include Authorization header
+        if (req.method == "OPTIONS") {
+            res.set_header("Access-Control-Allow-Credentials", "true");
+            res.set_header("Access-Control-Allow-Methods",     "GET, POST");
+            res.set_header("Access-Control-Allow-Headers",     "*");
+            res.set_content("", "text/html"); // blank response, no data
+            return httplib::Server::HandlerResponse::Handled; // skip further processing
+        }
+        if (!middleware_server_state(req, res)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        if (!middleware_validate_api_key(req, res)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        return httplib::Server::HandlerResponse::Unhandled;
+    });
+
+    int n_threads_http = params.n_threads_http;
+    if (n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http);
+    srv->new_task_queue = [n_threads_http] { return new httplib::ThreadPool(n_threads_http); };
+
+    //
+    // Web UI setup
+    //
+
+    if (!params.webui) {
+        LOG_INF("Web UI is disabled\n");
+    } else {
+        // register static assets routes
+        if (!params.public_path.empty()) {
+            // Set the base directory for serving static files
+            bool is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path);
+            if (!is_found) {
+                LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
+                return false;
+            }
+        } else {
+            // using embedded static index.html
+            srv->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) {
+                if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
+                    res.set_content("Error: gzip is not supported by this browser", "text/plain");
+                } else {
+                    res.set_header("Content-Encoding", "gzip");
+                    // COEP and COOP headers, required by pyodide (python interpreter)
res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); + res.set_header("Cross-Origin-Opener-Policy", "same-origin"); + res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); + } + return false; + }); + } + } + return true; +} + +bool server_http_context::start() { + // Bind and listen + + auto & srv = pimpl->srv; + bool was_bound = false; + bool is_sock = false; + if (string_ends_with(std::string(hostname), ".sock")) { + is_sock = true; + LOG_INF("%s: setting address family to AF_UNIX\n", __func__); + srv->set_address_family(AF_UNIX); + // bind_to_port requires a second arg, any value other than 0 should + // simply get ignored + was_bound = srv->bind_to_port(hostname, 8080); + } else { + LOG_INF("%s: binding port with default address family\n", __func__); + // bind HTTP listen port + if (port == 0) { + int bound_port = srv->bind_to_any_port(hostname); + if ((was_bound = (bound_port >= 0))) { + port = bound_port; + } + } else { + was_bound = srv->bind_to_port(hostname, port); + } + } + + if (!was_bound) { + LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port); + return false; + } + + // run the HTTP server in a thread + thread = std::thread([this]() { pimpl->srv->listen_after_bind(); }); + srv->wait_until_ready(); + + LOG_INF("%s: server is listening on %s\n", __func__, + is_sock ? string_format("unix://%s", hostname.c_str()).c_str() : + string_format("http://%s:%d", hostname.c_str(), port).c_str()); + return true; +} + +void server_http_context::stop() { + if (pimpl->srv) { + pimpl->srv->stop(); + } +} + +static void set_headers(httplib::Response & res, const std::map & headers) { + for (const auto & [key, value] : headers) { + res.set_header(key, value); + } +} + +static std::map get_params(const httplib::Request & req) { + std::map params; + for (const auto & [key, value] : req.params) { + params[key] = value; + } + for (const auto & [key, value] : req.path_params) { + params[key] = value; + } + return params; +} + +void server_http_context::get(const std::string & path, server_http_context::handler_t handler) { + pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { + server_http_resgen_ptr response = handler(server_http_request{ + get_params(req), + json{}, + req.is_connection_closed + }); + GGML_ASSERT(!response->is_stream() && "not supported for GET method"); + res.status = response->status; + set_headers(res, response->headers); + res.set_content(response->data, response->content_type); + }); +} + +void server_http_context::post(const std::string & path, server_http_context::handler_t handler) { + pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { + server_http_resgen_ptr response = handler(server_http_request{ + get_params(req), + json::parse(req.body.empty() ? "{}" : req.body), + req.is_connection_closed + }); + if (response->is_stream()) { + res.status = response->status; + set_headers(res, response->headers); + std::string content_type = response->content_type; + // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it + std::shared_ptr r_ptr = std::move(response); + const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { + // TODO: maybe handle sink.write unsuccessful case? 
for now, we rely on is_connection_closed()
+            sink.write(response->data.data(), response->data.size());
+            SRV_DBG("http: streamed chunk: %s\n", response->data.c_str());
+            if (!response->next()) {
+                SRV_DBG("%s", "http: stream ended\n");
+                sink.done();
+                return false; // end of stream
+            }
+            return true;
+        };
+        const auto on_complete = [response = r_ptr](bool) mutable {
+            response.reset(); // trigger the destruction of the response object
+        };
+        res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete);
+    } else {
+        res.status = response->status;
+        set_headers(res, response->headers);
+        res.set_content(response->data, response->content_type);
+    }
+  });
+}
+
diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index 5d93b94b6ce95..b0dc510e2ec33 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -7,18 +7,17 @@
 #include
 #include
 
-// auto generated files (see README.md for details)
-#include "index.html.gz.hpp"
-#include "loading.html.hpp"
-
 // generator-like API for HTTP response generation
+// this object responds in one of two modes:
+// 1) normal response: `data` contains the full response body
+// 2) streaming response: each call to next() generates the next chunk, stored in `data`
+//    when next() returns false, the stream ends
 struct server_http_resgen {
     std::string content_type = "application/json; charset=utf-8";
     int status = 200;
     std::string data;
+    std::map headers;
 
-    // if is_stream is true, next() will return true until the stream ends
-    // the data member will contain the next chunk of data to send
     // TODO: move this to a virtual function once we have proper polymorphism support
     std::function next = nullptr;
     bool is_stream() const {
@@ -33,20 +32,33 @@ struct server_http_resgen {
 using server_http_resgen_ptr = std::unique_ptr;
 
 struct server_http_request {
-    std::unordered_map query_params;
+    std::map params; // path_params + query_params
     json body;
     const std::function & should_stop;
+
+    std::string get_param(const std::string & key, const std::string & def = "") const {
+        auto it = params.find(key);
+        if (it != params.end()) {
+            return it->second;
+        }
+        return def;
+    }
 };
 
 struct server_http_context {
-    std::thread thread;
-    std::unique_ptr svr;
+    class Impl;
+    std::unique_ptr pimpl;
+
+    std::thread thread; // server thread
     std::atomic is_ready = false;
 
     std::string path_prefix;
    std::string hostname;
    int port;
 
+    server_http_context();
+    ~server_http_context();
+
     bool init(const common_params & params);
     bool start();
     void stop();
@@ -55,308 +67,3 @@ struct server_http_context {
     void get(const std::string &, handler_t);
     void post(const std::string &, handler_t);
 };
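Editor's note: to make the generator contract above concrete, here is a minimal sketch of a streaming handler built against this interface, under the semantics documented in the header at this point of the series (each call to `next()` overwrites `data` with the following chunk; returning false ends the stream). The handler, its chunk contents, and the counting logic are illustrative, not part of this patch.

```cpp
// Hypothetical streaming response that emits "0", "1", "2" as SSE-style chunks.
#include <memory>
#include <string>
#include "server-http.h" // assumed visible to the handler

server_http_resgen_ptr make_counting_stream() {
    auto res = std::make_unique<server_http_resgen>();
    res->content_type = "text/event-stream";
    res->data         = "data: 0\n\n"; // first chunk, written before next() is first called
    auto counter = std::make_shared<int>(0);
    // Capturing the raw pointer is safe: the lambda is owned by the object itself,
    // mirroring the `res_this = res.get()` pattern used later in server.cpp.
    res->next = [self = res.get(), counter]() -> bool {
        if (++*counter >= 3) {
            return false; // stream ends; nothing new was placed in `data`
        }
        self->data = "data: " + std::to_string(*counter) + "\n\n"; // overwrite with next chunk
        return true;
    };
    return res;
}
```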
LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); - svr.reset( - new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) - ); - } else { - LOG_INF("Running without SSL\n"); - svr.reset(new httplib::Server()); - } -#else - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_ERR("Server is built without SSL support\n"); - return false; - } - svr.reset(new httplib::Server()); -#endif - - svr->set_default_headers({{"Server", "llama.cpp"}}); - svr->set_logger(log_server_request); - svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { - std::string message; - try { - std::rethrow_exception(ep); - } catch (const std::exception & e) { - message = e.what(); - } catch (...) { - message = "Unknown Exception"; - } - - // FIXME - GGML_UNUSED(res); - GGML_UNUSED(message); - // try { - // json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - // LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); - // res_error(res, formatted_error); - // } catch (const std::exception & e) { - // LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str()); - // } - }); - - svr->set_error_handler([](const httplib::Request &, httplib::Response & res) { - if (res.status == 404) { - // FIXME - //res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND)); - res.set_content("404 Not Found", "text/plain"); - } - // for other error codes, we skip processing here because it's already done by res_error() - }); - - // set timeouts and change hostname and port - svr->set_read_timeout (params.timeout_read); - svr->set_write_timeout(params.timeout_write); - - if (params.api_keys.size() == 1) { - auto key = params.api_keys[0]; - std::string substr = key.substr(std::max((int)(key.length() - 4), 0)); - LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str()); - } else if (params.api_keys.size() > 1) { - LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size()); - } - - // - // Middlewares - // - - auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) { - static const std::unordered_set public_endpoints = { - "/health", - "/v1/health", - "/models", - "/v1/models", - "/api/tags" - }; - - // If API key is not set, skip validation - if (api_keys.empty()) { - return true; - } - - // If path is public or is static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { - return true; - } - - // Check for API key in the header - auto auth_header = req.get_header_value("Authorization"); - - std::string prefix = "Bearer "; - if (auth_header.substr(0, prefix.size()) == prefix) { - std::string received_api_key = auth_header.substr(prefix.size()); - if (std::find(api_keys.begin(), api_keys.end(), received_api_key) != api_keys.end()) { - return true; // API key is valid - } - } - - // API key is invalid or not provided - //res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - // FIXME - res.status = 401; - res.set_content("Unauthorized: Invalid API Key", "text/plain"); - - LOG_WRN("Unauthorized: Invalid API Key\n"); - - return false; - }; - - auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) { - bool ready = is_ready.load(); - if (!ready) { - auto tmp = string_split(req.path, '.'); - if 
(req.path == "/" || tmp.back() == "html") { - res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); - res.status = 503; - } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { - // allow the models endpoint to be accessed during loading - return true; - } else { - // FIXME - //res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); - res.status = 503; - res.set_content("503 Service Unavailable: Loading model", "text/plain"); - } - return false; - } - return true; - }; - - // register server middlewares - svr->set_pre_routing_handler([middleware_validate_api_key, middleware_server_state](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - // If this is OPTIONS request, skip validation because browsers don't include Authorization header - if (req.method == "OPTIONS") { - res.set_header("Access-Control-Allow-Credentials", "true"); - res.set_header("Access-Control-Allow-Methods", "GET, POST"); - res.set_header("Access-Control-Allow-Headers", "*"); - res.set_content("", "text/html"); // blank response, no data - return httplib::Server::HandlerResponse::Handled; // skip further processing - } - if (!middleware_server_state(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - if (!middleware_validate_api_key(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - return httplib::Server::HandlerResponse::Unhandled; - }); - - int n_threads_http = params.n_threads_http; - if (n_threads_http < 1) { - // +2 threads for monitoring endpoints - n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); - } - LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http); - svr->new_task_queue = [n_threads_http] { return new httplib::ThreadPool(n_threads_http); }; - - // - // Web UI setup - // - - if (!params.webui) { - LOG_INF("Web UI is disabled\n"); - } else { - // register static assets routes - if (!params.public_path.empty()) { - // Set the base directory for serving static files - bool is_found = svr->set_mount_point(params.api_prefix + "/", params.public_path); - if (!is_found) { - LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); - return 1; - } - } else { - // using embedded static index.html - svr->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) { - if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { - res.set_content("Error: gzip is not supported by this browser", "text/plain"); - } else { - res.set_header("Content-Encoding", "gzip"); - // COEP and COOP headers, required by pyodide (python interpreter) - res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); - res.set_header("Cross-Origin-Opener-Policy", "same-origin"); - res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); - } - return false; - }); - } - } - return true; -} - -bool server_http_context::start() { - // Bind and listen - - bool was_bound = false; - bool is_sock = false; - if (string_ends_with(std::string(hostname), ".sock")) { - is_sock = true; - LOG_INF("%s: setting address family to AF_UNIX\n", __func__); - svr->set_address_family(AF_UNIX); - // bind_to_port requires a second arg, any value other than 0 should - // simply get ignored - was_bound = svr->bind_to_port(hostname, 8080); - } else { - 
LOG_INF("%s: binding port with default address family\n", __func__); - // bind HTTP listen port - if (port == 0) { - int bound_port = svr->bind_to_any_port(hostname); - if ((was_bound = (bound_port >= 0))) { - port = bound_port; - } - } else { - was_bound = svr->bind_to_port(hostname, port); - } - } - - if (!was_bound) { - LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port); - return false; - } - - // run the HTTP server in a thread - thread = std::thread([this]() { svr->listen_after_bind(); }); - svr->wait_until_ready(); - - LOG_INF("%s: server is listening on %s\n", __func__, - is_sock ? string_format("unix://%s", hostname.c_str()).c_str() : - string_format("http://%s:%d", hostname.c_str(), port).c_str()); - return true; -} - -void server_http_context::stop() { - if (svr) { - svr->stop(); - } -} - -void server_http_context::get(const std::string & path, server_http_context::handler_t handler) { - svr->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { - server_http_resgen_ptr response = handler(server_http_request{ - req.path_params, - json{}, - req.is_connection_closed - }); - GGML_ASSERT(!response->is_stream() && "not supported for GET method"); - res.status = response->status; - res.set_content(response->data, response->content_type); - }); -} - -void server_http_context::post(const std::string & path, server_http_context::handler_t handler) { - svr->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { - server_http_resgen_ptr response = handler(server_http_request{ - req.path_params, - json::parse(req.body.empty() ? "{}" : req.body), - req.is_connection_closed - }); - if (response->is_stream()) { - res.status = response->status; - std::string content_type = response->content_type; - // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it - std::shared_ptr r_ptr = std::move(response); - const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { - // TODO: maybe handle sink.write unsuccessful case? 
for now, we rely on is_connection_closed() - sink.write(response->data.data(), response->data.size()); - SRV_DBG("http: streamed chunk: %s\n", response->data.c_str()); - if (!response->next()) { - SRV_DBG("%s", "http: stream ended\n"); - sink.done(); - return false; // end of stream - } - return true; - }; - const auto on_complete = [response = r_ptr](bool) mutable { - response.reset(); // trigger the destruction of the response object - }; - res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete); - } else { - res.status = response->status; - res.set_content(response->data, response->content_type); - } - }); -} diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 2b20cf0b87c99..d828c922f26c9 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -9,8 +9,6 @@ #include "mtmd-helper.h" #include "chat.h" -#include - #define JSON_ASSERT GGML_ASSERT #include @@ -21,6 +19,9 @@ #include #include +#define JSON_ASSERT GGML_ASSERT +#include + #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo" using json = nlohmann::ordered_json; From 473b0e58236fb80944af18c2f12027779fb78395 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 21:59:01 +0100 Subject: [PATCH 03/21] add the remaining endpoints --- tools/server/server-http.cpp | 12 +- tools/server/server-http.h | 4 +- tools/server/server.cpp | 865 ++++++++++++++++++++++++++++++++++- 3 files changed, 860 insertions(+), 21 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 0324e8d69f96c..5b3b665f4b22f 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -80,6 +80,7 @@ bool server_http_context::init(const common_params & params) { // FIXME GGML_UNUSED(res); GGML_UNUSED(message); + printf("Exception caught in HTTP server: %s\n", message.c_str()); // try { // json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); // LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); @@ -306,7 +307,7 @@ void server_http_context::get(const std::string & path, server_http_context::han pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { server_http_resgen_ptr response = handler(server_http_request{ get_params(req), - json{}, + req.body, req.is_connection_closed }); GGML_ASSERT(!response->is_stream() && "not supported for GET method"); @@ -320,7 +321,7 @@ void server_http_context::post(const std::string & path, server_http_context::ha pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { server_http_resgen_ptr response = handler(server_http_request{ get_params(req), - json::parse(req.body.empty() ? "{}" : req.body), + req.body, req.is_connection_closed }); if (response->is_stream()) { @@ -330,15 +331,18 @@ void server_http_context::post(const std::string & path, server_http_context::ha // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it std::shared_ptr r_ptr = std::move(response); const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { - // TODO: maybe handle sink.write unsuccessful case? for now, we rely on is_connection_closed() + // TODO: maybe handle sink.write unsuccessful? 
for now, we rely on is_connection_closed() sink.write(response->data.data(), response->data.size()); SRV_DBG("http: streamed chunk: %s\n", response->data.c_str()); if (!response->next()) { + // flush the remaining data + sink.write(response->data.data(), response->data.size()); + SRV_DBG("http: streamed chunk (last): %s\n", response->data.c_str()); SRV_DBG("%s", "http: stream ended\n"); sink.done(); return false; // end of stream } - return true; + return true; // more data, continue the loop }; const auto on_complete = [response = r_ptr](bool) mutable { response.reset(); // trigger the destruction of the response object diff --git a/tools/server/server-http.h b/tools/server/server-http.h index b0dc510e2ec33..f449bfedc27b0 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -11,7 +11,7 @@ // this object response with one of the 2 modes: // 1) normal response: `data` contains the full response body // 2) streaming response: each call to next() generates the next chunk, stored in `data` -// when next() returns false, the stream ends +// when next() returns false, no more data after the current chunk struct server_http_resgen { std::string content_type = "application/json; charset=utf-8"; int status = 200; @@ -33,7 +33,7 @@ using server_http_resgen_ptr = std::unique_ptr; struct server_http_request { std::map params; // path_params + query_params - json body; + std::string body; const std::function & should_stop; std::string get_param(const std::string & key, const std::string & def = "") const { diff --git a/tools/server/server.cpp b/tools/server/server.cpp index e4f0167fa838c..1f4ca79278c7d 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1665,7 +1666,7 @@ struct server_slot { server_prompt prompt; void prompt_save(server_prompt_cache & prompt_cache) const { - assert(prompt.data.size() == 0); + GGML_ASSERT(prompt.data.size() == 0); const size_t cur_size = llama_state_seq_get_size_ext(ctx, id, 0); @@ -4429,8 +4430,11 @@ struct server_resgen : server_http_resgen { }; struct server_routes { + const common_params & params; server_context & ctx_server; - server_routes(server_context & ctx_server_) : ctx_server(ctx_server_) {} + server_http_context & ctx_http; // for reading is_ready + server_routes(const common_params & params, server_context & ctx_server, server_http_context & ctx_http) + : params(params), ctx_server(ctx_server), ctx_http(ctx_http) {} public: // handlers using lambda function, so that they can capture `this` without `std::bind` @@ -4442,11 +4446,352 @@ struct server_routes { return res; }; + server_http_context::handler_t get_metrics = [this](const server_http_request &) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_metrics) { + res->error(format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // request slots data using task queue + // TODO: use server_response_reader + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_METRICS); + task.id = task_id; + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task), true); // high-priority task + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + // TODO: get rid of this dynamic_cast + auto res_task = dynamic_cast(result.get()); + GGML_ASSERT(res_task != nullptr); + + // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names + json all_metrics_def = json { + {"counter", {{ + {"name", "prompt_tokens_total"}, + {"help", "Number of prompt tokens processed."}, + {"value", (uint64_t) res_task->n_prompt_tokens_processed_total} + }, { + {"name", "prompt_seconds_total"}, + {"help", "Prompt process time"}, + {"value", (uint64_t) res_task->t_prompt_processing_total / 1.e3} + }, { + {"name", "tokens_predicted_total"}, + {"help", "Number of generation tokens processed."}, + {"value", (uint64_t) res_task->n_tokens_predicted_total} + }, { + {"name", "tokens_predicted_seconds_total"}, + {"help", "Predict process time"}, + {"value", (uint64_t) res_task->t_tokens_generation_total / 1.e3} + }, { + {"name", "n_decode_total"}, + {"help", "Total number of llama_decode() calls"}, + {"value", res_task->n_decode_total} + }, { + {"name", "n_tokens_max"}, + {"help", "Largest observed n_tokens."}, + {"value", res_task->n_tokens_max} + }, { + {"name", "n_busy_slots_per_decode"}, + {"help", "Average number of busy slots per llama_decode() call"}, + {"value", (float) res_task->n_busy_slots_total / std::max((float) res_task->n_decode_total, 1.f)} + }}}, + {"gauge", {{ + {"name", "prompt_tokens_seconds"}, + {"help", "Average prompt throughput in tokens/s."}, + {"value", res_task->n_prompt_tokens_processed ? 1.e3 / res_task->t_prompt_processing * res_task->n_prompt_tokens_processed : 0.} + },{ + {"name", "predicted_tokens_seconds"}, + {"help", "Average generation throughput in tokens/s."}, + {"value", res_task->n_tokens_predicted ? 
1.e3 / res_task->t_tokens_generation * res_task->n_tokens_predicted : 0.} + },{ + {"name", "requests_processing"}, + {"help", "Number of requests processing."}, + {"value", (uint64_t) res_task->n_processing_slots} + },{ + {"name", "requests_deferred"}, + {"help", "Number of requests deferred."}, + {"value", (uint64_t) res_task->n_tasks_deferred} + }}} + }; + + std::stringstream prometheus; + + for (const auto & el : all_metrics_def.items()) { + const auto & type = el.key(); + const auto & metrics_def = el.value(); + + for (const auto & metric_def : metrics_def) { + const std::string name = metric_def.at("name"); + const std::string help = metric_def.at("help"); + + auto value = json_value(metric_def, "value", 0.); + prometheus << "# HELP llamacpp:" << name << " " << help << "\n" + << "# TYPE llamacpp:" << name << " " << type << "\n" + << "llamacpp:" << name << " " << value << "\n"; + } + } + + res->headers["Process-Start-Time-Unix"] = std::to_string(res_task->t_start); + res->content_type = "text/plain; version=0.0.4"; + res->ok(prometheus.str()); + return res; + }; + + server_http_context::handler_t get_slots = [this](const server_http_request & req) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_slots) { + res->error(format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // request slots data using task queue + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_METRICS); + task.id = task_id; + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task), true); // high-priority task + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + // TODO: get rid of this dynamic_cast + auto res_task = dynamic_cast(result.get()); + GGML_ASSERT(res_task != nullptr); + + // optionally return "fail_on_no_slot" error + if (!req.get_param("fail_on_no_slot").empty()) { + if (res_task->n_idle_slots == 0) { + res->error(format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); + return res; + } + } + + res->ok(res_task->slots_data); + return res; + }; + + server_http_context::handler_t post_slots = [this](const server_http_request & req) { + auto res = std::make_unique(ctx_server); + if (params.slot_save_path.empty()) { + res->error(format_error_response("This server does not support slots action. 
Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + std::string id_slot_str = req.get_param("id_slot"); + int id_slot; + + try { + id_slot = std::stoi(id_slot_str); + } catch (const std::exception &) { + res->error(format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + std::string action = req.get_param("action"); + + if (action == "save") { + return handle_slots_save(req, id_slot); + } else if (action == "restore") { + return handle_slots_restore(req, id_slot); + } else if (action == "erase") { + return handle_slots_erase(req, id_slot); + } else { + res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + }; + + server_http_context::handler_t get_props = [this](const server_http_request &) { + auto res = std::make_unique(ctx_server); + json default_generation_settings_for_props; + + { + slot_params params; + + params.sampling = ctx_server.params_base.sampling; + + default_generation_settings_for_props = json { + {"params", params.to_json(true)}, + {"n_ctx", ctx_server.slots[0].n_ctx}, + }; + } + + // this endpoint is publicly available, please only return what is safe to be exposed + json data = { + { "default_generation_settings", default_generation_settings_for_props }, + { "total_slots", ctx_server.params_base.n_parallel }, + { "model_alias", ctx_server.params_base.model_alias }, + { "model_path", ctx_server.params_base.model.path }, + { "modalities", json { + {"vision", ctx_server.oai_parser_opt.allow_image}, + {"audio", ctx_server.oai_parser_opt.allow_audio}, + } }, + { "endpoint_slots", params.endpoint_slots }, + { "endpoint_props", params.endpoint_props }, + { "endpoint_metrics", params.endpoint_metrics }, + { "webui", params.webui }, + { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, + { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, + { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, + { "build_info", build_info }, + }; + if (ctx_server.params_base.use_jinja) { + if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) { + data["chat_template_tool_use"] = tool_use_src; + } + } + + res->ok(data); + return res; + }; + + server_http_context::handler_t post_props = [this](const server_http_request &) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_props) { + res->error(format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + // update any props here + + res->ok({{ "success", true }}); + return res; + }; + + server_http_context::handler_t get_api_show = [this](const server_http_request &) { + auto res = std::make_unique(ctx_server); + bool has_mtmd = ctx_server.mctx != nullptr; + json data = { + { + "template", common_chat_templates_source(ctx_server.chat_templates.get()), + }, + { + "model_info", { + { "llama.context_length", ctx_server.slots.back().n_ctx, }, + } + }, + {"modelfile", ""}, + {"parameters", ""}, + {"template", common_chat_templates_source(ctx_server.chat_templates.get())}, + {"details", { + {"parent_model", ""}, + {"format", "gguf"}, + {"family", ""}, + {"families", {""}}, + {"parameter_size", ""}, + {"quantization_level", ""} + }}, + {"model_info", ""}, + {"capabilities", has_mtmd ? 
json({"completion","multimodal"}) : json({"completion"})} + }; + + res->ok(data); + return res; + }; + + server_http_context::handler_t post_infill = [this](const server_http_request & req) { + auto res = std::make_unique(ctx_server); + // check model compatibility + std::string err; + if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "prefix token is missing. "; + } + if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "suffix token is missing. "; + } + if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "middle token is missing. "; + } + if (!err.empty()) { + res->error(format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // validate input + json data = json::parse(req.body); + if (data.contains("prompt") && !data.at("prompt").is_string()) { + // prompt is optional + res->error(format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + } + + if (!data.contains("input_prefix")) { + res->error(format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (!data.contains("input_suffix")) { + res->error(format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (data.contains("input_extra") && !data.at("input_extra").is_array()) { + // input_extra is optional + res->error(format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + json input_extra = json_value(data, "input_extra", json::array()); + for (const auto & chunk : input_extra) { + // { "text": string, "filename": string } + if (!chunk.contains("text") || !chunk.at("text").is_string()) { + res->error(format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + // filename is optional + if (chunk.contains("filename") && !chunk.at("filename").is_string()) { + res->error(format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + data["input_extra"] = input_extra; // default to empty array if it's not exist + + std::string prompt = json_value(data, "prompt", std::string()); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true); + SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); + data["prompt"] = format_infill( + ctx_server.vocab, + data.at("input_prefix"), + data.at("input_suffix"), + data.at("input_extra"), + ctx_server.params_base.n_batch, + ctx_server.params_base.n_predict, + ctx_server.slots[0].n_ctx, // TODO: there should be a better way + ctx_server.params_base.spm_infill, + tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal. 
+
     server_http_context::handler_t post_completions = [this](const server_http_request & req) {
         std::vector files; // dummy
+        const json body = json::parse(req.body);
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
-            req.body,
+            body,
             files,
             req.should_stop,
             OAICOMPAT_TYPE_NONE);
@@ -4454,9 +4799,10 @@ struct server_routes {
 
     server_http_context::handler_t post_completions_oai = [this](const server_http_request & req) {
         std::vector files; // dummy
+        const json body = json::parse(req.body);
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
-            req.body,
+            body,
             files,
             req.should_stop,
             OAICOMPAT_TYPE_COMPLETION);
@@ -4464,21 +4810,288 @@ struct server_routes {
 
     server_http_context::handler_t post_chat_completions = [this](const server_http_request & req) {
         std::vector files;
-        json body_cpy = req.body;
-        json data = oaicompat_chat_params_parse(
-            body_cpy,
+        json body = json::parse(req.body);
+        json body_parsed = oaicompat_chat_params_parse(
+            body,
             ctx_server.oai_parser_opt,
             files);
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
-            body_cpy,
+            body_parsed,
             files,
             req.should_stop,
             OAICOMPAT_TYPE_CHAT);
     };
 
+    // same as post_chat_completions, but without the inference part
+    server_http_context::handler_t post_apply_template = [this](const server_http_request & req) {
+        auto res = std::make_unique(ctx_server);
+        std::vector files; // dummy, unused
+        json body = json::parse(req.body);
+        json data = oaicompat_chat_params_parse(
+            body,
+            ctx_server.oai_parser_opt,
+            files);
+        res->ok({{ "prompt", std::move(data.at("prompt")) }});
+        return res;
+    };
+
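Editor's note: to illustrate the new endpoint, a request that exercises post_apply_template might look as follows (values are made up). It goes through the same OAI-style parsing as /v1/chat/completions, but returns the rendered prompt, e.g. `{"prompt": "<templated text>"}`, without running inference.

```cpp
// Illustrative /apply-template request body (not part of the patch).
#include <iostream>
#include <nlohmann/json.hpp>
using json = nlohmann::ordered_json;

int main() {
    json body = {
        {"messages", json::array({
            { {"role", "system"}, {"content", "You are a helpful assistant."} },
            { {"role", "user"},   {"content", "Hello!"} }
        })}
    };
    std::cout << body.dump(2) << "\n";
}
```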
+    server_http_context::handler_t get_models = [this](const server_http_request &) {
+        auto res = std::make_unique(ctx_server);
+        bool is_model_ready = ctx_http.is_ready.load();
+        json model_meta = nullptr;
+        if (is_model_ready) {
+            model_meta = ctx_server.model_meta();
+        }
+        bool has_mtmd = ctx_server.mctx != nullptr;
+        json models = {
+            {"models", {
+                {
+                    {"name", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"modified_at", ""},
+                    {"size", ""},
+                    {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
+                    {"type", "model"},
+                    {"description", ""},
+                    {"tags", {""}},
+                    {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})},
+                    {"parameters", ""},
+                    {"details", {
+                        {"parent_model", ""},
+                        {"format", "gguf"},
+                        {"family", ""},
+                        {"families", {""}},
+                        {"parameter_size", ""},
+                        {"quantization_level", ""}
+                    }}
+                }
+            }},
+            {"object", "list"},
+            {"data", {
+                {
+                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"object", "model"},
+                    {"created", std::time(0)},
+                    {"owned_by", "llamacpp"},
+                    {"meta", model_meta},
+                },
+            }}
+        };
+
+        res->ok(models);
+        return res;
+    };
+
+    server_http_context::handler_t post_tokenize = [this](const server_http_request & req) {
+        auto res = std::make_unique(ctx_server);
+        const json body = json::parse(req.body);
+        json tokens_response = json::array();
+        if (body.count("content") != 0) {
+            const bool add_special = json_value(body, "add_special", false);
+            const bool parse_special = json_value(body, "parse_special", true);
+            const bool with_pieces = json_value(body, "with_pieces", false);
+
+            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special);
+
+            if (with_pieces) {
+                for (const auto& token : tokens) {
+                    std::string piece = common_token_to_piece(ctx_server.ctx, token);
+                    json piece_json;
+
+                    // Check if the piece is valid UTF-8
+                    if (is_valid_utf8(piece)) {
+                        piece_json = piece;
+                    } else {
+                        // If not valid UTF-8, store as array of byte values
+                        piece_json = json::array();
+                        for (unsigned char c : piece) {
+                            piece_json.push_back(static_cast(c));
+                        }
+                    }
+
+                    tokens_response.push_back({
+                        {"id", token},
+                        {"piece", piece_json}
+                    });
+                }
+            } else {
+                tokens_response = tokens;
+            }
+        }
+
+        const json data = format_tokenizer_response(tokens_response);
+        res->ok(data);
+        return res;
+    };
+
+    server_http_context::handler_t post_detokenize = [this](const server_http_request & req) {
+        auto res = std::make_unique(ctx_server);
+        const json body = json::parse(req.body);
+
+        std::string content;
+        if (body.count("tokens") != 0) {
+            const llama_tokens tokens = body.at("tokens");
+            content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
+        }
+
+        const json data = format_detokenized_response(content);
+        res->ok(data);
+        return res;
+    };
+
+    server_http_context::handler_t post_embeddings = [this](const server_http_request & req) {
+        return handle_embeddings_impl(req, OAICOMPAT_TYPE_NONE);
+    };
+
+    server_http_context::handler_t post_embeddings_oai = [this](const server_http_request & req) {
+        return handle_embeddings_impl(req, OAICOMPAT_TYPE_EMBEDDING);
+    };
+
+    server_http_context::handler_t post_rerank = [this](const server_http_request & req) {
+        auto res = std::make_unique(ctx_server);
+        if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
+            res->error(format_error_response("This server does not support reranking.
Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + const json body = json::parse(req.body); + + // if true, use TEI API format, otherwise use Jina API format + // Jina: https://jina.ai/reranker/ + // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank + bool is_tei_format = body.contains("texts"); + + json query; + if (body.count("query") == 1) { + query = body.at("query"); + if (!query.is_string()) { + res->error(format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } else { + res->error(format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + std::vector documents = json_value(body, "documents", + json_value(body, "texts", std::vector())); + if (documents.empty()) { + res->error(format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + int top_n = json_value(body, "top_n", (int)documents.size()); + + // create and queue the task + json responses = json::array(); + server_response_reader rd(ctx_server); + { + std::vector tasks; + tasks.reserve(documents.size()); + for (size_t i = 0; i < documents.size(); i++) { + auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); + server_task task = server_task(SERVER_TASK_TYPE_RERANK); + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.tokens = std::move(tmp); + tasks.push_back(std::move(task)); + } + rd.post_tasks(std::move(tasks)); + } + + // wait for the results + auto all_results = rd.wait_for_all(req.should_stop); + + // collect results + if (all_results.is_terminated) { + return res; // connection is closed + } else if (all_results.error) { + res->error(all_results.error->to_json()); + return res; + } else { + for (auto & res : all_results.results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); + } + } + + // write JSON response + json root = format_response_rerank( + body, + responses, + is_tei_format, + documents, + top_n); + + res->ok(root); + return res; + }; + + server_http_context::handler_t get_lora_adapters = [this](const server_http_request &) { + auto res = std::make_unique(ctx_server); + json result = json::array(); + const auto & loras = ctx_server.params_base.lora_adapters; + for (size_t i = 0; i < loras.size(); ++i) { + auto & lora = loras[i]; + json entry = { + {"id", i}, + {"path", lora.path}, + {"scale", lora.scale}, + {"task_name", lora.task_name}, + {"prompt_prefix", lora.prompt_prefix}, + }; + std::string alora_invocation_string = ""; + const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr); + std::vector alora_invocation_tokens; + if (n_alora_tokens) { + const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr); + for (uint64_t i = 0; i < n_alora_tokens; ++i) { + alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[i]); + alora_invocation_tokens.push_back(alora_tokens[i]); + } + entry["alora_invocation_string"] = alora_invocation_string; + entry["alora_invocation_tokens"] = alora_invocation_tokens; + } + result.push_back(std::move(entry)); + } + res->ok(result); + return res; + }; + + server_http_context::handler_t post_lora_adapters = [this](const server_http_request & req) { + auto res = std::make_unique(ctx_server); + const json body = json::parse(req.body); + if (!body.is_array()) { + 
res->error(format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SET_LORA); + task.id = task_id; + task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; + }; + private: - server_http_resgen_ptr handle_completions_impl( + std::unique_ptr handle_completions_impl( server_task_type type, const json & data, const std::vector & files, @@ -4573,7 +5186,8 @@ struct server_routes { } // next responses are streamed - res->ok(first_result->to_json()); + res->data = format_sse(first_result->to_json()); // to be sent immediately + res->status = 200; res->content_type = "text/event-stream"; res->next = [res_this = res.get(), oaicompat, &should_stop]() mutable -> bool { if (should_stop()) { @@ -4583,7 +5197,6 @@ struct server_routes { server_response_reader & rd = res_this->rd; std::string & output = res_this->data; - output.clear(); // receive subsequent results auto result = rd.next(should_stop); @@ -4595,7 +5208,7 @@ struct server_routes { // send the results json res_json = result->to_json(); if (result->is_error()) { - output += format_sse(res_json); + output = format_sse(res_json); SRV_DBG("%s", "error received during streaming, terminating stream\n"); return false; // terminate on error } else { @@ -4603,7 +5216,7 @@ struct server_routes { dynamic_cast(result.get()) != nullptr || dynamic_cast(result.get()) != nullptr ); - output += format_sse(res_json); + output = format_sse(res_json); } // check if there is more data @@ -4622,6 +5235,202 @@ struct server_routes { return res; } + + std::unique_ptr handle_slots_save(const server_http_request & req, int id_slot) { + auto res = std::make_unique(ctx_server); + const json & request_data = req.body; + std::string filename = request_data.at("filename"); + if (!fs_validate_filename(filename)) { + res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + std::string filepath = params.slot_save_path + filename; + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_SAVE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + res->ok(result->to_json()); + return res; + } + + std::unique_ptr handle_slots_restore(const server_http_request & req, int id_slot) { + auto res = std::make_unique(ctx_server); + const json & request_data = req.body; + std::string filename = request_data.at("filename"); + if (!fs_validate_filename(filename)) { + res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); + 
return res; + } + std::string filepath = params.slot_save_path + filename; + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; + } + + std::unique_ptr handle_slots_erase(const server_http_request &, int id_slot) { + auto res = std::make_unique(ctx_server); + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_ERASE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; + } + + std::unique_ptr handle_embeddings_impl(const server_http_request & req, oaicompat_type oaicompat) { + auto res = std::make_unique(ctx_server); + if (!ctx_server.params_base.embedding) { + res->error(format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + res->error(format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + const json body = json::parse(req.body); + + // for the shape of input/content, see tokenize_input_prompts() + json prompt; + if (body.count("input") != 0) { + prompt = body.at("input"); + } else if (body.contains("content")) { + oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible + prompt = body.at("content"); + } else { + res->error(format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + bool use_base64 = false; + if (body.count("encoding_format") != 0) { + const std::string& format = body.at("encoding_format"); + if (format == "base64") { + use_base64 = true; + } else if (format != "float") { + res->error(format_error_response("The format to return the embeddings in. 
Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + + auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); + for (const auto & tokens : tokenized_prompts) { + // this check is necessary for models that do not add BOS token to the input + if (tokens.empty()) { + res->error(format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + + int embd_normalize = 2; // default to Euclidean/L2 norm + if (body.count("embd_normalize") != 0) { + embd_normalize = body.at("embd_normalize"); + if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx)); + } + } + + // create and queue the task + json responses = json::array(); + server_response_reader rd(ctx_server); + { + std::vector tasks; + for (size_t i = 0; i < tokenized_prompts.size(); i++) { + server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.tokens = std::move(tokenized_prompts[i]); + + // OAI-compat + task.params.oaicompat = oaicompat; + task.params.embd_normalize = embd_normalize; + + tasks.push_back(std::move(task)); + } + rd.post_tasks(std::move(tasks)); + } + + // wait for the results + auto all_results = rd.wait_for_all(req.should_stop); + + // collect results + if (all_results.is_terminated) { + return res; // connection is closed + } else if (all_results.error) { + res->error(all_results.error->to_json()); + return res; + } else { + for (auto & res : all_results.results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); + } + } + + // write JSON response + json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING + ? format_embeddings_response_oaicompat(body, responses, use_base64) + : json(responses); + res->ok(root); + return res; + } }; std::function shutdown_handler; @@ -4684,14 +5493,40 @@ int main(int argc, char ** argv) { // // register API routes - server_routes routes(ctx_server); - ctx_http.get ("/health", routes.get_health); + server_routes routes(params, ctx_server, ctx_http); + + ctx_http.get ("/health", routes.get_health); // public endpoint (no API key check) + ctx_http.get ("/v1/health", routes.get_health); // public endpoint (no API key check) + ctx_http.get ("/metrics", routes.get_metrics); + ctx_http.get ("/props", routes.get_props); + ctx_http.post("/props", routes.post_props); + ctx_http.post("/api/show", routes.get_api_show); + ctx_http.get ("/models", routes.get_models); // public endpoint (no API key check) + ctx_http.get ("/v1/models", routes.get_models); // public endpoint (no API key check) + ctx_http.get ("/api/tags", routes.get_models); // ollama specific endpoint. 
public endpoint (no API key check) ctx_http.post("/completion", routes.post_completions); // legacy ctx_http.post("/completions", routes.post_completions); ctx_http.post("/v1/completions", routes.post_completions_oai); ctx_http.post("/chat/completions", routes.post_chat_completions); ctx_http.post("/v1/chat/completions", routes.post_chat_completions); ctx_http.post("/api/chat", routes.post_chat_completions); // ollama specific endpoint + ctx_http.post("/infill", routes.post_infill); + ctx_http.post("/embedding", routes.post_embeddings); // legacy + ctx_http.post("/embeddings", routes.post_embeddings); + ctx_http.post("/v1/embeddings", routes.post_embeddings_oai); + ctx_http.post("/rerank", routes.post_rerank); + ctx_http.post("/reranking", routes.post_rerank); + ctx_http.post("/v1/rerank", routes.post_rerank); + ctx_http.post("/v1/reranking", routes.post_rerank); + ctx_http.post("/tokenize", routes.post_tokenize); + ctx_http.post("/detokenize", routes.post_detokenize); + ctx_http.post("/apply-template", routes.post_apply_template); + // LoRA adapters hotswap + ctx_http.get ("/lora-adapters", routes.get_lora_adapters); + ctx_http.post("/lora-adapters", routes.post_lora_adapters); + // Save & load slots + ctx_http.get ("/slots", routes.get_slots); + ctx_http.post("/slots/:id_slot", routes.post_slots); // // Start the server From a2e6a00371d0f860e21511f158693bf19759130d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 22:28:51 +0100 Subject: [PATCH 04/21] fix exception/error handling --- tools/server/server-http.cpp | 31 +++++++------ tools/server/server.cpp | 89 +++++++++++++++++++++++------------- 2 files changed, 74 insertions(+), 46 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 5b3b665f4b22f..d3fb13bb840be 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -68,6 +68,8 @@ bool server_http_context::init(const common_params & params) { srv->set_default_headers({{"Server", "llama.cpp"}}); srv->set_logger(log_server_request); srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { + // this is fail-safe; exceptions should already handled by `ex_wrapper` + std::string message; try { std::rethrow_exception(ep); @@ -77,26 +79,25 @@ bool server_http_context::init(const common_params & params) { message = "Unknown Exception"; } - // FIXME - GGML_UNUSED(res); - GGML_UNUSED(message); - printf("Exception caught in HTTP server: %s\n", message.c_str()); - // try { - // json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - // LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); - // res_error(res, formatted_error); - // } catch (const std::exception & e) { - // LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str()); - // } + res.status = 500; + res.set_content(message, "text/plain"); + LOG_ERR("got exception: %s\n", message.c_str()); }); srv->set_error_handler([](const httplib::Request &, httplib::Response & res) { if (res.status == 404) { - // FIXME - //res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND)); - res.set_content("404 Not Found", "text/plain"); + res.set_content( + (json { + {"error", { + {"message", "File Not Found"}, + {"type", "not_found_error"}, + {"code", 404} + }} + }).dump(), + "application/json; charset=utf-8" + ); } - // for other error codes, we skip processing here because it's already done by res_error() + // for other error codes, 
we skip processing here because it's already done by res->error() }); // set timeouts and change hostname and port diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 1f4ca79278c7d..cee4a7fa0a96b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4425,7 +4425,7 @@ struct server_resgen : server_http_resgen { } void error(const json & error_data) { status = json_value(error_data, "code", 500); - data = safe_json_to_str(error_data); + data = safe_json_to_str({{ "error", error_data }}); } }; @@ -5447,6 +5447,33 @@ inline void signal_handler(int signal) { shutdown_handler(signal); } +// wrapper function that handles exceptions and logs errors +static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { + return [func = std::move(func)](const server_http_request & req) -> server_http_resgen_ptr { + std::string message; + try { + return func(req); + } catch (const std::exception & e) { + message = e.what(); + } catch (...) { + message = "unknown error"; + } + + auto res = std::make_unique(); + res->status = 500; + try { + json error_data = format_error_response(message, ERROR_TYPE_SERVER); + res->status = json_value(error_data, "code", 500); + res->data = safe_json_to_str({{ "error", error_data }}); + LOG_WRN("got exception: %s\n", res->data.c_str()); + } catch (const std::exception & e) { + LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str()); + res->data = "Internal Server Error"; + } + return res; + }; +} + int main(int argc, char ** argv) { // own arguments required by this example common_params params; @@ -5495,38 +5522,38 @@ int main(int argc, char ** argv) { // register API routes server_routes routes(params, ctx_server, ctx_http); - ctx_http.get ("/health", routes.get_health); // public endpoint (no API key check) - ctx_http.get ("/v1/health", routes.get_health); // public endpoint (no API key check) - ctx_http.get ("/metrics", routes.get_metrics); - ctx_http.get ("/props", routes.get_props); - ctx_http.post("/props", routes.post_props); - ctx_http.post("/api/show", routes.get_api_show); - ctx_http.get ("/models", routes.get_models); // public endpoint (no API key check) - ctx_http.get ("/v1/models", routes.get_models); // public endpoint (no API key check) - ctx_http.get ("/api/tags", routes.get_models); // ollama specific endpoint. 
public endpoint (no API key check) - ctx_http.post("/completion", routes.post_completions); // legacy - ctx_http.post("/completions", routes.post_completions); - ctx_http.post("/v1/completions", routes.post_completions_oai); - ctx_http.post("/chat/completions", routes.post_chat_completions); - ctx_http.post("/v1/chat/completions", routes.post_chat_completions); - ctx_http.post("/api/chat", routes.post_chat_completions); // ollama specific endpoint - ctx_http.post("/infill", routes.post_infill); - ctx_http.post("/embedding", routes.post_embeddings); // legacy - ctx_http.post("/embeddings", routes.post_embeddings); - ctx_http.post("/v1/embeddings", routes.post_embeddings_oai); - ctx_http.post("/rerank", routes.post_rerank); - ctx_http.post("/reranking", routes.post_rerank); - ctx_http.post("/v1/rerank", routes.post_rerank); - ctx_http.post("/v1/reranking", routes.post_rerank); - ctx_http.post("/tokenize", routes.post_tokenize); - ctx_http.post("/detokenize", routes.post_detokenize); - ctx_http.post("/apply-template", routes.post_apply_template); + ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) + ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) + ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics)); + ctx_http.get ("/props", ex_wrapper(routes.get_props)); + ctx_http.post("/props", ex_wrapper(routes.post_props)); + ctx_http.post("/api/show", ex_wrapper(routes.get_api_show)); + ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) + ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) + ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. 
public endpoint (no API key check) + ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy + ctx_http.post("/completions", ex_wrapper(routes.post_completions)); + ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai)); + ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); + ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); + ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint + ctx_http.post("/infill", ex_wrapper(routes.post_infill)); + ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy + ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); + ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai)); + ctx_http.post("/rerank", ex_wrapper(routes.post_rerank)); + ctx_http.post("/reranking", ex_wrapper(routes.post_rerank)); + ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank)); + ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank)); + ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); + ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); + ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); // LoRA adapters hotswap - ctx_http.get ("/lora-adapters", routes.get_lora_adapters); - ctx_http.post("/lora-adapters", routes.post_lora_adapters); + ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); + ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters)); // Save & load slots - ctx_http.get ("/slots", routes.get_slots); - ctx_http.post("/slots/:id_slot", routes.post_slots); + ctx_http.get ("/slots", ex_wrapper(routes.get_slots)); + ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots)); // // Start the server From 66c6fe27446062e3d9b84783a6053e13c95af93f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 22:35:35 +0100 Subject: [PATCH 05/21] renaming --- tools/server/server-http.cpp | 6 +-- tools/server/server-http.h | 10 ++-- tools/server/server.cpp | 98 ++++++++++++++++++------------------ 3 files changed, 57 insertions(+), 57 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index d3fb13bb840be..ee25ec6c939be 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -306,7 +306,7 @@ static std::map get_params(const httplib::Request & re void server_http_context::get(const std::string & path, server_http_context::handler_t handler) { pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { - server_http_resgen_ptr response = handler(server_http_request{ + server_http_res_ptr response = handler(server_http_req{ get_params(req), req.body, req.is_connection_closed @@ -320,7 +320,7 @@ void server_http_context::get(const std::string & path, server_http_context::han void server_http_context::post(const std::string & path, server_http_context::handler_t handler) { pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { - server_http_resgen_ptr response = handler(server_http_request{ + server_http_res_ptr response = handler(server_http_req{ get_params(req), req.body, req.is_connection_closed @@ -330,7 +330,7 @@ void server_http_context::post(const std::string & path, server_http_context::ha set_headers(res, response->headers); std::string content_type = response->content_type; // convert to shared_ptr as both chunked_content_provider() and 
on_complete() need to use it - std::shared_ptr r_ptr = std::move(response); + std::shared_ptr r_ptr = std::move(response); const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { // TODO: maybe handle sink.write unsuccessful? for now, we rely on is_connection_closed() sink.write(response->data.data(), response->data.size()); diff --git a/tools/server/server-http.h b/tools/server/server-http.h index f449bfedc27b0..b518586047eeb 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -12,7 +12,7 @@ // 1) normal response: `data` contains the full response body // 2) streaming response: each call to next() generates the next chunk, stored in `data` // when next() returns false, no more data after the current chunk -struct server_http_resgen { +struct server_http_res { std::string content_type = "application/json; charset=utf-8"; int status = 200; std::string data; @@ -24,14 +24,14 @@ struct server_http_resgen { return next != nullptr; } - virtual ~server_http_resgen() = default; + virtual ~server_http_res() = default; }; // unique pointer, used by set_chunked_content_provider // we need to use unique_ptr because httplib requires the stream provider to be stored in heap -using server_http_resgen_ptr = std::unique_ptr; +using server_http_res_ptr = std::unique_ptr; -struct server_http_request { +struct server_http_req { std::map params; // path_params + query_params std::string body; const std::function & should_stop; @@ -63,7 +63,7 @@ struct server_http_context { bool start(); void stop(); - using handler_t = std::function; + using handler_t = std::function; void get(const std::string &, handler_t); void post(const std::string &, handler_t); }; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index cee4a7fa0a96b..6fdead9756ae6 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4416,9 +4416,9 @@ struct server_response_reader { }; // generator-like API for HTTP response generation -struct server_resgen : server_http_resgen { +struct server_res_generator : server_http_res { server_response_reader rd; - server_resgen(server_context & ctx_server_) : rd(ctx_server_) {} + server_res_generator(server_context & ctx_server_) : rd(ctx_server_) {} void ok(const json & response_data) { status = 200; data = safe_json_to_str(response_data); @@ -4439,15 +4439,15 @@ struct server_routes { public: // handlers using lambda function, so that they can capture `this` without `std::bind` - server_http_context::handler_t get_health = [this](const server_http_request &) { + server_http_context::handler_t get_health = [this](const server_http_req &) { // error and loading states are handled by middleware - auto res = std::make_unique(ctx_server); + auto res = std::make_unique(ctx_server); res->ok({{"status", "ok"}}); return res; }; - server_http_context::handler_t get_metrics = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_metrics = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); if (!params.endpoint_metrics) { res->error(format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -4549,8 +4549,8 @@ struct server_routes { return res; }; - server_http_context::handler_t get_slots = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_slots = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); if (!params.endpoint_slots) { res->error(format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -4590,8 +4590,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_slots = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_slots = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); if (params.slot_save_path.empty()) { res->error(format_error_response("This server does not support slots action. Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -4621,8 +4621,8 @@ struct server_routes { } }; - server_http_context::handler_t get_props = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_props = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); json default_generation_settings_for_props; { @@ -4665,8 +4665,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_props = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_props = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); if (!params.endpoint_props) { res->error(format_error_response("This server does not support changing global properties. 
Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -4677,8 +4677,8 @@ struct server_routes { return res; }; - server_http_context::handler_t get_api_show = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_api_show = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); bool has_mtmd = ctx_server.mctx != nullptr; json data = { { @@ -4708,8 +4708,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_infill = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_infill = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); // check model compatibility std::string err; if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { @@ -4786,7 +4786,7 @@ struct server_routes { OAICOMPAT_TYPE_NONE); // infill is not OAI compatible }; - server_http_context::handler_t post_completions = [this](const server_http_request & req) { + server_http_context::handler_t post_completions = [this](const server_http_req & req) { std::vector files; // dummy const json body = json::parse(req.body); return handle_completions_impl( @@ -4797,7 +4797,7 @@ struct server_routes { OAICOMPAT_TYPE_NONE); }; - server_http_context::handler_t post_completions_oai = [this](const server_http_request & req) { + server_http_context::handler_t post_completions_oai = [this](const server_http_req & req) { std::vector files; // dummy const json body = json::parse(req.body); return handle_completions_impl( @@ -4808,7 +4808,7 @@ struct server_routes { OAICOMPAT_TYPE_COMPLETION); }; - server_http_context::handler_t post_chat_completions = [this](const server_http_request & req) { + server_http_context::handler_t post_chat_completions = [this](const server_http_req & req) { std::vector files; json body = json::parse(req.body); json body_parsed = oaicompat_chat_params_parse( @@ -4824,8 +4824,8 @@ struct server_routes { }; // same with handle_chat_completions, but without inference part - server_http_context::handler_t post_apply_template = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_apply_template = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); std::vector files; // dummy, unused json body = json::parse(req.body); json data = oaicompat_chat_params_parse( @@ -4836,8 +4836,8 @@ struct server_routes { return res; }; - server_http_context::handler_t get_models = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_models = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); bool is_model_ready = ctx_http.is_ready.load(); json model_meta = nullptr; if (is_model_ready) { @@ -4883,8 +4883,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_tokenize = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_tokenize = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); const json body = json::parse(req.body); json tokens_response = json::array(); if (body.count("content") != 0) { @@ -4925,8 +4925,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_detokenize = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + 
server_http_context::handler_t post_detokenize = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); const json body = json::parse(req.body); std::string content; @@ -4940,16 +4940,16 @@ struct server_routes { return res; }; - server_http_context::handler_t post_embeddings = [this](const server_http_request & req) { + server_http_context::handler_t post_embeddings = [this](const server_http_req & req) { return handle_embeddings_impl(req, OAICOMPAT_TYPE_NONE); }; - server_http_context::handler_t post_embeddings_oai = [this](const server_http_request & req) { + server_http_context::handler_t post_embeddings_oai = [this](const server_http_req & req) { return handle_embeddings_impl(req, OAICOMPAT_TYPE_EMBEDDING); }; - server_http_context::handler_t post_rerank = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_rerank = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) { res->error(format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -5028,8 +5028,8 @@ struct server_routes { return res; }; - server_http_context::handler_t get_lora_adapters = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_lora_adapters = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); json result = json::array(); const auto & loras = ctx_server.params_base.lora_adapters; for (size_t i = 0; i < loras.size(); ++i) { @@ -5059,8 +5059,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_lora_adapters = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_lora_adapters = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); const json body = json::parse(req.body); if (!body.is_array()) { res->error(format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); @@ -5091,7 +5091,7 @@ struct server_routes { }; private: - std::unique_ptr handle_completions_impl( + std::unique_ptr handle_completions_impl( server_task_type type, const json & data, const std::vector & files, @@ -5099,7 +5099,7 @@ struct server_routes { oaicompat_type oaicompat) { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); - auto res = std::make_unique(ctx_server); + auto res = std::make_unique(ctx_server); auto completion_id = gen_chatcmplid(); auto & rd = res->rd; @@ -5236,8 +5236,8 @@ struct server_routes { return res; } - std::unique_ptr handle_slots_save(const server_http_request & req, int id_slot) { - auto res = std::make_unique(ctx_server); + std::unique_ptr handle_slots_save(const server_http_req & req, int id_slot) { + auto res = std::make_unique(ctx_server); const json & request_data = req.body; std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { @@ -5271,8 +5271,8 @@ struct server_routes { return res; } - std::unique_ptr handle_slots_restore(const server_http_request & req, int id_slot) { - auto res = std::make_unique(ctx_server); + std::unique_ptr handle_slots_restore(const server_http_req & req, int id_slot) { + auto res = std::make_unique(ctx_server); const json & request_data = req.body; std::string filename = 
request_data.at("filename"); if (!fs_validate_filename(filename)) { @@ -5307,8 +5307,8 @@ struct server_routes { return res; } - std::unique_ptr handle_slots_erase(const server_http_request &, int id_slot) { - auto res = std::make_unique(ctx_server); + std::unique_ptr handle_slots_erase(const server_http_req &, int id_slot) { + auto res = std::make_unique(ctx_server); int task_id = ctx_server.queue_tasks.get_new_id(); { server_task task(SERVER_TASK_TYPE_SLOT_ERASE); @@ -5333,8 +5333,8 @@ struct server_routes { return res; } - std::unique_ptr handle_embeddings_impl(const server_http_request & req, oaicompat_type oaicompat) { - auto res = std::make_unique(ctx_server); + std::unique_ptr handle_embeddings_impl(const server_http_req & req, oaicompat_type oaicompat) { + auto res = std::make_unique(ctx_server); if (!ctx_server.params_base.embedding) { res->error(format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -5449,7 +5449,7 @@ inline void signal_handler(int signal) { // wrapper function that handles exceptions and logs errors static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { - return [func = std::move(func)](const server_http_request & req) -> server_http_resgen_ptr { + return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr { std::string message; try { return func(req); @@ -5459,7 +5459,7 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t message = "unknown error"; } - auto res = std::make_unique(); + auto res = std::make_unique(); res->status = 500; try { json error_data = format_error_response(message, ERROR_TYPE_SERVER); From 92a150f9eadf856ca3333a78fefa87cd1eafb2dd Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 22:43:45 +0100 Subject: [PATCH 06/21] missing header --- tools/server/server-http.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server-http.h b/tools/server/server-http.h index b518586047eeb..a29c566170a36 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -6,6 +6,7 @@ #include #include #include +#include // generator-like API for HTTP response generation // this object response with one of the 2 modes: From d990534e774b7a29b190fc9e736d4f35b5265398 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 23:22:43 +0100 Subject: [PATCH 07/21] fix missing windows header --- tools/server/server.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 6fdead9756ae6..527801084cb13 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2,6 +2,13 @@ #include "utils.hpp" #include "server-http.h" +#ifdef _WIN32 +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#endif + #include "arg.h" #include "common.h" #include "json-schema-to-grammar.h" From f428fe5841ea4c50233290d1c94268e13fd7f17e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 23:54:12 +0100 Subject: [PATCH 08/21] fix error responses from http layer --- tools/server/server-http.cpp | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index ee25ec6c939be..8ed66215269e1 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -147,10 +147,17 @@ bool server_http_context::init(const common_params & params) { } // API key is invalid 
or not provided - //res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - // FIXME res.status = 401; - res.set_content("Unauthorized: Invalid API Key", "text/plain"); + res.set_content( + (json { + {"error", { + {"message", "Invalid API Key"}, + {"type", "authentication_error"}, + {"code", 401} + }} + }).dump(), + "application/json; charset=utf-8" + ); LOG_WRN("Unauthorized: Invalid API Key\n"); @@ -168,10 +175,17 @@ bool server_http_context::init(const common_params & params) { // allow the models endpoint to be accessed during loading return true; } else { - // FIXME - //res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); res.status = 503; - res.set_content("503 Service Unavailable: Loading model", "text/plain"); + res.set_content( + (json { + {"error", { + {"message", "Loading model"}, + {"type", "unavailable_error"}, + {"code", 503} + }} + }).dump(), + "application/json; charset=utf-8" + ); } return false; } From 25cc7eb6fb3ba51674fb6cb350c3afd1ed22eaf9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 00:00:08 +0100 Subject: [PATCH 09/21] fix slot save/restore handler --- tools/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 527801084cb13..6770dff940b0b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5245,7 +5245,7 @@ struct server_routes { std::unique_ptr handle_slots_save(const server_http_req & req, int id_slot) { auto res = std::make_unique(ctx_server); - const json & request_data = req.body; + const json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); @@ -5280,7 +5280,7 @@ struct server_routes { std::unique_ptr handle_slots_restore(const server_http_req & req, int id_slot) { auto res = std::make_unique(ctx_server); - const json & request_data = req.body; + const json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); From 3be8a3ac0c6c65e5b458d05d82c9889af603ffeb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 09:35:14 +0100 Subject: [PATCH 10/21] fix case where only one stream chunk is returned --- tools/server/server.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 6770dff940b0b..b555329878706 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5205,6 +5205,17 @@ struct server_routes { server_response_reader & rd = res_this->rd; std::string & output = res_this->data; + // check if there is more data + if (!rd.has_next()) { + if (oaicompat != OAICOMPAT_TYPE_NONE) { + output = "data: [DONE]\n\n"; + } else { + output = ""; + } + SRV_DBG("%s", "all results received, terminating stream\n"); + return false; // no more data, terminate + } + // receive subsequent results auto result = rd.next(should_stop); if (result == nullptr) { @@ -5215,7 +5226,7 @@ struct server_routes { // send the results json res_json = result->to_json(); if (result->is_error()) { - output = format_sse(res_json); + output = format_sse(json {{ "error", res_json }}); SRV_DBG("%s", "error received during streaming, terminating stream\n"); return false; // 
terminate on error } else { @@ -5226,15 +5237,6 @@ struct server_routes { output = format_sse(res_json); } - // check if there is more data - if (!rd.has_next()) { - if (oaicompat != OAICOMPAT_TYPE_NONE) { - output += "data: [DONE]\n\n"; - } - SRV_DBG("%s", "all results received, terminating stream\n"); - return false; // no more data, terminate - } - // has next data, continue return true; }; From 9917e044f09e732a03cafeb11f713c29e7403de4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 09:37:05 +0100 Subject: [PATCH 11/21] add NOMINMAX --- tools/server/server.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b555329878706..dfd1fbc727b50 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2,9 +2,11 @@ #include "utils.hpp" #include "server-http.h" -#ifdef _WIN32 -#ifndef WIN32_LEAN_AND_MEAN +// fix problem with std::min and std::max +#if defined(_WIN32) #define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX #endif #include #endif From fc35e9165106e30a25fb00e6fb8fabb6b438097e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 10:08:13 +0100 Subject: [PATCH 12/21] do not call sink.write on empty data --- tools/server/server-http.cpp | 15 ++++++++++----- tools/server/server-http.h | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 8ed66215269e1..e11c5a6955aa8 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -347,12 +347,17 @@ void server_http_context::post(const std::string & path, server_http_context::ha std::shared_ptr r_ptr = std::move(response); const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { // TODO: maybe handle sink.write unsuccessful? 
for now, we rely on is_connection_closed() - sink.write(response->data.data(), response->data.size()); - SRV_DBG("http: streamed chunk: %s\n", response->data.c_str()); - if (!response->next()) { - // flush the remaining data + if (!response->data.empty()) { sink.write(response->data.data(), response->data.size()); - SRV_DBG("http: streamed chunk (last): %s\n", response->data.c_str()); + SRV_DBG("http: streamed chunk: %s\n", response->data.c_str()); + } + // get the next chunk + if (!response->next()) { + // flush the remaining data before ending the stream + if (!response->data.empty()) { + sink.write(response->data.data(), response->data.size()); + SRV_DBG("http: streamed chunk (last): %s\n", response->data.c_str()); + } SRV_DBG("%s", "http: stream ended\n"); sink.done(); return false; // end of stream diff --git a/tools/server/server-http.h b/tools/server/server-http.h index a29c566170a36..835a0c27098b9 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -13,6 +13,7 @@ // 1) normal response: `data` contains the full response body // 2) streaming response: each call to next() generates the next chunk, stored in `data` // when next() returns false, no more data after the current chunk +// note: some chunks can be empty, in which case no data is sent for that chunk struct server_http_res { std::string content_type = "application/json; charset=utf-8"; int status = 200; From 8c7fbec4a462cd672a0f13c5170d6dbe89fcbe47 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 10:23:30 +0100 Subject: [PATCH 13/21] use safe_json_to_str for SSE --- tools/server/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index d828c922f26c9..9ad6a070e31ad 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -460,7 +460,7 @@ static std::string format_sse(const json & data) { std::ostringstream ss; auto send_single = [&ss](const json & data) { ss << "data: " << - data.dump(-1, ' ', false, json::error_handler_t::replace) << + safe_json_to_str(data) << "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). 
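         // e.g. a payload {"x":1} goes on the wire as: data: {"x":1}\n\n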
}; From da458d68d9104ca5c038cd84258e9ce8b4c15de3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 10:29:40 +0100 Subject: [PATCH 14/21] clean up --- tools/server/utils.hpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 9ad6a070e31ad..bf21726051e55 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -19,9 +19,6 @@ #include #include -#define JSON_ASSERT GGML_ASSERT -#include - #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo" using json = nlohmann::ordered_json; @@ -427,6 +424,10 @@ static std::string gen_tool_call_id() { // other common utils // +static std::string safe_json_to_str(const json & data) { + return data.dump(-1, ' ', false, json::error_handler_t::replace); +} + // TODO: reuse llama_detokenize template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { @@ -951,10 +952,6 @@ static json format_logit_bias(const std::vector & logit_bias) return data; } -static std::string safe_json_to_str(const json & data) { - return data.dump(-1, ' ', false, json::error_handler_t::replace); -} - static std::vector get_token_probabilities(llama_context * ctx, int idx) { std::vector cur; const auto * logits = llama_get_logits_ith(ctx, idx); From cd1047000b61678086247ea3218d472eb4dde194 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 11:36:54 +0100 Subject: [PATCH 15/21] add some comments --- tools/server/server-http.cpp | 12 ++++++------ tools/server/server-http.h | 3 ++- tools/server/server.cpp | 5 ++++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index e11c5a6955aa8..7e49b9954a9a9 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -87,13 +87,13 @@ bool server_http_context::init(const common_params & params) { srv->set_error_handler([](const httplib::Request &, httplib::Response & res) { if (res.status == 404) { res.set_content( - (json { + safe_json_to_str(json { {"error", { {"message", "File Not Found"}, {"type", "not_found_error"}, {"code", 404} }} - }).dump(), + }), "application/json; charset=utf-8" ); } @@ -149,13 +149,13 @@ bool server_http_context::init(const common_params & params) { // API key is invalid or not provided res.status = 401; res.set_content( - (json { + safe_json_to_str(json { {"error", { {"message", "Invalid API Key"}, {"type", "authentication_error"}, {"code", 401} }} - }).dump(), + }), "application/json; charset=utf-8" ); @@ -177,13 +177,13 @@ bool server_http_context::init(const common_params & params) { } else { res.status = 503; res.set_content( - (json { + safe_json_to_str(json { {"error", { {"message", "Loading model"}, {"type", "unavailable_error"}, {"code", 503} }} - }).dump(), + }), "application/json; charset=utf-8" ); } diff --git a/tools/server/server-http.h b/tools/server/server-http.h index 835a0c27098b9..b2344b0c4d53f 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -30,7 +30,7 @@ struct server_http_res { }; // unique pointer, used by set_chunked_content_provider -// we need to use unique_ptr because httplib requires the stream provider to be stored in heap +// httplib requires the stream provider to be stored in heap using server_http_res_ptr = std::unique_ptr; struct server_http_req { @@ -65,6 +65,7 @@ struct server_http_context { bool start(); void stop(); + // note: the handler should never throw exceptions using handler_t = std::function; void get(const 
std::string &, handler_t); void post(const std::string &, handler_t); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index dfd1fbc727b50..b4149d093d9bb 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5459,6 +5459,7 @@ inline void signal_handler(int signal) { } // wrapper function that handles exceptions and logs errors +// this is to make sure handler_t never throws exceptions; instead, it returns an error response static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr { std::string message; @@ -5638,7 +5639,9 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.start_loop(); clean_up(); - ctx_http.thread.join(); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); + } llama_memory_breakdown_print(ctx_server.ctx); return 0; From 1bc41f60a841d6d62d555301a054e4710c9f945a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 14 Nov 2025 15:15:38 +0100 Subject: [PATCH 16/21] improve usage of next() --- tools/server/server-http.cpp | 23 +++++++++-------------- tools/server/server-http.h | 6 +++--- tools/server/server.cpp | 27 +++++++++++++++++---------- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 7e49b9954a9a9..8200dbc7a1719 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -346,23 +346,18 @@ void server_http_context::post(const std::string & path, server_http_context::ha // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it std::shared_ptr r_ptr = std::move(response); const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { - // TODO: maybe handle sink.write unsuccessful? for now, we rely on is_connection_closed() - if (!response->data.empty()) { - sink.write(response->data.data(), response->data.size()); - SRV_DBG("http: streamed chunk: %s\n", response->data.c_str()); + std::string chunk; + bool has_next = response->next(chunk); + if (!chunk.empty()) { + // TODO: maybe handle sink.write unsuccessful? 
for now, we rely on is_connection_closed() + sink.write(chunk.data(), chunk.size()); + SRV_DBG("http: streamed chunk: %s\n", chunk.c_str()); } - // get the next chunk - if (!response->next()) { - // flush the remaining data before ending the stream - if (!response->data.empty()) { - sink.write(response->data.data(), response->data.size()); - SRV_DBG("http: streamed chunk (last): %s\n", response->data.c_str()); - } - SRV_DBG("%s", "http: stream ended\n"); + if (!has_next) { sink.done(); - return false; // end of stream + SRV_DBG("%s", "http: stream ended\n"); } - return true; // more data, continue the loop + return has_next; }; const auto on_complete = [response = r_ptr](bool) mutable { response.reset(); // trigger the destruction of the response object diff --git a/tools/server/server-http.h b/tools/server/server-http.h index b2344b0c4d53f..71fd5c83808de 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -11,8 +11,8 @@ // generator-like API for HTTP response generation // this object response with one of the 2 modes: // 1) normal response: `data` contains the full response body -// 2) streaming response: each call to next() generates the next chunk, stored in `data` -// when next() returns false, no more data after the current chunk +// 2) streaming response: each call to next(output) generates the next chunk +// when next(output) returns false, no more data after the current chunk // note: some chunks can be empty, in which case no data is sent for that chunk struct server_http_res { std::string content_type = "application/json; charset=utf-8"; @@ -21,7 +21,7 @@ struct server_http_res { std::map headers; // TODO: move this to a virtual function once we have proper polymorphism support - std::function next = nullptr; + std::function next = nullptr; bool is_stream() const { return next != nullptr; } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 1971e9777b1a2..9d41f16aec1d2 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5202,14 +5202,20 @@ struct server_routes { res->data = format_sse(first_result->to_json()); // to be sent immediately res->status = 200; res->content_type = "text/event-stream"; - res->next = [res_this = res.get(), oaicompat, &should_stop]() mutable -> bool { + res->next = [res_this = res.get(), oaicompat, &should_stop](std::string & output) -> bool { if (should_stop()) { SRV_DBG("%s", "stopping streaming due to should_stop condition\n"); return false; // should_stop condition met } + if (!res_this->data.empty()) { + // flush the first chunk + output = std::move(res_this->data); + res_this->data.clear(); + return true; + } + server_response_reader & rd = res_this->rd; - std::string & output = res_this->data; // check if there is more data if (!rd.has_next()) { @@ -5583,14 +5589,21 @@ int main(int argc, char ** argv) { llama_backend_free(); }; - LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); + // start the HTTP server before loading the model to be able to serve /health requests + if (!ctx_http.start()) { + clean_up(); + LOG_ERR("%s: exiting due to HTTP server error\n", __func__); + return 1; + } // load the model LOG_INF("%s: loading model\n", __func__); if (!ctx_server.load_model(params)) { clean_up(); - ctx_http.thread.join(); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); + } LOG_ERR("%s: exiting due to model loading error\n", __func__); return 1; } @@ -5632,12 +5645,6 @@ int 
main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - if (!ctx_http.start()) { - clean_up(); - LOG_ERR("%s: exiting due to HTTP server error\n", __func__); - return 1; - } - LOG_INF("%s: starting the main loop...\n", __func__); // this call blocks the main thread until queue_tasks.terminate() is called ctx_server.queue_tasks.start_loop(); From 55ccf46b931088406eccd07b37647fa0e60e11f9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 14 Nov 2025 18:17:40 +0100 Subject: [PATCH 17/21] bring back the "server is listening on" message --- tools/server/server-http.cpp | 5 ++--- tools/server/server-http.h | 3 +++ tools/server/server.cpp | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 8200dbc7a1719..9a4e3f16642d4 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -289,9 +289,8 @@ bool server_http_context::start() { thread = std::thread([this]() { pimpl->srv->listen_after_bind(); }); srv->wait_until_ready(); - LOG_INF("%s: server is listening on %s\n", __func__, - is_sock ? string_format("unix://%s", hostname.c_str()).c_str() : - string_format("http://%s:%d", hostname.c_str(), port).c_str()); + listening_address = is_sock ? string_format("unix://%s", hostname.c_str()) + : string_format("http://%s:%d", hostname.c_str(), port); return true; } diff --git a/tools/server/server-http.h b/tools/server/server-http.h index 71fd5c83808de..39713c021b84e 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -69,4 +69,7 @@ struct server_http_context { using handler_t = std::function; void get(const std::string &, handler_t); void post(const std::string &, handler_t); + + // for debugging + std::string listening_address; }; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 9d41f16aec1d2..9f1a788aaf466 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5645,6 +5645,7 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif + LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); LOG_INF("%s: starting the main loop...\n", __func__); // this call blocks the main thread until queue_tasks.terminate() is called ctx_server.queue_tasks.start_loop(); From 4d37cee576e390597c188fab5d076cf6d30ec642 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 15 Nov 2025 21:17:36 +0100 Subject: [PATCH 18/21] more generic handler --- tools/server/server-http.cpp | 67 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 9a4e3f16642d4..b9e4f03b28727 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -317,6 +317,38 @@ static std::map get_params(const httplib::Request & re return params; } +static void process_handler_response(server_http_res_ptr & response, httplib::Response & res) { + if (response->is_stream()) { + res.status = response->status; + set_headers(res, response->headers); + std::string content_type = response->content_type; + // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it + std::shared_ptr r_ptr = std::move(response); + const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { + std::string chunk; + bool has_next = response->next(chunk); + if (!chunk.empty()) { + // 
TODO: maybe handle sink.write unsuccessful? for now, we rely on is_connection_closed() + sink.write(chunk.data(), chunk.size()); + SRV_DBG("http: streamed chunk: %s\n", chunk.c_str()); + } + if (!has_next) { + sink.done(); + SRV_DBG("%s", "http: stream ended\n"); + } + return has_next; + }; + const auto on_complete = [response = r_ptr](bool) mutable { + response.reset(); // trigger the destruction of the response object + }; + res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete); + } else { + res.status = response->status; + set_headers(res, response->headers); + res.set_content(response->data, response->content_type); + } +} + void server_http_context::get(const std::string & path, server_http_context::handler_t handler) { pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { server_http_res_ptr response = handler(server_http_req{ @@ -324,10 +356,7 @@ void server_http_context::get(const std::string & path, server_http_context::han req.body, req.is_connection_closed }); - GGML_ASSERT(!response->is_stream() && "not supported for GET method"); - res.status = response->status; - set_headers(res, response->headers); - res.set_content(response->data, response->content_type); + process_handler_response(response, res); }); } @@ -338,35 +367,7 @@ void server_http_context::post(const std::string & path, server_http_context::ha req.body, req.is_connection_closed }); - if (response->is_stream()) { - res.status = response->status; - set_headers(res, response->headers); - std::string content_type = response->content_type; - // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it - std::shared_ptr r_ptr = std::move(response); - const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { - std::string chunk; - bool has_next = response->next(chunk); - if (!chunk.empty()) { - // TODO: maybe handle sink.write unsuccessful? 
From 68d5c6f8c5faf23336c0954a17a34efcccb3f163 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 15 Nov 2025 21:29:28 +0100
Subject: [PATCH 19/21] add req.headers

---
 tools/server/server-http.cpp | 10 ++++++++++
 tools/server/server-http.h   |  1 +
 2 files changed, 11 insertions(+)

diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index b9e4f03b28727..e2f91467faf96 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -317,6 +317,14 @@ static std::map<std::string, std::string> get_params(const httplib::Request & re
     return params;
 }
 
+static std::map<std::string, std::string> get_headers(const httplib::Request & req) {
+    std::map<std::string, std::string> headers;
+    for (const auto & [key, value] : req.headers) {
+        headers[key] = value;
+    }
+    return headers;
+}
+
 static void process_handler_response(server_http_res_ptr & response, httplib::Response & res) {
     if (response->is_stream()) {
         res.status = response->status;
@@ -353,6 +361,7 @@ void server_http_context::get(const std::string & path, server_http_context::han
     pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
         server_http_res_ptr response = handler(server_http_req{
             get_params(req),
+            get_headers(req),
             req.body,
             req.is_connection_closed
         });
@@ -364,6 +373,7 @@ void server_http_context::post(const std::string & path, server_http_context::ha
     pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
         server_http_res_ptr response = handler(server_http_req{
             get_params(req),
+            get_headers(req),
             req.body,
             req.is_connection_closed
         });

diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index 39713c021b84e..2e632e666d8b0 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -35,6 +35,7 @@ using server_http_res_ptr = std::unique_ptr<server_http_res>;
 
 struct server_http_req {
     std::map<std::string, std::string> params; // path_params + query_params
+    std::map<std::string, std::string> headers;
     std::string body;
     const std::function<bool()> & should_stop;
 

From 2c9fe91e23e94d5a12989e7cca4b39f68f13c36e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 15 Nov 2025 21:30:43 +0100
Subject: [PATCH 20/21] move the chat template print to init()

---
 tools/server/server.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 9f1a788aaf466..1c9e9a58d7daf 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2605,6 +2605,11 @@ struct server_context {
             /* allow_audio     */ mctx ? mtmd_support_audio(mctx) : false,
             /* enable_thinking */ enable_thinking,
         };
+
+        // print sample chat example to make it clear which template is used
+        LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
+            common_chat_templates_source(chat_templates.get()),
+            common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
     }
 
     server_slot * get_slot_by_id(int id) {
@@ -5613,11 +5618,6 @@ int main(int argc, char ** argv) {
 
     LOG_INF("%s: model loaded\n", __func__);
 
-    // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
-        common_chat_templates_source(ctx_server.chat_templates.get()),
-        common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja, ctx_server.params_base.default_template_kwargs).c_str());
-
     ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
         ctx_server.process_single_task(std::move(task));
     });
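
Note (illustrative, not part of the patch): req.headers lets a handler inspect
arbitrary request headers without reaching into httplib. A minimal sketch with
a hypothetical endpoint, assuming the same server_http_res members as above:

    // echo a request header back as plain text
    ctx_http.get("/demo/agent", [](const server_http_req & req) -> server_http_res_ptr {
        auto res = std::make_unique<server_http_res>();
        res->content_type = "text/plain; charset=utf-8";
        auto it   = req.headers.find("User-Agent");
        res->data = it != req.headers.end() ? it->second : "(no User-Agent header)";
        return res;
    });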
From 016f8b40e3fc92f10dc1305a0df74a29da0dd32d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 15 Nov 2025 22:05:40 +0100
Subject: [PATCH 21/21] add req.path

---
 tools/server/server-http.cpp | 2 ++
 tools/server/server-http.h   | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index e2f91467faf96..196ced443261a 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -362,6 +362,7 @@ void server_http_context::get(const std::string & path, server_http_context::han
         server_http_res_ptr response = handler(server_http_req{
             get_params(req),
             get_headers(req),
+            req.path,
             req.body,
             req.is_connection_closed
         });
@@ -374,6 +375,7 @@ void server_http_context::post(const std::string & path, server_http_context::ha
         server_http_res_ptr response = handler(server_http_req{
             get_params(req),
             get_headers(req),
+            req.path,
             req.body,
             req.is_connection_closed
         });

diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index 2e632e666d8b0..dc6ca92fd8751 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -35,7 +35,8 @@ using server_http_res_ptr = std::unique_ptr<server_http_res>;
 
 struct server_http_req {
     std::map<std::string, std::string> params; // path_params + query_params
-    std::map<std::string, std::string> headers;
+    std::map<std::string, std::string> headers; // reserved for future use
+    std::string path;                           // reserved for future use
     std::string body;
    const std::function<bool()> & should_stop;
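
Note (illustrative, not part of the patch): req.path makes it possible to
register one handler under several routes and branch on the route that was
actually hit. A minimal sketch, assuming the default JSON content_type; the
response shapes are hypothetical:

    // one handler serving both the OpenAI-style and Ollama-style model lists
    auto models_handler = [](const server_http_req & req) -> server_http_res_ptr {
        auto res = std::make_unique<server_http_res>();
        res->data = req.path == "/api/tags" ? R"({"models":[]})" : R"({"data":[]})";
        return res;
    };
    ctx_http.get("/v1/models", models_handler);
    ctx_http.get("/api/tags",  models_handler);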